]> jfr.im git - yt-dlp.git/blame - yt_dlp/utils.py
[cleanup] Misc
[yt-dlp.git] / yt_dlp / utils.py
CommitLineData
cc52de43 1#!/usr/bin/env python3
15dfb392 2import atexit
1e399778 3import base64
5bc880b9 4import binascii
912b38b4 5import calendar
676eb3f2 6import codecs
c380cc28 7import collections
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96 10import datetime
0c265486 11import email.header
f8271158 12import email.utils
f45c185f 13import errno
d77c3dfd 14import gzip
49fa4d9a
N
15import hashlib
16import hmac
019a94f7 17import importlib.util
03f9daab 18import io
79a2e94e 19import itertools
f4bfd65f 20import json
d77c3dfd 21import locale
02dbf93f 22import math
f8271158 23import mimetypes
347de493 24import operator
d77c3dfd 25import os
c496ca96 26import platform
773f291d 27import random
d77c3dfd 28import re
f8271158 29import shlex
c496ca96 30import socket
79a2e94e 31import ssl
1c088fa8 32import subprocess
d77c3dfd 33import sys
181c8655 34import tempfile
c380cc28 35import time
01951dda 36import traceback
f8271158 37import urllib.parse
bcf89ce6 38import xml.etree.ElementTree
d77c3dfd 39import zlib
d77c3dfd 40
c487cf00 41from .compat import asyncio, functools # isort: split
8c25f81b
PH
42from .compat import (
43 compat_chr,
1bab3437 44 compat_cookiejar,
36e6f62c 45 compat_etree_fromstring,
51098426 46 compat_expanduser,
8c25f81b 47 compat_html_entities,
55b2f099 48 compat_html_entities_html5,
f8271158 49 compat_HTMLParseError,
50 compat_HTMLParser,
be4a824d 51 compat_http_client,
f8271158 52 compat_HTTPError,
efa97bdc 53 compat_os_name,
8c25f81b 54 compat_parse_qs,
702ccf2d 55 compat_shlex_quote,
8c25f81b 56 compat_str,
edaa23f8 57 compat_struct_pack,
d3f8e038 58 compat_struct_unpack,
8c25f81b 59 compat_urllib_error,
f8271158 60 compat_urllib_parse_unquote_plus,
15707c7e 61 compat_urllib_parse_urlencode,
8c25f81b
PH
62 compat_urllib_parse_urlparse,
63 compat_urllib_request,
64 compat_urlparse,
65)
9b8ee23b 66from .dependencies import brotli, certifi, websockets
f8271158 67from .socks import ProxyType, sockssocket
71aff188 68
4644ac55 69
51fb4995
YCH
def register_socks_protocols():
    """Add the SOCKS URL schemes to ``compat_urlparse.uses_netloc``.

    Schemes that are already listed are left alone, so calling this more
    than once does not create duplicates.
    """
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    netloc_schemes = compat_urlparse.uses_netloc
    socks_schemes = ('socks', 'socks4', 'socks4a', 'socks5')
    netloc_schemes.extend([s for s in socks_schemes if s not in netloc_schemes])
77
78
468e2e92
FV
# The class of compiled regular expression objects, obtained by compiling an
# empty pattern (this is not clearly defined otherwise)
compiled_regex_type = type(re.compile(''))
81
f7a147e3
S
82
def random_user_agent():
    """Return a desktop Chrome User-Agent string with a randomly picked version.

    The version is drawn with ``random.choice`` from a fixed list of
    Chrome 90-97 release numbers.
    """
    template = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    chrome_versions = (
        # Chrome 90
        '90.0.4430.212', '90.0.4430.24', '90.0.4430.70',
        '90.0.4430.72', '90.0.4430.85', '90.0.4430.93',
        # Chrome 91
        '91.0.4472.101', '91.0.4472.106', '91.0.4472.114', '91.0.4472.124',
        '91.0.4472.164', '91.0.4472.19', '91.0.4472.77',
        # Chrome 92
        '92.0.4515.107', '92.0.4515.115', '92.0.4515.131',
        '92.0.4515.159', '92.0.4515.43',
        # Chrome 93
        '93.0.4556.0', '93.0.4577.15', '93.0.4577.63', '93.0.4577.82',
        # Chrome 94
        '94.0.4606.41', '94.0.4606.54', '94.0.4606.61',
        '94.0.4606.71', '94.0.4606.81', '94.0.4606.85',
        # Chrome 95
        '95.0.4638.17', '95.0.4638.50', '95.0.4638.54',
        '95.0.4638.69', '95.0.4638.74',
        # Chrome 96
        '96.0.4664.18', '96.0.4664.45', '96.0.4664.55', '96.0.4664.93',
        # Chrome 97
        '97.0.4692.20',
    )
    version = random.choice(chrome_versions)
    return template % version
126
127
# Content encodings the module declares support for; 'br' is only added
# when the optional brotli dependency is available
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

# Default HTTP request headers. The User-Agent is picked once, when this
# module is imported.
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}
f427df17 140
5f6a1245 141
fb37eb25
S
# Fixed User-Agent strings, keyed by browser name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}
145
146
bf42a990
S
# Unique sentinel meaning "no default was supplied"; compared by identity so
# that None can still be passed as a real default value
NO_DEFAULT = object()
148
7105440c
YCH
# Full English month names, January first
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names per language code, each list ordered January..December
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}
a942d6cb 159
a7aaa398
S
# File extensions (without the leading dot) that this module treats as known
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')
174
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement. Most replacements are
# a single character; the list entries ('AE', 'OE', 'TH', 'ss', ...) are the
# multi-character ones and are chained in at the matching position.
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
c587cbb7 179
46f59e89
S
# Date/time layouts written with %-directives (presumably tried in order by
# the date parsing helpers - confirm against the callers). These layouts are
# shared by both the day-first and the month-first lists below.
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# The shared layouts plus the numeric ones that put the day before the month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# The shared layouts plus the numeric ones that put the month before the day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
243
# Matches text of the form  }('<payload>',<int>,<int>,'<a|b|c>'.split('|')
# and captures the payload, the two integers and the '|'-separated list
# (presumably the argument list of packed/obfuscated JavaScript - confirm)
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Matches a <script type="application/ld+json"> element (case-insensitive,
# dot matches newlines); the script body is captured as group 'json_ld'
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'

# An unsigned integer or decimal number such as '12' or '12.5'
NUMBER_RE = r'\d+(?:\.\d+)?'
248
7105440c 249
@functools.cache
def preferredencoding():
    """Return the preferred text encoding of the system.

    The value of locale.getpreferredencoding() is used if a test string can
    actually be encoded with it; on any failure 'UTF-8' is returned instead.
    The result is cached after the first call.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Make sure the reported codec really exists and works
        'TEST'.encode(encoding)
    except Exception:
        return 'UTF-8'
    return encoding
d77c3dfd 264
f4bfd65f 265
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The data is first written to a temporary '<basename>.*.tmp' file in the
    directory of fn, which is then renamed onto fn. If anything fails, the
    temporary file is removed and the exception is re-raised.
    """
    tmp_options = {
        'prefix': f'{os.path.basename(fn)}.',
        'dir': os.path.dirname(fn),
        'suffix': '.tmp',
        'delete': False,
        'mode': 'w',
        'encoding': 'utf-8',
    }
    tf = tempfile.NamedTemporaryFile(**tmp_options)

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)

        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass

        # Temporary files are created with restrictive permissions; give the
        # result the permissions a regular new file would get under the
        # current umask (best effort)
        try:
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        except OSError:
            pass

        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
290
291
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the first element matching xpath[@key] or, if val is given, xpath[@key='val'] """
    # The key is interpolated into the expression, so only allow plain names
    assert re.match(r'^[a-zA-Z_-]+$', key)
    if val is None:
        predicate = '[@%s]' % key
    else:
        predicate = f"[@{key}='{val}']"
    return node.find(xpath + predicate)
59ae56fa 297
d7e66d39
JMF
298# On python2.6 the xml.etree.ElementTree.Element methods don't support
299# the namespace parameter
5f6a1245
JW
300
301
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' path components into '{namespace-uri}tag' form.

    @param path    '/'-separated path whose components may carry a prefix
    @param ns_map  Mapping of prefix -> namespace URI
    Components without a prefix are kept as they are.
    """
    def expand(component):
        pieces = component.split(':')
        if len(pieces) == 1:
            return pieces[0]
        ns, tag = pieces
        return '{%s}%s' % (ns_map[ns], tag)

    return '/'.join([expand(component) for component in path.split('/')])
312
d77c3dfd 313
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element below node that matches xpath.

    @param xpath    A single XPath string, or an iterable of XPath strings
                    which are tried in order until one of them matches
    @param name     Label used in the error message (defaults to xpath)
    @param fatal    Raise ExtractorError if nothing matches and no default
                    was given
    @param default  Value to return if nothing matches; takes precedence
                    over fatal
    @returns        The matching element, default, or None
    """
    candidates = [xpath] if isinstance(xpath, (str, compat_str)) else xpath

    # Start from None so that an empty iterable of xpaths counts as
    # "not found" instead of leaving n unbound
    n = None
    for xp in candidates:
        n = node.find(xp)
        if n is not None:
            break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            # f-string instead of %-formatting: name may be a tuple of xpaths,
            # which the % operator would try to unpack as format arguments
            raise ExtractorError(f'Could not find XML element {name}')
        else:
            return None
    return n
335
336
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the first element below node that matches xpath.

    Element lookup (including the handling of fatal/default when no element
    matches) is delegated to xpath_element().

    @param name     Label used in the error message (defaults to xpath)
    @param fatal    Raise ExtractorError if the element has no text and no
                    default was given
    @param default  Value to return if the element or its text is missing
    @returns        The text, default, or None
    """
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    # Either nothing was found, or xpath_element() already fell back to default
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            # f-string instead of %-formatting: name may be a tuple of xpaths,
            # which the % operator would try to unpack as format arguments
            raise ExtractorError(f'Could not find XML element\'s text {name}')
        else:
            return None
    return n.text
a41fb80c
S
350
351
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath that has it.

    @param name     Label used in the error message (defaults to 'xpath[@key]')
    @param fatal    Raise ExtractorError if no such element exists and no
                    default was given
    @param default  Value to return if no such element exists
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]

    if default is not NO_DEFAULT:
        return default
    if not fatal:
        return None
    name = f'{xpath}[@{key}]' if name is None else name
    raise ExtractorError('Could not find XML attribute %s' % name)
bf0ff932
PH
363
364
def get_element_by_id(id, html, **kwargs):
    """Return the content of the first tag with the specified ID in the passed HTML document

    Extra keyword arguments are handed on to get_element_by_attribute().
    """
    return get_element_by_attribute('id', id, html, **kwargs)
43e8fafd 368
12ea2f30 369
def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the first tag with the specified ID in the passed HTML document

    Extra keyword arguments are handed on to get_element_html_by_attribute().
    """
    return get_element_html_by_attribute('id', id, html, **kwargs)
6f32a0b5
ZM
373
374
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document, or None"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
379
380
6f32a0b5
ZM
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document, or None"""
    matches = get_elements_html_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
385
386
def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag whose attribute has the given value, or None

    Extra keyword arguments are handed on to get_elements_by_attribute().
    """
    matches = get_elements_by_attribute(attribute, value, html, **kwargs)
    if not matches:
        return None
    return matches[0]
390
391
def get_element_html_by_attribute(attribute, value, html, **kargs):
    """Return the html of the first tag whose attribute has the given value, or None

    Extra keyword arguments are handed on to get_elements_html_by_attribute().
    """
    matches = get_elements_html_by_attribute(attribute, value, html, **kargs)
    if not matches:
        return None
    return matches[0]
395
396
def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list

    Extra keyword arguments are accepted but not used.
    """
    # The class may be one of several in the attribute, so allow any other
    # non-quote characters around it.
    # NOTE: \b also sees a boundary next to '-', so 'foo' matches 'foo-bar' too
    class_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_re, html, escape_value=False)
402
403
6f32a0b5
ZM
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    # The class may be one of several in the attribute, so allow any other
    # non-quote characters around it.
    # NOTE: \b also sees a boundary next to '-', so 'foo' matches 'foo-bar' too
    class_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_html_by_attribute('class', class_re, html, escape_value=False)
409
410
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags with the specified attribute in the passed HTML document as a list

    Takes the same arguments as get_elements_text_and_html_by_attribute().
    """
    pairs = get_elements_text_and_html_by_attribute(*args, **kwargs)
    return [text for text, _html in pairs]
414
415
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document as a list

    Takes the same arguments as get_elements_text_and_html_by_attribute().
    """
    pairs = get_elements_text_and_html_by_attribute(*args, **kwargs)
    return [element_html for _text, element_html in pairs]
419
420
def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Yield the text (content) and the html (whole) of every tag with the
    specified attribute in the passed HTML document

    @param attribute     Name of the attribute to look for
    @param value         Required attribute value; a regular expression
                         fragment if escape_value is False
    @param escape_value  Whether to regex-escape value (i.e. match it literally)
    @returns             Generator of (content, whole) tuples
    """

    # If the value starts with whitespace or one of "'`=<> the quotes around
    # it are mandatory; otherwise the quote character is optional
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches an opening tag up to and including the wanted attribute.
    # The value is embedded with (?-x:...) so that whitespace inside it
    # stays significant despite the verbose flag.
    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        # Extract the complete element, starting at the matched opening tag
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Drop one pair of matching quotes wrapping the whole content, then decode entities
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
a921f407 444
c5229f39 445
6f32a0b5
ZM
class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser that signals, by raising HTMLBreakOnClosingTagException, the
    moment the first opening tag it has seen gets closed.
    Can be used as a context manager.
    """

    class HTMLBreakOnClosingTagException(Exception):
        """Raised when the stack of open tags becomes empty"""

    def __init__(self):
        # Currently open tags, innermost last
        self.tagstack = collections.deque()
        super().__init__()

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')

        # Unwind the stack until the tag being closed is found; unclosed
        # inner tags are discarded on the way
        found = False
        while self.tagstack and not found:
            found = self.tagstack.pop() == tag
        if not found:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')

        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()
486
487
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)

    @returns  Tuple (content, whole)
    @raises   compat_HTMLParseError if the opening or the matching closing
              tag cannot be found
    """
    def find_or_raise(haystack, needle, exc):
        # str.index, but raising the given exception when needle is missing
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    # The index above is relative to whole_start; make it absolute and step past the '>'
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed the opening tag only, and verify the parser saw the expected tag
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # Feed the document one candidate closing tag at a time; the parser
        # raises once the closing tag balancing the opening tag is consumed
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                # offset was not advanced for this chunk, so both positions are relative to it
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
521
522
8bb56eee
BF
class HTMLAttributeParser(compat_HTMLParser):
    """Minimal HTML parser that records the attributes of a single element

    After feeding, attrs holds the attributes of the last opening tag seen
    (an empty dict if there was none).
    """

    def __init__(self):
        self.attrs = {}
        super().__init__()

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
532
c5229f39 533
73673ccf
FF
class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser that collects the attributes of the <li> elements of a list

    Only <li> tags opened at nesting level 0 are recorded; items holds one
    attribute dict per such tag, in document order.
    """

    def __init__(self):
        super().__init__()
        self.items = []
        # Number of tags opened and not yet closed
        self._level = 0

    def handle_starttag(self, tag, attrs):
        at_top_level = self._level == 0
        if at_top_level and tag == 'li':
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
549
550
8bb56eee
BF
def extract_attributes(html_element):
    """Decode the attributes of an HTML element into a dictionary.

    Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&#98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    the result is
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': "'"
    }.
    Parse errors are ignored; whatever was collected so far is returned.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    except compat_HTMLParseError:
        pass
    return parser.attrs
9e6dd238 570
c5229f39 571
73673ccf
FF
def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list with one dictionary of attributes per element"""
    list_parser = HTMLListAttrsParser()
    list_parser.feed(webpage)
    list_parser.close()
    return list_parser.items
579
580
def clean_html(html):
    """Turn an HTML snippet into a readable plain-text string

    Whitespace is collapsed, <br> tags and </p><p> boundaries become
    newlines, the remaining tags are removed and HTML entities are decoded.
    None is passed through unchanged.
    """
    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Applied in order: (pattern, replacement)
    substitutions = (
        (r'\s+', ' '),
        (r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n'),
        (r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n'),
        # Strip html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        html = re.sub(pattern, replacement, html)

    # Replace html entities
    return unescapeHTML(html).strip()
9e6dd238
FV
595
596
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The special filename '-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            # Put stdout into binary mode
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # Two attempts: the name as given, then the name run through sanitize_path()
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except LockingUnsupportedError:
                # Fall back to opening the file without a lock
                stream = open(filename, open_mode)
            return (stream, filename)
        except OSError as err:
            # Give up on the second failure, and right away on permission errors
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            # Retrying is pointless if sanitizing did not change the name
            if old_filename == filename:
                raise
d77c3dfd
FV
631
632
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None if the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 640
5f6a1245 641
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    @returns            The sanitized string; '' only for empty input
    """
    if s == '':
        return ''

    def replace_insane(char):
        # Substitute characters are prefixed with a NUL ('\0') marker so that
        # they can be told apart from characters of the input further below
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # Question marks and control characters are dropped
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = '(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    # Drop the markers; never return an empty name
    result = result.replace('\0', '') or '_'

    # NO_DEFAULT is truthy, so this cleanup only runs for an explicitly falsy is_id
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
d77c3dfd 689
5f6a1245 690
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows

    @param force  Also sanitize on other platforms, where the path is
                  returned unchanged by default
    """
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    # Normalize the part after the drive/UNC prefix and split it into components
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Within each component (except '.' and '..'), replace every one of
    # /<>:"|\?* as well as a trailing dot or whitespace character with '#'
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Keep absolute paths absolute
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
712
713
def sanitize_url(url):
    """Repair a few common defects in a URL

    - None is passed through as None
    - Protocol-less URLs ('//host/...') are prepended with the 'http:'
      scheme in order to mitigate the number of unwanted failures due to
      missing protocol
    - Some common scheme typos seen so far are fixed
    Any other URL is returned unchanged.
    """
    if url is None:
        return None
    if url.startswith('//'):
        return 'http:%s' % url

    # (pattern, replacement) pairs
    typo_fixes = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for pattern, replacement in typo_fixes:
        fixed, hits = re.subn(pattern, replacement, url)
        if hits:
            return fixed
    return url
17bcc626
S
732
733
5435dcf9
HH
def extract_basic_auth(url):
    """Split HTTP basic-auth credentials out of a URL.

    @returns  Tuple (url, header): url has its 'user:password@' part removed
              and header is the matching Authorization header value
              ('Basic <base64 of user:password>'). A URL without a username
              is returned untouched, together with None.
    """
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None

    host = parts.hostname
    # The hostname attribute drops the brackets of an IPv6 literal; put them
    # back so that the rebuilt netloc stays valid ('::1:80' would not be)
    if host and ':' in host:
        host = f'[{host}]'
    netloc = host if parts.port is None else f'{host}:{parts.port}'
    url = urllib.parse.urlunsplit(parts._replace(netloc=netloc))

    credentials = '%s:%s' % (parts.username, parts.password or '')
    auth_payload = base64.b64encode(credentials.encode())
    return url, f'Basic {auth_payload.decode()}'
5435dcf9
HH
744
745
def sanitized_Request(url, *args, **kwargs):
    """Build a compat_urllib_request.Request from a cleaned-up URL

    The URL is passed through sanitize_url() and escape_url(). Credentials
    embedded in it are removed and sent as an Authorization header instead,
    which is stored in the headers given by the caller (second positional
    argument or 'headers' keyword), or in a new dict if there are none.
    """
    cleaned_url = escape_url(sanitize_url(url))
    url, auth_header = extract_basic_auth(cleaned_url)
    if auth_header is not None:
        if len(args) >= 2:
            headers = args[1]
        else:
            headers = kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)
67dda517
S
752
753
51098426
S
def expand_path(s):
    """Expand shell variables and ~"""
    with_home = compat_expanduser(s)
    return os.path.expandvars(with_home)
757
758
def orderedSet(iterable):
    """ Return a list of the elements of iterable without duplicates, in order of first occurrence """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 766
912b38b4 767
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character.

    @param entity_with_semicolon  The entity without the leading '&' but
                                  including the trailing ';', e.g. 'amp;'
    @returns  The decoded character(s), or the literal '&entity;' text if
              the entity is unknown or its number cannot be decoded
    """
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        # Out-of-range code points raise ValueError, while numbers too large
        # for a C int raise OverflowError; keep both as literal text
        with contextlib.suppress(ValueError, OverflowError):
            return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
4e408e47
PH
795
796
def unescapeHTML(s):
    """Replace the HTML entities in s by the characters they stand for

    None is passed through; unknown entities are kept as they are.
    """
    if s is None:
        return None
    assert isinstance(s, str)

    def decode_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^&;]+;)', decode_entity, s)
d77c3dfd 804
8bf48f23 805
def escapeHTML(text):
    """Escape the characters & < > " and ' in text as HTML entities"""
    entities = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;',
    }
    # A single pass, so the '&' of a freshly inserted entity is never escaped again
    return text.translate(str.maketrans(entities))
815
816
def process_communicate_or_kill(p, *args, **kwargs):
    """Run p.communicate(*args, **kwargs), killing the process if that is interrupted

    On any exception (including KeyboardInterrupt) the process is killed
    and waited for before the exception is re-raised.
    """
    try:
        result = p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise
    return result
824
825
class Popen(subprocess.Popen):
    """subprocess.Popen with a default startupinfo and a kill-on-interrupt communicate

    On Windows, processes are started with a STARTUPINFO that has the
    STARTF_USESHOWWINDOW flag set, unless the caller supplies its own
    startupinfo keyword argument.
    """

    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        # Only provide the default; an explicit startupinfo= from the caller
        # must not clash with it (that used to raise TypeError)
        kwargs.setdefault('startupinfo', self._startupinfo)
        super().__init__(*args, **kwargs)

    def communicate_or_kill(self, *args, **kwargs):
        """Like communicate(), but kill the process if interrupted"""
        return process_communicate_or_kill(self, *args, **kwargs)
838
839
aa49acd1
S
def get_subprocess_encoding():
    """Return the encoding to use for text exchanged with subprocesses

    This is the preferred locale encoding on Windows (major version 5 and
    up) and the filesystem encoding elsewhere, with 'utf-8' as fallback.
    """
    on_modern_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if on_modern_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
850
851
def encodeFilename(s, for_subprocess=False):
    """Return the filename s unchanged; s must be a str

    for_subprocess is accepted but has no effect.
    """
    assert isinstance(s, str)
    return s
aa49acd1
S
855
856
def decodeFilename(b, for_subprocess=False):
    """Return the filename b unchanged

    for_subprocess is accepted but has no effect.
    """
    return b
8bf48f23 859
f07b74fc
PH
860
def encodeArgument(s):
    """Return the command line argument s as str, decoding it as ASCII if it is not a str"""
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    if isinstance(s, str):
        return s
    return s.decode('ascii')
f07b74fc
PH
866
867
def decodeArgument(b):
    """Return the command line argument b unchanged"""
    return b
aa49acd1
S
870
871
8271226a
PH
def decodeOption(optval):
    """Return the option value as a string

    bytes are decoded with the preferred encoding; None is passed through.
    """
    if optval is None:
        return optval

    decoded = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval
    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 880
5f6a1245 881
# A duration broken down into its components
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a duration in milliseconds into (hours, minutes, seconds, milliseconds)"""
    total_seconds, milliseconds = divmod(msec, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return _timetuple(hours=hours, minutes=minutes, seconds=seconds, milliseconds=milliseconds)
890
891
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as [H:]MM:SS, M:SS or S

    @param delim  Separator placed between hours, minutes and seconds
    @param msec   Whether to append the milliseconds as '.mmm'
    """
    hours, minutes, seconds, milliseconds = timetuple_from_msec(secs * 1000)
    if hours:
        formatted = '%d%s%02d%s%02d' % (hours, delim, minutes, delim, seconds)
    elif minutes:
        formatted = '%d%s%02d' % (minutes, delim, seconds)
    else:
        formatted = '%d' % seconds

    if not msec:
        return formatted
    return '%s.%03d' % (formatted, milliseconds)
4539dd30 901
a0ddb8a2 902
def _ssl_load_windows_store_certs(ssl_context, storename):
    """Load the usable certificates of a Windows certificate store into ssl_context

    Only 'x509_asn' encoded certificates that are trusted for everything or
    for server authentication are loaded. Certificates that cannot be
    loaded are skipped; nothing is loaded if the store may not be read.
    """
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    usable_certs = []
    try:
        for cert, encoding, trust in ssl.enum_certificates(storename):
            if encoding != 'x509_asn':
                continue
            if trust is True or ssl.Purpose.SERVER_AUTH.oid in trust:
                usable_certs.append(cert)
    except PermissionError:
        return

    for cert in usable_certs:
        try:
            ssl_context.load_verify_locations(cadata=cert)
        except ssl.SSLError:
            pass
a2366922 914
77562778 915
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler whose SSL context is set up from params

    Keys read from params:
      nocheckcertificate           Disable hostname and certificate verification
      legacyserverconnect          Allow legacy server connections and weaker ciphers
      compat_opts                  If it contains 'no-certifi', certifi is not used
      client_certificate           Path of a client certificate to load
      client_certificate_key       Key file of the client certificate
      client_certificate_password  Password of the client certificate

    Extra keyword arguments are passed to YoutubeDLHTTPSHandler.
    Raises YoutubeDLError if the client certificate cannot be loaded.
    """
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        # Prefer the certifi bundle if available, else the system certificates
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()
    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 949
732ea2f0 950
def bug_reports_message(before=';'):
    """Return the standard "please report this issue" text.

    @param before  text that precedes the message; trailing whitespace is
                   ignored. If it is empty or ends a sentence ('.', '!', '?')
                   the message starts with a capital letter.
    @returns       before (if any) and the message, joined by a single space
    """
    prefix = before.rstrip()
    text = (
        'please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , '
        'filling out the appropriate issue template. '
        'Confirm you are on the latest version using yt-dlp -U')

    starts_sentence = not prefix or prefix.endswith(('.', '!', '?'))
    if starts_sentence:
        text = text[:1].upper() + text[1:]

    if not prefix:
        return text
    return prefix + ' ' + text
08f2a92c
JMF
961
962
bf5b9d85
PM
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Default message; subclasses may override it with a fixed text
    msg = None

    def __init__(self, msg=None):
        """Use msg if given, else the class-level msg, else the class name."""
        if msg is None:
            if self.msg is None:
                self.msg = type(self).__name__
        else:
            self.msg = msg
        super().__init__(self.msg)
bf5b9d85
PM
973
974
# Exception types that ExtractorError treats as expected (network related)
network_exceptions = (
    compat_urllib_error.URLError,
    compat_http_client.HTTPException,
    socket.error,
    *((ssl.CertificateError,) if hasattr(ssl, 'CertificateError') else ()),
)
979
980
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.

        @param msg       error description; any object, it is converted with str()
        @param cause     optional underlying exception, shown as "(caused by ...)"
        @param video_id  optional id, prefixed to the message as "<id>: "
        @param ie        optional extractor name, prefixed as "[<ie>] "
        """
        # An error raised while a network exception is being handled is never a bug
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        # Use the stringified message: joining the raw `msg` would raise
        # TypeError whenever a non-str object (e.g. an exception) is passed
        super().__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            self.orig_msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        """Return the formatted traceback and/or cause as text, or None if there is neither."""
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None
01951dda 1011
1c256f70 1012
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected ExtractorError reporting an unsupported URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super().__init__(message, expected=True)
        self.url = url
1018
1019
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
1023
1024
773f291d
S
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # Always an expected error, whatever the caller passed for `expected`
        super().__init__(msg, **{**kwargs, 'expected': True})
        self.countries = countries
1036
1037
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """Store msg and, optionally, the sys.exc_info() of the original exception."""
        super().__init__(msg)
        # The original exception that caused the trouble, if known
        self.exc_info = exc_info
d77c3dfd
FV
1050
1051
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    Thrown by YoutubeDL when a requested entry cannot be found
    in the playlist info_dict.
    """
    msg = 'Entry not found in info'
498f5606 1059
1060
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """Append ': <filename>' to the default message when filename is given."""
        message = self.msg if filename is None else self.msg + f': {filename}'
        super().__init__(message)
d77c3dfd
FV
1073
1074
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    May be raised by a PostProcessor's .run() method to signal
    that the postprocessing task failed.
    """
5f6a1245 1081
5f6a1245 1082
class DownloadCancelled(YoutubeDLError):
    """Raised when the download queue should be interrupted."""
    msg = 'The download was cancelled'
8b0d7497 1086
8b0d7497 1087
class ExistingVideoReached(DownloadCancelled):
    """Download cancellation triggered by --break-on-existing."""
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1091
48f79687 1092
class RejectedVideoReached(DownloadCancelled):
    """Download cancellation triggered by --break-on-reject."""
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
51d9739f 1096
1097
class MaxDownloadsReached(DownloadCancelled):
    """Download cancellation triggered when the --max-downloads limit is reached."""
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1101
1102
class ReExtractInfo(YoutubeDLError):
    """Signals that the video info needs to be re-extracted."""

    def __init__(self, msg, expected=False):
        """Store msg and the `expected` flag on the exception."""
        super().__init__(msg)
        self.expected = expected
1109
1110
class ThrottledDownload(ReExtractInfo):
    """Raised when the download speed is below --throttled-rate."""
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Always uses the fixed class message and is never marked as expected
        super().__init__(self.msg, expected=False)
f2ebc5c7 1117
d77c3dfd 1118
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        """Append ': <err>' to the default message when err is given."""
        message = self.msg if err is None else self.msg + f': {err}'
        super().__init__(message)
d77c3dfd
FV
1131
1132
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """Both downloaded and expected are sizes in bytes."""
        message = f'Downloaded {downloaded} bytes, expected {expected} bytes'
        super().__init__(message)
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 1146
5f6a1245 1147
class XAttrMetadataError(YoutubeDLError):
    """Error carrying an errno-style code and a message, classified into `reason`.

    reason is one of 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        self.reason = self._classify()

    def _classify(self):
        """Derive the reason from the error code, falling back to the message text."""
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            return 'NO_SPACE'
        if self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            return 'VALUE_TOO_LONG'
        return 'NOT_SUPPORTED'
1162
1163
class XAttrUnavailableError(YoutubeDLError):
    # NOTE(review): going by the name, presumably raised when extended
    # attribute support is missing; confirm against the call sites
    pass
1166
1167
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class and, if configured, bind it to a source address.

    @param ydl_handler  handler whose _params mapping is consulted for 'source_address'
    @param http_class   connection class; called with *args and **kwargs
    @param is_https     not used in this function
    @returns            the connection object
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            # NOTE: the `source_address` parameter shadows the outer variable;
            # here it is indexed, i.e. expected to be a (host, port) pair
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # Pick the address family from the textual form of the source address
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            # Try each candidate in turn; remember the last failure
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        # Only override the connection factory on objects that expose one
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
1213
1214
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker header.

    Without the marker, headers is returned as is (same object). With it, a
    new dict is returned that lacks both the marker and any Accept-Encoding
    header (matched case-insensitively); the input is not modified.
    """
    marker = 'Youtubedl-no-compression'
    if marker not in headers:
        return headers

    return {
        name: value for name, value in headers.items()
        if name != marker and name.lower() != 'accept-encoding'}
87f0e62d
YCH
1223
1224
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Keep params on the handler; remaining arguments go to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req, through a SOCKS connection class if 'Ytdl-socks-proxy' is set."""
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            # Internal header; removed here so it is not part of the request
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first, then a zlib-wrapped one."""
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        """Decompress brotli data; empty input is returned unchanged."""
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        """Escape the request URL and fill in default and encoding headers."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return req

    def http_response(self, req, resp):
        """Decode gzip/deflate/br bodies and percent-encode redirect locations.

        For a decoded body, a new response object is built around the
        decompressed data and the Content-encoding header is removed.
        """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; re-raise the
                # first error if no attempt succeeds
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = compat_urllib_request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets the same request/response processing
    https_request = http_request
    https_response = http_response
bf50b038 1353
5de90176 1354
71aff188
YCH
def make_socks_conn_class(base_class, socks_proxy):
    """Create a subclass of base_class that connects through a SOCKS proxy.

    @param base_class   HTTPConnection or HTTPSConnection (sub)class
    @param socks_proxy  proxy URL; scheme must be one of socks, socks4,
                        socks4a or socks5 (case-insensitive). The port
                        defaults to 1080; username and password are optional
    @returns            the connection class
    @raises ValueError  if the proxy URL has an unsupported scheme
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Fail with a clear message instead of an UnboundLocalError further down
        raise ValueError(f'Unsupported SOCKS proxy scheme: {url_components.scheme!r}')

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, wrap the proxied socket in TLS
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1396
1397
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler with SOCKS proxy support and a hint for TLS handshake failures."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req, through a SOCKS connection class if 'Ytdl-socks-proxy' is set."""
        connection_cls = self._https_conn_class

        # Forward the context / hostname checking settings when the base
        # handler provides them
        open_kwargs = {
            option: getattr(self, f'_{option}')
            for option in ('context', 'check_hostname')
            if hasattr(self, f'_{option}')}

        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            connection_cls = make_socks_conn_class(connection_cls, proxy_url)
            del req.headers['Ytdl-socks-proxy']

        connect = functools.partial(_create_http_connection, self, connection_cls, True)
        try:
            return self.do_open(connect, req, **open_kwargs)
        except urllib.error.URLError as e:
            handshake_failed = (
                isinstance(e.reason, ssl.SSLError)
                and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE')
            if handshake_failed:
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise
be4a824d
PH
1426
1427
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    Cookie jar reading and writing Netscape-format cookie files.
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    # Lines with this prefix are cookie entries, not comments
    _HTTPONLY_PREFIX = '#HttpOnly_'
    # Number of tab-separated fields in a cookie entry
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        """filename may be a path (str, bytes, os.PathLike) or another object such as an open file."""
        super().__init__(None, *args, **kwargs)
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        """Return the cookie-file spelling ('TRUE'/'FALSE') of a condition."""
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        """Tell whether file is a path rather than a file object."""
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        """Context manager yielding a file object for a path or a given file object.

        Paths are opened as UTF-8 text and closed on exit. Other objects are
        yielded as they are (truncated first when write is true) and are
        not closed.
        """
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        """Write one tab-separated line per cookie to f, skipping discarded/expired ones unless told otherwise."""
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117

        filename defaults to self.filename; ValueError is raised if neither
        is set. Extra arguments are passed on to _really_save().
        Note that cookies without an expiry time stay modified in the jar
        (expires is set to 0). """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file.

        filename defaults to self.filename; ValueError is raised if neither
        is set. Malformed entries are skipped with a warning, except that a
        bad line starting with '[', '{' or '"' raises LoadError, since the
        file is then taken to be JSON.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            # Strip the HttpOnly marker so that the entry is parsed as a cookie
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        # Collect the validated lines in memory and let the base class parse them
        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    # The appended space keeps [0] valid for an empty line
                    if f'{line.strip()} '[0] in '[{"':
                        raise compat_cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1560
1561
a6420bf5
S
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also handles HTTPS requests and responses."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        """Delegate to HTTPCookieProcessor.http_response()."""
        return super().http_response(request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1571
1572
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    Compared to the base handler, it:
     - also follows HTTP response status code 308 Permanent Redirect [2],
       which is used by some sites [3]
     - turns the request into a GET where a 303, or a 301/302 in response
       to a POST, asks for it

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        Called by the http_error_30x methods when a redirection response is
        received. Returns the new Request to perform the redirect with, or
        raises HTTPError when this code/method combination must not be
        followed.
        """
        method = req.get_method()
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.
        followable = (
            code in (301, 302, 303, 307, 308) and method in ("GET", "HEAD")
            or code in (301, 302, 303) and method == "POST")
        if not followable:
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        # Headers describing the body are not carried over to the new request
        body_headers = ("content-length", "content-type")
        newheaders = {}
        for key, value in req.headers.items():
            if key.lower() not in body_headers:
                newheaders[key] = value

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if (code == 303 and method != 'HEAD') or (code in (301, 302) and method == 'POST'):
            method = 'GET'

        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=method)
fca6dba8
S
1633
1634
46f59e89
S
def extract_timezone(date_str):
    """Split a trailing UTC designator ('Z' or +hh[:]mm / -hh[:]mm) off date_str.

    @returns (timezone, date_str): the offset as datetime.timedelta (zero
             if none was found or it was 'Z') and the date string without
             the designator
    """
    found = re.search(
        r'''(?x)
            ^.{8,}? # >=8 char non-TZ prefix, if present
            (?P<tz>Z| # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]? # optional space
                (?P<sign>\+|-) # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
            $)
        ''', date_str)
    if not found:
        return datetime.timedelta(), date_str

    remainder = date_str[:-len(found.group('tz'))]
    if not found.group('sign'):
        # Plain 'Z': UTC
        return datetime.timedelta(), remainder

    direction = 1 if found.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(found.group('hours')),
        minutes=direction * int(found.group('minutes')))
    return offset, remainder
1659
1660
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    @param date_str   date in the form YYYY-MM-DD<delimiter>HH:MM:SS, optionally
                      with fractional seconds (which are dropped)
    @param delimiter  separator between the date and the time part
    @param timezone   UTC offset as datetime.timedelta; if None, it is
                      extracted from date_str
    @returns          the timestamp, or None if date_str is None or does
                      not match the expected format
    """
    if date_str is None:
        return None

    # Drop fractional seconds
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        local_dt = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S')
        return calendar.timegm((local_dt - timezone).timetuple())
    except ValueError:
        return None
912b38b4
PH
1676
1677
46f59e89
S
def date_formats(day_first=True):
    """Return the day-first or the month-first collection of date formats."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1680
1681
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None if date_str is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Replace commas, then remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str.replace(',', ' '))
    cleaned = extract_timezone(cleaned)[1]

    result = None
    # Every format is tried; the last one that parses determines the result
    for fmt in date_formats(day_first):
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Fall back to the RFC 2822 style parser
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            try:
                result = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
            except ValueError:
                pass

    if result is None:
        return None
    return compat_str(result)
bf50b038 1704
5f6a1245 1705
46f59e89
S
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    @param date_str   the date; ',' and '|' characters are ignored
    @param day_first  selects which collection of formats is tried,
                      see date_formats()
    @returns          the timestamp as int, or None if date_str is None or
                      cannot be parsed
    """
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    # NOTE(review): any "PM" in the string adds 12 hours, so a "12:xx PM"
    # input presumably ends up 12 hours late - confirm before relying on it
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        # The UTC offset was already stripped from date_str by
        # extract_timezone(), so it has to be applied here as well
        return calendar.timegm(timetuple) + pm_delta * 3600 - int(timezone.total_seconds())
46f59e89
S
1735
1736
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL

    @param url          URL or path, or None
    @param default_ext  Value returned when no plausible extension is found
    """
    if url is None or '.' not in url:
        return default_ext

    # Whatever follows the last dot before the query string
    without_query, _, _ = url.partition('?')
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
73e79f2a 1748
5f6a1245 1749
824fa511
S
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return `filename` with its extension replaced by "<sub_lang>.<sub_format>"

    The arguments are handed to replace_extension(), which decides whether
    the existing extension is dropped (see `expected_real_ext` there).
    """
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
d4051a8e 1752
5f6a1245 1753
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    round_to_unit = precision == 'auto'
    if round_to_unit:
        precision = 'microsecond'

    now = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return now
    if date_str == 'yesterday':
        return now - datetime.timedelta(days=1)

    relative = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if relative is None:
        # A plain DATE in the requested format
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # The part before the offset may itself be any supported expression
    base = datetime_from_str(relative.group('start'), precision, format)
    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount
    unit = relative.group('unit')

    if unit in ('month', 'year'):
        # Calendar months vary in length, so they cannot be expressed as a timedelta
        shifted = datetime_add_months(base, amount * 12 if unit == 'year' else amount)
        unit = 'day'
    else:
        if unit == 'week':
            unit, amount = 'day', amount * 7
        shifted = base + datetime.timedelta(**{unit + 's': amount})

    return datetime_round(shifted, unit) if round_to_unit else shifted
1794
1795
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    @raises ValueError  In strict mode, when date_str matches neither pattern
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if re.fullmatch(allowed, date_str) is None:
            raise ValueError(f'Invalid date format "{date_str}"')
    moment = datetime_from_str(date_str, precision='microsecond', format=format)
    return moment.date()
1806
1807
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months."""
    # Work with a zero-based month index so that divmod yields the year carry
    year_shift, month_index = divmod(dt.month + months - 1, 12)
    year, month = dt.year + year_shift, month_index + 1
    # Clamp the day to the length of the target month (e.g. Jan 31 -> Feb 28/29)
    last_day = calendar.monthrange(year, month)[1]
    return dt.replace(year, month, min(dt.day, last_day))
1815
1816
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision

    @param precision  microsecond|second|minute|hour|day
                      'microsecond' returns dt unchanged
    """
    if precision == 'microsecond':
        return dt

    unit_seconds = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    epoch_seconds = calendar.timegm(dt.timetuple())
    step = unit_seconds[precision]
    # Round half up to the nearest multiple of the unit
    rounded = ((epoch_seconds + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
5f6a1245
JW
1833
1834
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not in 'YYYYMMDD' format are returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1843
5f6a1245 1844
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A missing bound defaults to the earliest/latest representable date.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start, strict=True))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end, strict=True))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed as a date string
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return ' - '.join((self.start.isoformat(), self.end.isoformat()))
c496ca96
PH
1874
1875
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Decode with the user's preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
1884
1885
@functools.cache
def get_windows_version():
    ''' Get Windows version. None if it's not running on Windows '''
    if compat_os_name != 'nt':
        return None
    # Index 1 of win32_ver() is the version string
    return version_tuple(platform.win32_ver()[1])
1893
1894
def write_string(s, out=None, encoding=None):
    """Write the string `s` to a stream and flush it

    @param s         Text to write (must be a str)
    @param out       Target stream; sys.stderr when omitted or falsy
    @param encoding  Encoding used when bytes have to be written. Falls back to
                     the stream's own encoding and then to preferredencoding()
    """
    assert isinstance(s, str)
    out = out or sys.stderr

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        # NOTE(review): adds a space before each run of line breaks; presumably
        # a workaround for a Windows terminal rendering quirk - confirm
        s = re.sub(r'([\r\n]+)', r' \1', s)

    # Keep `out` pointing at the stream we were given: its `encoding` must be
    # read from the text layer, not from the binary buffer underneath it
    enc, buffer = None, out
    if 'b' in getattr(out, 'mode', ''):
        enc = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        buffer = out.buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()

    # Characters the encoding cannot represent are dropped instead of raising
    buffer.write(s.encode(enc, 'ignore') if enc else s)
    out.flush()
1911
1912
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a bytes-like object (or a str) into a list of integer code points

    Empty input yields an empty list.
    """
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Items are characters rather than ints
        return list(map(ord, bs))
    return list(bs)  # Python 3
1920
c257baff 1921
def intlist_to_bytes(xs):
    """Pack a sequence of integers into bytes, one unsigned byte per item

    Empty input yields b''.
    """
    if xs:
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
c38b1e77
PH
1926
1927
class LockingUnsupportedError(OSError):
    """Raised when file locking is requested but is not available"""

    # IOError is the same class as OSError on Python 3
    msg = 'File locking is not supported on this platform'

    def __init__(self):
        super().__init__(self.msg)
1933
1934
c1c9a79c
PH
# Cross-platform file locking
#
# Defines the module-level helpers `_lock_file(f, exclusive, block)` and
# `_unlock_file(f)` with one of three implementations:
#   * Windows: LockFileEx/UnlockFileEx from kernel32, called through ctypes
#   * elsewhere: fcntl.flock, falling back to fcntl.lockf
#   * no fcntl module: stubs that raise LockingUnsupportedError
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # ctypes layout of the structure handed to (Un)LockFileEx as last argument
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to both calls, i.e. the locked
    # range starts at offset 0 and spans 0x7fffffffffffffff bytes
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        """Lock the file object `f` using LockFileEx

        @param exclusive  Request an exclusive lock (flag 0x2) instead of a shared one
        @param block      Wait for the lock; otherwise flag 0x1 is passed
        @raises BlockingIOError  If LockFileEx reports failure
        """
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object because _unlock_file passes the same pointer back
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # NOTE(review): 0x2/0x1 look like LOCKFILE_EXCLUSIVE_LOCK and
        # LOCKFILE_FAIL_IMMEDIATELY of the Win32 API - confirm against its docs
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file on `f`

        @raises OSError  If UnlockFileEx reports failure
        """
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            """Lock the file object `f` using fcntl

            @param exclusive  Use LOCK_EX instead of LOCK_SH
            @param block      Wait for the lock; otherwise LOCK_NB is added
            """
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # BlockingIOError is an OSError; re-raise it here so that the
                # lockf() fallback below is not attempted for it
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            """Release the lock on `f`, with the same flock()/lockf() fallback as _lock_file"""
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:
        # No fcntl module: locking is unavailable

        def _lock_file(f, exclusive, block):
            """Always raises LockingUnsupportedError"""
            raise LockingUnsupportedError()

        def _unlock_file(f):
            """Always raises LockingUnsupportedError"""
            raise LockingUnsupportedError()
c1c9a79c
PH
2018
2019
class locked_file:
    """File object wrapper that holds a lock on the file while it is in use

    The file is opened (and created for writable modes) in __init__, locked
    in __enter__/open() and unlocked and closed in __exit__/close().
    Attributes that are not defined here are looked up on the wrapped file.

    @param filename  Path of the file to open
    @param mode      One of r, rb, a, ab, w, wb; anything else raises NotImplementedError
    @param block     Whether to wait for the lock
    @param encoding  Passed on to os.fdopen
    """

    # Set while the lock is held by this object
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        # Readers share the lock, every other mode takes it exclusively
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncate only now that the lock is held
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno != errno.ESPIPE:  # Illegal seek, expected when self.f is a FIFO
                    raise
        return self

    def unlock(self):
        """Release the lock if it is held; the file stays open"""
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    # Allow use without a `with` statement
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
a3125791 2080
4eb7f1d1 2081
@functools.cache
def get_filesystem_encoding():
    """Return the filesystem encoding reported by Python, or 'utf-8' if it reports None"""
    detected = sys.getfilesystemencoding()
    if detected is None:
        return 'utf-8'
    return detected
2086
2087
def shell_quote(args):
    """Quote each argument for the shell and join them with spaces into one string"""
    encoding = get_filesystem_encoding()
    return ' '.join(
        # We may get a filename encoded with 'encodeFilename'
        compat_shlex_quote(arg.decode(encoding) if isinstance(arg, bytes) else arg)
        for arg in args)
9d4660ca
PH
2097
2098
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    @param url   URL, possibly already carrying smuggled data
    @param data  Dict of JSON-serializable values to attach; it is not modified
    @returns     The URL with the data encoded into its fragment
    """
    url, idata = unsmuggle_url(url, {})
    # Values already smuggled in the URL take precedence over the new ones.
    # Merge into a fresh dict so that the caller's `data` is left untouched
    merged = {**data, **idata}
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
9d4660ca
PH
2107
2108
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url() into (url, data)

    @returns  (smug_url, default) if the URL carries no smuggled data
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    plain_url, _, fragment = smug_url.rpartition('#')
    payload = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return plain_url, json.loads(payload)
02dbf93f
PH
2116
2117
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc

    @param num     Number to format (anything float_or_none accepts)
    @param fmt     %-format string that receives (scaled number, suffix)
    @param factor  Ratio between consecutive suffixes; 1024 gives Ki, Mi, ...
    @returns       The formatted string, or None for invalid or negative numbers
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    if num == 0:
        exponent = 0
    else:
        # Clamp to both ends of the suffix table. Without the lower bound, a number
        # below 1/factor yields a negative exponent that indexes from the end
        exponent = max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
e0fd9573 2130
2131
def format_bytes(bytes):
    """Format a byte count with binary suffixes and two decimals, or 'N/A' if it cannot be formatted"""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
f53c966a 2134
1c088fa8 2135
fb47597b
S
def lookup_unit_table(unit_table, s):
    """Parse "<number><unit>" at the start of `s` and scale the number

    @param unit_table  Mapping of unit name to multiplier
    @param s           String to parse; ',' is accepted as decimal separator
    @returns           int(number * multiplier), or None if `s` does not match
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % alternatives, s)
    if found is None:
        return None
    quantity = float(found.group('num').replace(',', '.'))
    return int(quantity * unit_table[found.group('unit')])
2145
2146
be64b5b0
PH
def parse_filesize(s):
    """Parse a human readable file size such as "1.5 MiB" into a number of bytes

    @returns  The size as int, or None if `s` is None or cannot be parsed
    """
    if s is None:
        return None

    # (symbol, decimal prefix name, binary prefix name), smallest first
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1, 'bytes': 1}
    for power, (symbol, decimal_name, binary_name) in enumerate(prefixes, start=1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = symbol.lower()
        unit_table.update({
            f'{symbol}iB': binary,
            f'{symbol}B': decimal,
            f'{lower}B': binary,
            f'{symbol}b': decimal,
            f'{lower}b': decimal,
            f'{decimal_name}bytes': decimal,
            f'{binary_name}bytes': binary,
        })

    return lookup_unit_table(unit_table, s)
2216
2217
def parse_count(s):
    """Parse a count such as "1,234", "1.2M" or "views 5k" into an int

    @returns  The count, or None if `s` is None or cannot be parsed
    """
    if s is None:
        return None

    # Drop a non-numeric prefix that is followed by whitespace
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }
    scaled = lookup_unit_table(multipliers, s)
    if scaled is not None:
        return scaled

    # Last resort: a leading number followed by whitespace or the end of the string
    leading_number = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(leading_number.group(1)) if leading_number else None
be64b5b0 2245
2f7ae819 2246
def parse_resolution(s, *, lenient=False):
    """Extract a resolution from a string

    Recognizes "<width>x<height>", "<height>p" / "<height>i" and "4k" / "8k".

    @param lenient  Also accept "<width>x<height>" directly next to letters or digits
    @returns        Dict with 'width' and/or 'height'; empty if nothing was found
    """
    if s is None:
        return {}

    dimensions_re = (
        r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)' if lenient
        else r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])')
    dimensions = re.search(dimensions_re, s)
    if dimensions:
        return {
            'width': int(dimensions.group('w')),
            'height': int(dimensions.group('h')),
        }

    scan_lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if scan_lines:
        return {'height': int(scan_lines.group(1))}

    # 4k -> 2160 and 8k -> 4320
    marketing_name = re.search(r'\b([48])[kK]\b', s)
    if marketing_name:
        return {'height': int(marketing_name.group(1)) * 540}

    return {}
2270
2271
0dc41787
S
def parse_bitrate(s):
    """Return the number in front of "kbps" in `s` as int, or None if there is none"""
    if not isinstance(s, compat_str):
        return None
    found = re.search(r'\b(\d+)\s*kbps', s)
    return int(found.group(1)) if found else None
2278
2279
def month_by_name(name, lang='en'):
    """ Return the number of a month by its name in the given language

    Unknown languages fall back to the English names.
    Returns None if the name is not a month name. """

    candidates = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    if name not in candidates:
        return None
    return candidates.index(name) + 1
2289
2290
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations, or None if the abbreviation is unknown """

    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, start=1):
        # An abbreviation is the first three letters of the month name
        if full_name[:3] == abbrev:
            return number
    return None
18258362
JMF
2299
2300
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start one of the predefined entities or a
    numeric character reference are left alone."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
e3946f98
PH
2307
2308
def setproctitle(title):
    """Set the name of the current process through libc's prctl(), if possible

    Does nothing on Jython, when libc.so.6 cannot be loaded or when the
    loaded libc has no prctl().

    @param title  New process name
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode()
    # Initializing the buffer from the bytes makes it one item larger than the
    # title, so the C string handed to prctl() is always NUL-terminated
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # NOTE(review): 15 is presumably PR_SET_NAME from linux/prctl.h - confirm
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
2333
2334
def remove_start(s, start):
    """Return `s` without the prefix `start`; `s` is returned as is when it is None or has no such prefix"""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
29eb5174
PH
2337
2338
def remove_end(s, end):
    """Return `s` without the suffix `end`

    `s` is returned as is when it is None, when `end` is empty or when `s`
    does not end with `end`.
    """
    # An empty `end` must be rejected explicitly: s[:-0] is the empty string
    if s is None or not end or not s.endswith(end):
        return s
    return s[:-len(end)]
2b9faf55
PH
2341
2342
31b2051e
S
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding `s`

    None, strings shorter than two characters and strings that are not
    enclosed in a matching pair are returned unchanged.
    """
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
2350
2351
b6e0c7d2
U
def get_domain(url):
    """Return the host part of `url` without scheme and leading "www.", or None if there is none"""
    found = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    if found is None:
        return None
    return found.group('domain')
2355
2356
def url_basename(url):
    """Return the last segment of the path of `url`, ignoring leading/trailing slashes"""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    # Everything after the last slash, or the whole path if there is none
    return trimmed_path.rpartition('/')[2]
aa94a6d3
PH
2360
2361
02dc0a36
S
def base_url(url):
    """Return `url` up to and including the last slash before any of ?, # or &

    The URL must start with http:// or https:// and contain such a slash;
    otherwise the failed match raises AttributeError.
    """
    found = re.match(r'https?://[^?#&]+/', url)
    return found.group()
2364
2365
def urljoin(base, path):
    """Join `path` onto `base`, tolerating bytes and invalid input

    @returns  `path` itself if it already is an absolute or scheme-relative URL,
              the joined URL if `base` is an http(s) or scheme-relative URL,
              and None for anything else
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not isinstance(path, compat_str) or not path:
        return None
    # An optional scheme followed by // means there is nothing to join
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path

    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, compat_str):
        return None
    if not re.match(r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
2379
2380
aa94a6d3
PH
class HEADRequest(compat_urllib_request.Request):
    """Request whose method is always HEAD"""

    def get_method(self):
        return 'HEAD'
7217e148
PH
2384
2385
95cf60e8
S
class PUTRequest(compat_urllib_request.Request):
    """Request whose method is always PUT"""

    def get_method(self):
        return 'PUT'
2389
2390
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert `v` to int, returning `default` if that is not possible

    @param scale     The result is floor-divided by this
    @param invscale  The result is multiplied by this (before dividing)
    @param get_attr  If given, convert this attribute of `v` instead of `v` itself
    """
    if v is not None and get_attr:
        v = getattr(v, get_attr, None)
    # The scaling stays inside the guarded region on purpose: a failure there
    # also results in `default`
    with contextlib.suppress(ValueError, TypeError, OverflowError):
        return int(v) * invscale // scale
    return default
9732d77e 2398
9572013d 2399
40a90862
JMF
def str_or_none(v, default=None):
    """Convert `v` to a string, or return `default` if `v` is None"""
    if v is None:
        return default
    return compat_str(v)
2402
9732d77e
PH
2403
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Integers are returned as they are. Strings are stripped of ',', '.' and
    '+' before conversion. Anything else is passed to int_or_none unchanged. """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, compat_str):
        int_str = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(int_str)
608d11f5
PH
2411
2412
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert `v` to float, returning `default` if that is not possible

    @param scale     The result is divided by this
    @param invscale  The result is multiplied by this (before dividing)
    """
    if v is None:
        return default
    with contextlib.suppress(ValueError, TypeError):
        return float(v) * invscale / scale
    return default
43f775e4
PH
2420
2421
c7e327c4
S
def bool_or_none(v, default=None):
    """Return `v` if it is a bool, `default` otherwise"""
    if isinstance(v, bool):
        return v
    return default
2424
2425
53cd37ba
S
def strip_or_none(v, default=None):
    """Return `v` with surrounding whitespace stripped if it is a string, `default` otherwise"""
    if isinstance(v, compat_str):
        return v.strip()
    return default
b72b4431
S
2428
2429
af03000a
S
def url_or_none(url):
    """Return the stripped `url` if it looks like a URL, None otherwise

    Accepted are scheme-relative URLs (//...) and the schemes matched by the
    pattern below (http(s), rtmp/rtsp variants, mms and ftp(s)).
    """
    if not url:
        return None
    if not isinstance(url, compat_str):
        return None
    candidate = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', candidate):
        return candidate
    return None
af03000a
S
2435
2436
3e9b66d7
LNO
def request_to_url(req):
    """Return the full URL of a Request object; any other value is returned unchanged"""
    return req.get_full_url() if isinstance(req, compat_urllib_request.Request) else req
2442
2443
def strftime_or_none(timestamp, date_format, default=None):
    """Format a timestamp with strftime, returning `default` on any failure

    @param timestamp    UNIX timestamp (int/float, interpreted as UTC) or a
                        string in YYYYMMDD format
    @param date_format  strftime format of the result
    @param default      Returned if the timestamp has an unsupported type,
                        cannot be parsed or is out of range
    """
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        # For unsupported types datetime_object is still None here -> AttributeError
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError, OverflowError, OSError):
        # OverflowError/OSError: utcfromtimestamp() got a timestamp that is out
        # of range for the platform
        return default
2454
2455
608d11f5 2456def parse_duration(s):
f9934b96 2457 if not isinstance(s, str):
608d11f5 2458 return None
ca7b3246 2459 s = s.strip()
38d79fd1 2460 if not s:
2461 return None
ca7b3246 2462
acaff495 2463 days, hours, mins, secs, ms = [None] * 5
8bd1c00b 2464 m = re.match(r'''(?x)
2465 (?P<before_secs>
2466 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2467 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2468 (?P<ms>[.:][0-9]+)?Z?$
2469 ''', s)
acaff495 2470 if m:
8bd1c00b 2471 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
acaff495 2472 else:
2473 m = re.match(
056653bb
S
2474 r'''(?ix)(?:P?
2475 (?:
1c1b2f96 2476 [0-9]+\s*y(?:ears?)?,?\s*
056653bb
S
2477 )?
2478 (?:
1c1b2f96 2479 [0-9]+\s*m(?:onths?)?,?\s*
056653bb
S
2480 )?
2481 (?:
1c1b2f96 2482 [0-9]+\s*w(?:eeks?)?,?\s*
056653bb 2483 )?
8f4b58d7 2484 (?:
1c1b2f96 2485 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
8f4b58d7 2486 )?
056653bb 2487 T)?
acaff495 2488 (?:
1c1b2f96 2489 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
acaff495 2490 )?
2491 (?:
1c1b2f96 2492 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
acaff495 2493 )?
2494 (?:
2495 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2496 )?Z?$''', s)
acaff495 2497 if m:
2498 days, hours, mins, secs, ms = m.groups()
2499 else:
15846398 2500 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2501 if m:
2502 hours, mins = m.groups()
2503 else:
2504 return None
2505
acaff495 2506 if ms:
19a03940 2507 ms = ms.replace(':', '.')
2508 return sum(float(part or 0) * mult for part, mult in (
2509 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
91d7d0b3
JMF
2510
2511
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` in front of the extension of `filename`

    If `expected_real_ext` is given and the actual extension differs from it,
    `ext` is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return f'{filename}.{ext}'
    return f'{name}.{ext}{real_ext}'
d70ad093
PH
2518
2519
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of `filename` with `ext`

    If `expected_real_ext` is given and the actual extension differs from it,
    `ext` is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    unexpected_ext = bool(expected_real_ext) and real_ext[1:] != expected_real_ext
    stem = filename if unexpected_ext else name
    return '{}.{}'.format(stem, ext)
2525
2526
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False if the binary could not be started. """
    try:
        proc = Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        # Only whether the process could be started matters; its output is discarded
        proc.communicate_or_kill()
    except OSError:
        return False
    return exe
b7ab0590
PH
2535
2536
def _get_exe_version_output(exe, args, *, to_screen=None):
    """Run `exe` with `args` and return its combined stdout/stderr as text

    @param to_screen  Optional callable that receives a message describing the command
    @returns          The output, or False if the executable could not be started
    """
    if to_screen:
        to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        proc = Popen(
            [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = proc.communicate_or_kill()
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return output
cae97f65
PH
2552
2553
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract the version from the output of an executable

    @param version_re    Regex whose first group is the version;
                         defaults to matching "version <something>"
    @param unrecognized  Returned if the regex does not match
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2563
2564
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    False is also returned if the executable produced no output """
    output = _get_exe_version_output(exe, args)
    if not output:
        return False
    return detect_exe_version(output, version_re, unrecognized)
2571
2572
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList

    Items are pulled from the iterable only when needed and are kept in a
    cache, so every item is produced at most once."""

    class IndexError(IndexError):
        # Raised by __getitem__ instead of the builtin IndexError (of which it is a subclass)
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        """
        @param iterable  Source of the items
        @param reverse   Present the items in reverse order
        @param _cache    List of already evaluated items to start from (used by
                         __reversed__ and __copy__)
        """
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # Replay what has been evaluated so far, then continue with the iterable
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        """Evaluate the entire iterable and return the cache (in source order)"""
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        # The slice returns a new list, reversed if this LazyList is reversed
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        """Map an index of the reversed view to the matching index of the cache"""
        return None if x is None else -(x + 1)

    def __getitem__(self, idx):
        """Return the item (int index) or list of items (slice) at idx

        @raises TypeError        If idx is neither an int nor a slice
        @raises self.IndexError  If the index is out of range
        """
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            # step 0 makes both open-ended checks below false for plain indices
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                # Here IndexError is the builtin; it is re-raised as the class' own subclass
                raise self.IndexError(e) from e
        # Number of items still missing from the cache to reach the highest index needed
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Look up a single item instead of evaluating everything via __len__
        try:
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        # The length is only known after evaluating the entire iterable
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # The new object shares both the iterable and the cache with this one
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        # The new object shares both the iterable and the cache with this one
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2660
483336e7 2661
class PagedList:
    """Base class for lists whose entries are fetched page by page

    Subclasses implement _getslice(start, end).

    @param pagefunc   Callable that takes a page number and returns an iterable
                      of the entries on that page
    @param pagesize   Number of entries per page
    @param use_cache  Keep the entries of every fetched page in memory
    """

    class IndexError(IndexError):
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        """Return the entries of page `pagenum` as a list, from the cache if possible"""
        cached = self._cache.get(pagenum)
        if cached is not None:
            return cached

        # Pages beyond the known page count are empty and need no fetching
        if pagenum > self._pagecount:
            fetched = []
        else:
            fetched = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = fetched
        return fetched

    def getslice(self, start=0, end=None):
        """Return the entries from index `start` up to (excluding) `end` as a list"""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if entries:
            return entries[0]
        raise self.IndexError()
55575225 2700
9c44d242
PH
2701
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        size = self._pagesize
        for pagenum in itertools.count(start // size):
            page_first = pagenum * size
            page_limit = page_first + size
            if start >= page_limit:
                continue

            # Offset of the first wanted entry inside this page
            lo = start % size if page_first <= start < page_limit else 0
            # Offset just past the last wanted entry, when the slice ends in this page
            hi = None
            if end is not None and page_first <= end <= page_limit:
                hi = ((end - 1) % size) + 1

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Record the previous page as the last one before re-raising
                self._pagecount = pagenum - 1
                raise
            if lo != 0 or hi is not None:
                page_results = page_results[lo:hi]
            yield from page_results

            # A page that is not "full" (fewer than pagesize entries) is taken
            # to be the last one, so no further pages are queried
            if len(page_results) + lo < size:
                break

            # The whole page was consumed and the slice ends exactly at the
            # page border: the next page is not interesting
            if end == page_limit:
                break
81c2f20b
PH
2741
2742
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        first_page = start // self._pagesize
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = min(self._pagecount, end // self._pagesize + 1)
            remaining = end - start
        # Entries to drop from the front of the first fetched page
        to_skip = start - first_page * self._pagesize

        for pagenum in range(first_page, stop_page):
            page_results = self.getpage(pagenum)
            if to_skip:
                page_results = page_results[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page_results) >= remaining:
                    yield from page_results[:remaining]
                    break
                remaining -= len(page_results)
            yield from page_results
9c44d242
PH
2767
2768
def uppercase_escape(s):
    r"""Replace each ``\UXXXXXXXX`` escape (8 hex digits) in ``s`` with the character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def _decode_match(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _decode_match, s)
0fe2ff78
YCH
2775
2776
def lowercase_escape(s):
    r"""Replace each ``\uXXXX`` escape (4 hex digits) in ``s`` with the character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def _decode_match(mobj):
        return decode(mobj.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _decode_match, s)
b53466e1 2783
d05cfe06
S
2784
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Characters that are passed through unquoted
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, safe_chars)
d05cfe06
S
2788
2789
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    escaped = {
        # The host is IDNA-encoded; the other components are percent-escaped
        'netloc': parts.netloc.encode('idna').decode('ascii'),
        'path': escape_rfc3986(parts.path),
        'params': escape_rfc3986(parts.params),
        'query': escape_rfc3986(parts.query),
        'fragment': escape_rfc3986(parts.fragment),
    }
    return parts._replace(**escaped).geturl()
2800
62e609ab 2801
def parse_qs(url):
    """Parse the query component of ``url`` with ``compat_parse_qs``."""
    query = compat_urllib_parse_urlparse(url).query
    return compat_parse_qs(query)
2804
2805
62e609ab
PH
def read_batch_urls(batch_fd):
    """Read the URLs listed in a batch file.

    @param batch_fd  An iterable file-like object yielding lines (str or
                     bytes); it is closed before returning
    @returns         A list of URLs; blank lines and lines starting with
                     "#", ";" or "]" are dropped
    """
    def fixup(url):
        if not isinstance(url, str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        # (maxsplit is passed by keyword; passing it positionally is deprecated)
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2823
2824
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments with ``compat_urllib_parse_urlencode`` and return ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
2827
2828
def update_url_query(url, query):
    """Return ``url`` with the entries of ``query`` merged into its query string.

    Keys already present in the URL are overwritten; a falsy ``query``
    returns ``url`` unchanged.
    """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parts.query)
    params.update(query)
    new_query = compat_urllib_parse_urlencode(params, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
16392824 2837
8e60dc75 2838
ed0291d1
S
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request from ``req`` with optional overrides.

    ``headers`` are merged over the original headers, ``data`` and ``url``
    replace the original ones when truthy, and ``query`` is merged into the
    URL. HEAD and PUT requests keep their request type; a ``timeout``
    attribute, if present, is carried over.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)

    method = req.get_method()
    request_cls = {
        'HEAD': HEADRequest,
        'PUT': PUTRequest,
    }.get(method, compat_urllib_request.Request)

    new_req = request_cls(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2857
2858
def _multipart_encode_impl(data, boundary):
    """Encode ``data`` as multipart/form-data using the given ``boundary``.

    @returns  A tuple ``(body_bytes, content_type)``
    @raises   ValueError if the boundary occurs inside any encoded field
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary

    # Encode the boundary once instead of on every iteration
    boundary_bytes = boundary.encode('ascii')
    delimiter = b'--' + boundary_bytes + b'\r\n'

    chunks = []
    for k, v in data.items():
        if isinstance(k, str):
            k = k.encode()
        if isinstance(v, str):
            v = v.encode()
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary_bytes in content:
            raise ValueError('Boundary overlaps with data')
        chunks.append(delimiter)
        chunks.append(content)

    chunks.append(b'--' + boundary_bytes + b'--\r\n')

    # Join once at the end rather than growing a bytes object in the loop
    return b''.join(chunks), content_type


def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a tuple (encoded_bytes, content_type). Raises ValueError if a
    caller-specified boundary occurs inside the data.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    has_specified_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))

        try:
            out, content_type = _multipart_encode_impl(data, boundary)
            break
        except ValueError:
            if has_specified_boundary:
                raise
            # A generated boundary collided with the data: draw a new one
            boundary = None

    return out, content_type
2909
2910
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Return the first usable value in ``d`` for the given key or keys.

    ``None`` values are always skipped; other falsy values are skipped
    unless ``skip_false_values`` is False. Falls back to ``default``.
    """
    for key in variadic(key_or_keys):
        val = d.get(key)
        if val is None:
            continue
        if val or not skip_false_values:
            return val
    return default
cbecc9b9
S
2916
2917
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each function in turn and return the first acceptable result.

    A call that raises AttributeError, KeyError, TypeError, IndexError or
    ZeroDivisionError is skipped, as is a result that is not an instance of
    ``expected_type`` (when given). Returns None if nothing qualifies.
    """
    skipped_errors = (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError)
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except skipped_errors:
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
    return None
2927
2928
def try_get(src, getter, expected_type=None):
    """Apply one or more getters to ``src`` through ``try_call`` and return the first accepted result."""
    getters = variadic(getter)
    return try_call(*getters, args=(src,), expected_type=expected_type)
329ca3be
S
2931
2932
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a new dict with the items of ``dct`` for which ``cndn(key, value)`` is truthy.

    By default, items whose value is None are dropped.
    """
    kept = {}
    for key, value in dct.items():
        if cndn(key, value):
            kept[key] = value
    return kept
2935
2936
6cc62232
S
def merge_dicts(*dicts):
    """Merge dicts, giving priority to earlier ones.

    A value is taken when it is not None and its key is not yet present. A
    later string value also replaces an already stored empty string.
    """
    merged = {}
    for a_dict in dicts:
        for key, value in a_dict.items():
            if value is None:
                continue
            if key not in merged:
                merged[key] = value
            elif isinstance(value, str) and merged[key] == '':
                merged[key] = value
    return merged
2945
2946
8e60dc75
S
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return ``string`` as text, decoding it with ``encoding``/``errors`` if it is not already a string."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2949
16392824 2950
a1a530b0
PH
# Age limit associated with each US rating label
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# Age limit associated with each "TV-*" label
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}


def parse_age_limit(s):
    """Convert an age rating into a numeric age limit.

    Accepts an int in the range 0..21, a string such as "18" or "18+", a
    key of US_RATINGS, or a TV rating ("TV-MA", "TV_MA", "TVMA"); letter
    case is ignored for the string ratings. Returns None for anything else.
    """
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None

    age_match = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if age_match:
        return int(age_match.group('age'))

    rating = s.upper()
    if rating in US_RATINGS:
        return US_RATINGS[rating]

    # The alternatives are the known labels with their "TV-" prefix removed
    tv_labels = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_match = re.match(r'^TV[_-]?(%s)$' % tv_labels, rating)
    if tv_match:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_match.group(1)]
    return None
146c80e2
S
2986
2987
def strip_jsonp(code):
    """Remove a JSONP wrapper such as ``callback(...);`` and return the wrapped payload.

    Input that does not match the wrapper pattern is returned unchanged.
    """
    jsonp_re = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_re, r'\g<callback_data>', code)
478c2c61
PH
2996
2997
def js_to_json(code, vars={}):
    """Rewrite a JavaScript object/array literal as JSON text.

    Handled by the substitutions below: single-quoted strings, bare
    identifiers and numeric keys (quoted), hex/octal integers (converted to
    decimal), comments, trailing commas and "!" runs (removed),
    ``undefined``/``void 0`` (turned into ``null``) and ``new Date("...")``
    (reduced to its string argument).

    @param vars  A dict of var, val pairs to substitute for bare identifiers
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )
    # Rewrites applied to escapes and double quotes inside string literals
    STRING_FIXES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null'):
            return token
        if token in ('undefined', 'void 0'):
            return 'null'
        if token.startswith(('/*', '//', '!')) or token == ',':
            return ""

        if token[0] in ("'", '"'):
            token = re.sub(
                r'(?s)\\.|"',
                lambda esc: STRING_FIXES.get(esc.group(0), esc.group(0)),
                token[1:-1])
        else:
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, token)
                if im:
                    number = int(im.group(1), base)
                    return '"%d":' % number if token.endswith(':') else '%d' % number

            if token in vars:
                return vars[token]

        return '"%s"' % token

    code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)

    token_re = r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE)
    return re.sub(token_re, fix_kv, code)
e05f6939
PH
3046
3047
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # The value is the position in quality_ids; unknown ids give -1
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
3056
acd69589 3057
POSTPROCESS_WHEN = (
    'pre_process',
    'after_filter',
    'before_dl',
    'after_move',
    'post_process',
    'after_video',
    'playlist',
)


# Output templates keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known template types, each mapped to a string or None
# NOTE(review): the strings look like file-name suffixes — confirm against callers
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}

# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# The template takes two str.format() arguments: {0} is the regex for the
# mapping key and {1} the regex for the conversion type
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# The conversion type characters of %-formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3097
7d1eb38a 3098
a020a0dc
PH
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    ELLIPSES = '...'
    if s is None or len(s) <= length:
        return s
    # Keep the total at `length` characters, ellipsis included
    keep = length - len(ELLIPSES)
    return s[:keep] + ELLIPSES
48844745
PH
3107
3108
def version_tuple(v):
    """Split a version string on "." and "-" and return its parts as a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
48844745
PH
3111
3112
def is_outdated_version(version, limit, assume_new=True):
    """Return whether ``version`` is older than ``limit``.

    When ``version`` is empty or either version cannot be parsed, the answer
    is ``not assume_new``.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return fallback
    return current < minimum
732ea2f0
PH
3120
3121
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """
    # Imported inside the function rather than at module level
    from .update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
7d4111ed
PH
3128
3129
def args_to_str(args):
    """Get a short string representation for a subprocess command."""
    quoted = map(compat_shlex_quote, args)
    return ' '.join(quoted)
2ccd1b10
PH
3133
3134
def error_to_compat_str(err):
    """Return ``str(err)``."""
    message = str(err)
    return message
fdae2358
S
3137
3138
def error_to_str(err):
    """Format ``err`` as "<type name>: <message>"."""
    type_name = type(err).__name__
    return '{}: {}'.format(type_name, err)
3141
3142
def mimetype2ext(mt):
    """Guess a file extension from a MIME type.

    Parameters after ";" are ignored and matching is case-insensitive. The
    lookup order is: the full type, the subtype, the structured-syntax
    suffix after "+", and finally the subtype itself with "+" replaced by ".".

    @param mt  MIME type string, or None
    @returns   The extension without a leading dot, or None if mt is None
    """
    if mt is None:
        return None

    # Type and subtype names are case-insensitive, so normalise once and
    # use the lowercased value for every lookup below
    mt = mt.partition(';')[0].strip().lower()

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }

    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }

    _, _, subtype = mt.rpartition('/')
    ext = SUBTYPE_MAP.get(subtype)
    if ext is not None:
        return ext

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }

    _, _, suffix = subtype.partition('+')
    ext = SUFFIX_MAP.get(suffix)
    if ext is not None:
        return ext

    return subtype.replace('+', '.')
c460bdd5
PH
3205
3206
2814f12b
THD
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a bare extension or for a file name/URL.

    Returns None for empty input or when the type cannot be guessed.
    """
    if not ext_or_url:
        return None
    # A value without a dot is treated as a bare extension
    name = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime_type, _encoding = mimetypes.guess_type(name)
    return mime_type
3213
3214
def parse_codecs(codecs_str):
    """Split a comma-separated codecs string into video/audio/subtitle codecs.

    Returns a dict with 'vcodec', 'acodec' and 'dynamic_range' (plus 'scodec'
    when a subtitle codec is found); missing video/audio codecs are 'none'.
    If no codec is recognised but exactly two are given, they are taken to be
    video and audio in that order. Otherwise an empty dict is returned.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = [
        name for name in map(str.strip, codecs_str.strip().strip(',').split(','))
        if name]
    vcodec = acodec = scodec = hdr = None
    for full_codec in split_codecs:
        parts = full_codec.split('.')
        # Zeros are dropped so that e.g. "vp09" is recognised as "vp9"
        codec = parts[0].replace('0', '')
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                     'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            if vcodec:
                continue
            if codec in ('vp9', 'av1', 'hvc1'):
                vcodec = '.'.join(parts[:4])
            else:
                vcodec = full_codec
            if codec in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
                hdr = 'HDR10'
            elif full_codec.replace('0', '').startswith('vp9.2'):
                hdr = 'HDR10'
        elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        elif codec in ('stpp', 'wvtt',):
            if not scodec:
                scodec = full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')

    if vcodec or acodec or scodec:
        result = {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
        }
        if scodec is not None:
            result['scodec'] = scodec
        return result
    if len(split_codecs) == 2:
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3256
3257
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the headers of a URL handle.

    The filename of an ``attachment`` Content-Disposition header is tried
    first; otherwise the Content-Type header is mapped with mimetype2ext.
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
05900629
PH
3270
3271
1e399778
YCH
def encode_data_uri(data, mime_type):
    """Return a base64 ``data:`` URI for the bytes ``data`` with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return f'data:{mime_type};base64,{payload}'
3274
3275
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
3284
3285
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # Checked in this order; the 4-byte UTF-32 marks come before the 2-byte
    # UTF-16 marks that are their prefixes
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]

    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        # Strip repeated byte order marks; the last one found sets the encoding
        while first_bytes.startswith(bom):
            first_bytes = first_bytes[len(bom):]
            encoding = bom_encoding

    text = first_bytes.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
a055469f
PH
3303
3304
def determine_protocol(info_dict):
    """Return the download protocol for ``info_dict``.

    An explicit 'protocol' entry wins. Otherwise the protocol is derived from
    the sanitized URL: an rtmp/mms/rtsp prefix, an m3u8/f4m extension, or
    finally the URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
3325
3326
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned """
    def width(string):
        # Visible width: terminal sequences and the alignment tab do not count
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Columns beyond the end of filterArray are kept (fillvalue=True)
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # With hide_empty, columns whose data cells all have zero width are dropped
    column_filter = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, column_filter)
    data = [filter_using_list(row, column_filter) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        delim_row = [delim * (ml + extra_gap) for ml in max_lens]
        delim_row[-1] = delim_row[-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
        table = [header_row, delim_row] + data

    lines = []
    for row in table:
        cells = []
        for pos, text in enumerate(map(str, row)):
            padding = max_lens[pos] - width(text)
            if '\t' in text:
                # The padding goes where the tab was, right-aligning what follows it
                cells.append(text.replace('\t', ' ' * padding) + ' ' * extra_gap)
            else:
                cells.append(text + ' ' * (padding + extra_gap))
        lines.append(''.join(cells).rstrip())
    return '\n'.join(lines)
347de493
PH
3357
3358
def _match_one(filter_part, dct, incomplete):
    """Evaluate a single filter condition against ``dct``.

    A condition is either a comparison ("key OP value", optionally negated
    with "!" before the operator and made None-tolerant with "?" after it)
    or a unary presence test ("key" / "!key").

    @param incomplete  A bool, or a collection of keys that may be missing
    @raises ValueError if the condition cannot be parsed, or if a
                       string-only operator is used on a numeric field
    """
    # TODO: Generalize code with YoutubeDL._build_format_filter
    STRING_OPERATORS = {
        '*=': operator.contains,
        '^=': lambda attr, value: attr.startswith(value),
        '$=': lambda attr, value: attr.endswith(value),
        '~=': lambda attr, value: re.search(value, attr),
    }
    COMPARISON_OPERATORS = {
        **STRING_OPERATORS,
        '<=': operator.le,  # "<=" must be defined above "<"
        '<': operator.lt,
        '>=': operator.ge,
        '>': operator.gt,
        '=': operator.eq,
    }

    if isinstance(incomplete, bool):
        is_incomplete = lambda _: incomplete
    else:
        is_incomplete = lambda k: k in incomplete

    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
            (?P<strval>.+?)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        m = m.groupdict()
        unnegated_op = COMPARISON_OPERATORS[m['op']]
        if m['negation']:
            op = lambda attr, value: not unnegated_op(attr, value)
        else:
            op = unnegated_op
        # Exactly one alternative matched, and both require at least one
        # character, so one of the two groups is always non-empty
        comparison_value = m['quotedstrval'] or m['strval']
        if m['quote']:
            comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
        actual_value = dct.get(m['key'])
        numeric_comparison = None
        if isinstance(actual_value, (int, float)):
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/ytdl-org/youtube-dl/issues/11082)
            try:
                numeric_comparison = int(comparison_value)
            except ValueError:
                numeric_comparison = parse_filesize(comparison_value)
                if numeric_comparison is None:
                    numeric_comparison = parse_filesize(f'{comparison_value}B')
                if numeric_comparison is None:
                    numeric_comparison = parse_duration(comparison_value)
        if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
            raise ValueError('Operator %s only supports string values!' % m['op'])
        if actual_value is None:
            return is_incomplete(m['key']) or m['none_inclusive']
        return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)

    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if is_incomplete(m.group('key')) and actual_value is None:
            return True
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)


def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns           Whether the filter passes
    @param incomplete  Set of keys that is expected to be missing from dct.
                       Can be True/False to indicate all/none of the keys may be missing.
                       All conditions on incomplete keys pass if the key is missing
    """
    # Conditions are separated by "&"; a literal "&" is written as "\&"
    return all(
        _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
        for filter_part in re.split(r'(?<!\\)&', filter_str))
347de493
PH
3451
3452
def match_filter_func(filters):
    """Build a match-filter callback from one or more filter strings.

    Returns None when no filters are given. The callback returns None (or
    NO_DEFAULT when the special filter "-" was given and the info is not
    incomplete) if any filter passes, and a skip message otherwise.
    """
    if not filters:
        return None
    filters = set(variadic(filters))

    # "-" is not a real filter: it is removed from the set and remembered as a flag
    interactive = '-' in filters
    filters.discard('-')

    def _match_func(info_dict, incomplete=False):
        passed = not filters or any(match_str(f, info_dict, incomplete) for f in filters)
        if passed:
            if interactive and not incomplete:
                return NO_DEFAULT
            return None
        video_title = info_dict.get('title') or info_dict.get('id') or 'video'
        filter_str = ') | ('.join(map(str.strip, filters))
        return f'{video_title} does not pass filter ({filter_str}), skipping ..'
    return _match_func
91410c9b
PH
3470
3471
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float).

    Accepts an offset such as "12.5" or "12.5s", or a clock value
    "H:MM:SS" whose fraction may follow "." or ":". Returns None for empty
    or unrecognised input.
    """
    if not time_expr:
        return None

    offset = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
    return None
bf6427d2
YCH
3483
3484
def srt_subtitles_timecode(seconds):
    """Format a time in seconds as an SRT timecode, "HH:MM:SS,mmm"."""
    timetuple = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % timetuple
3487
3488
def ass_subtitles_timecode(seconds):
    """Format a time in seconds as an ASS timecode, "H:MM:SS.cc"."""
    timetuple = timetuple_from_msec(seconds * 1000)
    # The last field is output in hundredths of a second (milliseconds / 10)
    centiseconds = timetuple.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*timetuple[:-1], centiseconds)
bf6427d2
YCH
3492
3493
3494def dfxp2srt(dfxp_data):
3869028f
YCH
3495 '''
3496 @param dfxp_data A bytes-like object containing DFXP data
3497 @returns A unicode object containing converted SRT data
3498 '''
5b995f71 3499 LEGACY_NAMESPACES = (
3869028f
YCH
3500 (b'http://www.w3.org/ns/ttml', [
3501 b'http://www.w3.org/2004/11/ttaf1',
3502 b'http://www.w3.org/2006/04/ttaf1',
3503 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 3504 ]),
3869028f
YCH
3505 (b'http://www.w3.org/ns/ttml#styling', [
3506 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
3507 ]),
3508 )
3509
3510 SUPPORTED_STYLING = [
3511 'color',
3512 'fontFamily',
3513 'fontSize',
3514 'fontStyle',
3515 'fontWeight',
3516 'textDecoration'
3517 ]
3518
4e335771 3519 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 3520 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 3521 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 3522 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 3523 })
bf6427d2 3524
5b995f71
RA
3525 styles = {}
3526 default_style = {}
3527
86e5f3ed 3528 class TTMLPElementParser:
5b995f71
RA
3529 _out = ''
3530 _unclosed_elements = []
3531 _applied_styles = []
bf6427d2 3532
2b14cb56 3533 def start(self, tag, attrib):
5b995f71
RA
3534 if tag in (_x('ttml:br'), 'br'):
3535 self._out += '\n'
3536 else:
3537 unclosed_elements = []
3538 style = {}
3539 element_style_id = attrib.get('style')
3540 if default_style:
3541 style.update(default_style)
3542 if element_style_id:
3543 style.update(styles.get(element_style_id, {}))
3544 for prop in SUPPORTED_STYLING:
3545 prop_val = attrib.get(_x('tts:' + prop))
3546 if prop_val:
3547 style[prop] = prop_val
3548 if style:
3549 font = ''
3550 for k, v in sorted(style.items()):
3551 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3552 continue
3553 if k == 'color':
3554 font += ' color="%s"' % v
3555 elif k == 'fontSize':
3556 font += ' size="%s"' % v
3557 elif k == 'fontFamily':
3558 font += ' face="%s"' % v
3559 elif k == 'fontWeight' and v == 'bold':
3560 self._out += '<b>'
3561 unclosed_elements.append('b')
3562 elif k == 'fontStyle' and v == 'italic':
3563 self._out += '<i>'
3564 unclosed_elements.append('i')
3565 elif k == 'textDecoration' and v == 'underline':
3566 self._out += '<u>'
3567 unclosed_elements.append('u')
3568 if font:
3569 self._out += '<font' + font + '>'
3570 unclosed_elements.append('font')
3571 applied_style = {}
3572 if self._applied_styles:
3573 applied_style.update(self._applied_styles[-1])
3574 applied_style.update(style)
3575 self._applied_styles.append(applied_style)
3576 self._unclosed_elements.append(unclosed_elements)
bf6427d2 3577
2b14cb56 3578 def end(self, tag):
5b995f71
RA
3579 if tag not in (_x('ttml:br'), 'br'):
3580 unclosed_elements = self._unclosed_elements.pop()
3581 for element in reversed(unclosed_elements):
3582 self._out += '</%s>' % element
3583 if unclosed_elements and self._applied_styles:
3584 self._applied_styles.pop()
bf6427d2 3585
2b14cb56 3586 def data(self, data):
5b995f71 3587 self._out += data
2b14cb56 3588
3589 def close(self):
5b995f71 3590 return self._out.strip()
2b14cb56 3591
3592 def parse_node(node):
3593 target = TTMLPElementParser()
3594 parser = xml.etree.ElementTree.XMLParser(target=target)
3595 parser.feed(xml.etree.ElementTree.tostring(node))
3596 return parser.close()
bf6427d2 3597
5b995f71
RA
3598 for k, v in LEGACY_NAMESPACES:
3599 for ns in v:
3600 dfxp_data = dfxp_data.replace(ns, k)
3601
3869028f 3602 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 3603 out = []
5b995f71 3604 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
3605
3606 if not paras:
3607 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 3608
5b995f71
RA
3609 repeat = False
3610 while True:
3611 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
3612 style_id = style.get('id') or style.get(_x('xml:id'))
3613 if not style_id:
3614 continue
5b995f71
RA
3615 parent_style_id = style.get('style')
3616 if parent_style_id:
3617 if parent_style_id not in styles:
3618 repeat = True
3619 continue
3620 styles[style_id] = styles[parent_style_id].copy()
3621 for prop in SUPPORTED_STYLING:
3622 prop_val = style.get(_x('tts:' + prop))
3623 if prop_val:
3624 styles.setdefault(style_id, {})[prop] = prop_val
3625 if repeat:
3626 repeat = False
3627 else:
3628 break
3629
3630 for p in ('body', 'div'):
3631 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3632 if ele is None:
3633 continue
3634 style = styles.get(ele.get('style'))
3635 if not style:
3636 continue
3637 default_style.update(style)
3638
bf6427d2 3639 for para, index in zip(paras, itertools.count(1)):
d631d5f9 3640 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 3641 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
3642 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3643 if begin_time is None:
3644 continue
7dff0363 3645 if not end_time:
d631d5f9
YCH
3646 if not dur:
3647 continue
3648 end_time = begin_time + dur
bf6427d2
YCH
3649 out.append('%d\n%s --> %s\n%s\n\n' % (
3650 index,
c1c924ab
YCH
3651 srt_subtitles_timecode(begin_time),
3652 srt_subtitles_timecode(end_time),
bf6427d2
YCH
3653 parse_node(para)))
3654
3655 return ''.join(out)
3656
3657
def cli_option(params, command_option, param, separator=None):
    """Build the command-line arguments for a single valued option.

    Looks up `param` in `params`. A missing or None value yields an empty
    list. Without a separator the result is [command_option, str(value)];
    with one, a single joined argument is returned.
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
66e289ba
S
3663
3664
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command-line arguments for a boolean option.

    The value of `param` in `params` must be True, False or None; it is
    rendered as `true_value`/`false_value` and handed to cli_option, so a
    None (or missing) value produces no arguments.
    """
    flag = params.get(param)
    assert flag in (True, False, None)
    rendered = {True: true_value, False: false_value}
    return cli_option(rendered, command_option, flag, separator)
66e289ba
S
3669
3670
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
66e289ba
S
3673
3674
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Pick the argument list configured for the first matching key group.

    `argdict` maps lowercase keys to argument lists. `keys` is a list/tuple of
    key groups; the first group for which at least one key is present wins and
    the argument lists of all its present keys are concatenated. `default` is
    returned when nothing matches or `argdict` is None.

    A list/tuple `argdict` is the legacy form: it is returned unchanged when
    `use_compat` is true and treated as absent otherwise.
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        candidates = [argdict.get(key.lower()) for key in variadic(key_list)]
        found = [args for args in candidates if args is not None]
        if found:
            flattened = []
            for args in found:
                flattened.extend(args)
            return flattened
    return default
66e289ba 3693
6251555f 3694
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Build the key lookup order for `main_key`/`exe` and resolve it.

    The root key is `exe` alone when it equals `main_key`, otherwise
    'main_key+exe'. Each entry of `keys` is appended to the root key as a
    suffix. Only when the bare root key is among the candidates are the
    (main_key, exe) group and 'default' added as fallbacks; otherwise the
    legacy list form of `argdict` is disabled.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    root_key = f'{main_key}+{exe}' if main_key != exe else exe
    suffixes = keys or ['']
    keys = [f'{root_key}{suffix}' for suffix in suffixes]
    if root_key not in keys:
        use_compat = False
    else:
        if main_key != exe:
            keys.append((main_key, exe))
        keys.append('default')
    return cli_configuration_args(argdict, keys, default, use_compat)
3706
66e289ba 3707
class ISO639Utils:
    """Conversions between two-letter and three-letter language codes."""

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T.

        Only the first two characters of `code` are looked up; None is
        returned for unknown codes.
        """
        two_letter = code[:2]
        return cls._lang_map.get(two_letter)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1.

        Returns the first two-letter code (in table order) mapped to `code`,
        or None when there is none.
        """
        return next(
            (short_name for short_name, long_name in cls._lang_map.items() if long_name == code),
            None)
3911
3912
class ISO3166Utils:
    """Lookup of country names by two-letter country code."""

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Return the full name for a two-letter country code (any case), or None."""
        normalized = code.upper()
        return cls._country_map.get(normalized)
4174
4175
86e5f3ed 4176class GeoUtils:
773f291d
S
4177 # Major IPv4 address blocks per country
4178 _country_ip_map = {
53896ca5 4179 'AD': '46.172.224.0/19',
773f291d
S
4180 'AE': '94.200.0.0/13',
4181 'AF': '149.54.0.0/17',
4182 'AG': '209.59.64.0/18',
4183 'AI': '204.14.248.0/21',
4184 'AL': '46.99.0.0/16',
4185 'AM': '46.70.0.0/15',
4186 'AO': '105.168.0.0/13',
53896ca5
S
4187 'AP': '182.50.184.0/21',
4188 'AQ': '23.154.160.0/24',
773f291d
S
4189 'AR': '181.0.0.0/12',
4190 'AS': '202.70.112.0/20',
53896ca5 4191 'AT': '77.116.0.0/14',
773f291d
S
4192 'AU': '1.128.0.0/11',
4193 'AW': '181.41.0.0/18',
53896ca5
S
4194 'AX': '185.217.4.0/22',
4195 'AZ': '5.197.0.0/16',
773f291d
S
4196 'BA': '31.176.128.0/17',
4197 'BB': '65.48.128.0/17',
4198 'BD': '114.130.0.0/16',
4199 'BE': '57.0.0.0/8',
53896ca5 4200 'BF': '102.178.0.0/15',
773f291d
S
4201 'BG': '95.42.0.0/15',
4202 'BH': '37.131.0.0/17',
4203 'BI': '154.117.192.0/18',
4204 'BJ': '137.255.0.0/16',
53896ca5 4205 'BL': '185.212.72.0/23',
773f291d
S
4206 'BM': '196.12.64.0/18',
4207 'BN': '156.31.0.0/16',
4208 'BO': '161.56.0.0/16',
4209 'BQ': '161.0.80.0/20',
53896ca5 4210 'BR': '191.128.0.0/12',
773f291d
S
4211 'BS': '24.51.64.0/18',
4212 'BT': '119.2.96.0/19',
4213 'BW': '168.167.0.0/16',
4214 'BY': '178.120.0.0/13',
4215 'BZ': '179.42.192.0/18',
4216 'CA': '99.224.0.0/11',
4217 'CD': '41.243.0.0/16',
53896ca5
S
4218 'CF': '197.242.176.0/21',
4219 'CG': '160.113.0.0/16',
773f291d 4220 'CH': '85.0.0.0/13',
53896ca5 4221 'CI': '102.136.0.0/14',
773f291d
S
4222 'CK': '202.65.32.0/19',
4223 'CL': '152.172.0.0/14',
53896ca5 4224 'CM': '102.244.0.0/14',
773f291d
S
4225 'CN': '36.128.0.0/10',
4226 'CO': '181.240.0.0/12',
4227 'CR': '201.192.0.0/12',
4228 'CU': '152.206.0.0/15',
4229 'CV': '165.90.96.0/19',
4230 'CW': '190.88.128.0/17',
53896ca5 4231 'CY': '31.153.0.0/16',
773f291d
S
4232 'CZ': '88.100.0.0/14',
4233 'DE': '53.0.0.0/8',
4234 'DJ': '197.241.0.0/17',
4235 'DK': '87.48.0.0/12',
4236 'DM': '192.243.48.0/20',
4237 'DO': '152.166.0.0/15',
4238 'DZ': '41.96.0.0/12',
4239 'EC': '186.68.0.0/15',
4240 'EE': '90.190.0.0/15',
4241 'EG': '156.160.0.0/11',
4242 'ER': '196.200.96.0/20',
4243 'ES': '88.0.0.0/11',
4244 'ET': '196.188.0.0/14',
4245 'EU': '2.16.0.0/13',
4246 'FI': '91.152.0.0/13',
4247 'FJ': '144.120.0.0/16',
53896ca5 4248 'FK': '80.73.208.0/21',
773f291d
S
4249 'FM': '119.252.112.0/20',
4250 'FO': '88.85.32.0/19',
4251 'FR': '90.0.0.0/9',
4252 'GA': '41.158.0.0/15',
4253 'GB': '25.0.0.0/8',
4254 'GD': '74.122.88.0/21',
4255 'GE': '31.146.0.0/16',
4256 'GF': '161.22.64.0/18',
4257 'GG': '62.68.160.0/19',
53896ca5
S
4258 'GH': '154.160.0.0/12',
4259 'GI': '95.164.0.0/16',
773f291d
S
4260 'GL': '88.83.0.0/19',
4261 'GM': '160.182.0.0/15',
4262 'GN': '197.149.192.0/18',
4263 'GP': '104.250.0.0/19',
4264 'GQ': '105.235.224.0/20',
4265 'GR': '94.64.0.0/13',
4266 'GT': '168.234.0.0/16',
4267 'GU': '168.123.0.0/16',
4268 'GW': '197.214.80.0/20',
4269 'GY': '181.41.64.0/18',
4270 'HK': '113.252.0.0/14',
4271 'HN': '181.210.0.0/16',
4272 'HR': '93.136.0.0/13',
4273 'HT': '148.102.128.0/17',
4274 'HU': '84.0.0.0/14',
4275 'ID': '39.192.0.0/10',
4276 'IE': '87.32.0.0/12',
4277 'IL': '79.176.0.0/13',
4278 'IM': '5.62.80.0/20',
4279 'IN': '117.192.0.0/10',
4280 'IO': '203.83.48.0/21',
4281 'IQ': '37.236.0.0/14',
4282 'IR': '2.176.0.0/12',
4283 'IS': '82.221.0.0/16',
4284 'IT': '79.0.0.0/10',
4285 'JE': '87.244.64.0/18',
4286 'JM': '72.27.0.0/17',
4287 'JO': '176.29.0.0/16',
53896ca5 4288 'JP': '133.0.0.0/8',
773f291d
S
4289 'KE': '105.48.0.0/12',
4290 'KG': '158.181.128.0/17',
4291 'KH': '36.37.128.0/17',
4292 'KI': '103.25.140.0/22',
4293 'KM': '197.255.224.0/20',
53896ca5 4294 'KN': '198.167.192.0/19',
773f291d
S
4295 'KP': '175.45.176.0/22',
4296 'KR': '175.192.0.0/10',
4297 'KW': '37.36.0.0/14',
4298 'KY': '64.96.0.0/15',
4299 'KZ': '2.72.0.0/13',
4300 'LA': '115.84.64.0/18',
4301 'LB': '178.135.0.0/16',
53896ca5 4302 'LC': '24.92.144.0/20',
773f291d
S
4303 'LI': '82.117.0.0/19',
4304 'LK': '112.134.0.0/15',
53896ca5 4305 'LR': '102.183.0.0/16',
773f291d
S
4306 'LS': '129.232.0.0/17',
4307 'LT': '78.56.0.0/13',
4308 'LU': '188.42.0.0/16',
4309 'LV': '46.109.0.0/16',
4310 'LY': '41.252.0.0/14',
4311 'MA': '105.128.0.0/11',
4312 'MC': '88.209.64.0/18',
4313 'MD': '37.246.0.0/16',
4314 'ME': '178.175.0.0/17',
4315 'MF': '74.112.232.0/21',
4316 'MG': '154.126.0.0/17',
4317 'MH': '117.103.88.0/21',
4318 'MK': '77.28.0.0/15',
4319 'ML': '154.118.128.0/18',
4320 'MM': '37.111.0.0/17',
4321 'MN': '49.0.128.0/17',
4322 'MO': '60.246.0.0/16',
4323 'MP': '202.88.64.0/20',
4324 'MQ': '109.203.224.0/19',
4325 'MR': '41.188.64.0/18',
4326 'MS': '208.90.112.0/22',
4327 'MT': '46.11.0.0/16',
4328 'MU': '105.16.0.0/12',
4329 'MV': '27.114.128.0/18',
53896ca5 4330 'MW': '102.70.0.0/15',
773f291d
S
4331 'MX': '187.192.0.0/11',
4332 'MY': '175.136.0.0/13',
4333 'MZ': '197.218.0.0/15',
4334 'NA': '41.182.0.0/16',
4335 'NC': '101.101.0.0/18',
4336 'NE': '197.214.0.0/18',
4337 'NF': '203.17.240.0/22',
4338 'NG': '105.112.0.0/12',
4339 'NI': '186.76.0.0/15',
4340 'NL': '145.96.0.0/11',
4341 'NO': '84.208.0.0/13',
4342 'NP': '36.252.0.0/15',
4343 'NR': '203.98.224.0/19',
4344 'NU': '49.156.48.0/22',
4345 'NZ': '49.224.0.0/14',
4346 'OM': '5.36.0.0/15',
4347 'PA': '186.72.0.0/15',
4348 'PE': '186.160.0.0/14',
4349 'PF': '123.50.64.0/18',
4350 'PG': '124.240.192.0/19',
4351 'PH': '49.144.0.0/13',
4352 'PK': '39.32.0.0/11',
4353 'PL': '83.0.0.0/11',
4354 'PM': '70.36.0.0/20',
4355 'PR': '66.50.0.0/16',
4356 'PS': '188.161.0.0/16',
4357 'PT': '85.240.0.0/13',
4358 'PW': '202.124.224.0/20',
4359 'PY': '181.120.0.0/14',
4360 'QA': '37.210.0.0/15',
53896ca5 4361 'RE': '102.35.0.0/16',
773f291d 4362 'RO': '79.112.0.0/13',
53896ca5 4363 'RS': '93.86.0.0/15',
773f291d 4364 'RU': '5.136.0.0/13',
53896ca5 4365 'RW': '41.186.0.0/16',
773f291d
S
4366 'SA': '188.48.0.0/13',
4367 'SB': '202.1.160.0/19',
4368 'SC': '154.192.0.0/11',
53896ca5 4369 'SD': '102.120.0.0/13',
773f291d 4370 'SE': '78.64.0.0/12',
53896ca5 4371 'SG': '8.128.0.0/10',
773f291d
S
4372 'SI': '188.196.0.0/14',
4373 'SK': '78.98.0.0/15',
53896ca5 4374 'SL': '102.143.0.0/17',
773f291d
S
4375 'SM': '89.186.32.0/19',
4376 'SN': '41.82.0.0/15',
53896ca5 4377 'SO': '154.115.192.0/18',
773f291d
S
4378 'SR': '186.179.128.0/17',
4379 'SS': '105.235.208.0/21',
4380 'ST': '197.159.160.0/19',
4381 'SV': '168.243.0.0/16',
4382 'SX': '190.102.0.0/20',
4383 'SY': '5.0.0.0/16',
4384 'SZ': '41.84.224.0/19',
4385 'TC': '65.255.48.0/20',
4386 'TD': '154.68.128.0/19',
4387 'TG': '196.168.0.0/14',
4388 'TH': '171.96.0.0/13',
4389 'TJ': '85.9.128.0/18',
4390 'TK': '27.96.24.0/21',
4391 'TL': '180.189.160.0/20',
4392 'TM': '95.85.96.0/19',
4393 'TN': '197.0.0.0/11',
4394 'TO': '175.176.144.0/21',
4395 'TR': '78.160.0.0/11',
4396 'TT': '186.44.0.0/15',
4397 'TV': '202.2.96.0/19',
4398 'TW': '120.96.0.0/11',
4399 'TZ': '156.156.0.0/14',
53896ca5
S
4400 'UA': '37.52.0.0/14',
4401 'UG': '102.80.0.0/13',
4402 'US': '6.0.0.0/8',
773f291d 4403 'UY': '167.56.0.0/13',
53896ca5 4404 'UZ': '84.54.64.0/18',
773f291d 4405 'VA': '212.77.0.0/19',
53896ca5 4406 'VC': '207.191.240.0/21',
773f291d 4407 'VE': '186.88.0.0/13',
53896ca5 4408 'VG': '66.81.192.0/20',
773f291d
S
4409 'VI': '146.226.0.0/16',
4410 'VN': '14.160.0.0/11',
4411 'VU': '202.80.32.0/20',
4412 'WF': '117.20.32.0/21',
4413 'WS': '202.4.32.0/19',
4414 'YE': '134.35.0.0/16',
4415 'YT': '41.242.116.0/22',
4416 'ZA': '41.0.0.0/11',
53896ca5
S
4417 'ZM': '102.144.0.0/13',
4418 'ZW': '102.177.192.0/18',
773f291d
S
4419 }
4420
4421 @classmethod
5f95927a
S
4422 def random_ipv4(cls, code_or_block):
4423 if len(code_or_block) == 2:
4424 block = cls._country_ip_map.get(code_or_block.upper())
4425 if not block:
4426 return None
4427 else:
4428 block = code_or_block
773f291d
S
4429 addr, preflen = block.split('/')
4430 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4431 addr_max = addr_min | (0xffffffff >> int(preflen))
18a0defa 4432 return compat_str(socket.inet_ntoa(
4248dad9 4433 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
4434
4435
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets each request choose its own proxy.

    A request may carry a 'Ytdl-request-proxy' header naming the proxy to use
    for that request only; the special value '__noproxy__' disables proxying.
    """

    def __init__(self, proxies=None):
        # Set default handlers before the base class initializer runs, so that
        # http/https requests always pass through proxy_open
        for scheme in ('http', 'https'):
            setattr(
                self, f'{scheme}_open',
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open: meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request header overrides the handler-wide proxy and is removed
        # from the request once it has been read
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy

        scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            # yt-dlp's http/https handlers wrap the socket with socks themselves;
            # only record which proxy they should use
            req.add_header('Ytdl-socks-proxy', proxy)
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(self, req, proxy, type)
5bc880b9
YCH
4459
4460
0a5445dd
YCH
4461# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4462# released into Public Domain
4463# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4464
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a big-endian byte string.

    The result is the shortest representation without leading zero bytes;
    zero (and any non-positive value) becomes a single zero byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        packed = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        packed = b'\000'
    if blocksize > 0:
        remainder = len(packed) % blocksize
        if remainder:
            packed = b'\000' * (blocksize - remainder) + packed
    return packed
4493
4494
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes(). An empty string
    converts to 0.
    """
    return int.from_bytes(s, 'big')
4510
4511
5bc880b9
YCH
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The payload is the input read as a little-endian integer, hence the
    # byte reversal before the hex conversion
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    return format(pow(payload, exponent, modulus), 'x')
81bdc8fd
YCH
4527
4528
f48409c7
YCH
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result is [0, 2] + padding + [0] + data, where the padding consists
    of random non-zero octets: the first zero after the leading [0, 2] marks
    the end of the padding, so a zero inside it would truncate the message
    when it is unpadded.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data leaves room for fewer than 8 padding octets
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # Padding octets must be in 1..255; zero is reserved as the separator
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4542
4543
def encode_base_n(num, n, table=None):
    """Convert the integer num to its base-n string representation.

    @param num    integer to convert; negative values get a '-' prefix
    @param n      base, at least 2 and at most the table length
    @param table  digit characters to use; defaults to the first n characters
                  of 0-9, a-z, A-Z
    @raises ValueError  if n is smaller than 2 or larger than the table
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))
    if n < 2:
        # Base 1 never terminates for non-zero input; base 0 divides by zero
        raise ValueError('base %d is too small, at least 2 is required' % n)

    if num == 0:
        return table[0]
    if num < 0:
        # Floor division of a negative number never reaches zero, so convert
        # the magnitude and put the sign in front
        return '-' + encode_base_n(-num, n, table)

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    return ''.join(reversed(digits))
f52354a8
YCH
4560
4561
def decode_packed_codes(code):
    """Expand code that matches PACKED_CODES_RE back into readable source.

    The match supplies the obfuscated text, a base, a symbol count and a
    '|'-separated symbol list. Every word in the obfuscated text is the
    base-n encoding of an index into that list; an empty symbol means the
    word stands for itself.
    """
    match = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = match.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    for index in reversed(range(count)):
        encoded = encode_base_n(index, base)
        symbol_table[encoded] = symbols[index] or encoded

    return re.sub(
        r'\b(\w+)\b', lambda word: symbol_table[word.group(0)],
        obfuscated_code)
e154c651 4578
4579
1ced2221
S
def caesar(s, alphabet, shift):
    """Shift every character of s that occurs in alphabet by shift positions.

    The shift wraps around the end of the alphabet; characters outside the
    alphabet are copied unchanged.
    """
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(char):
        if char not in alphabet:
            return char
        return alphabet[(alphabet.index(char) + shift) % size]

    return ''.join(map(rotate, s))
4587
4588
def rot47(s):
    """Apply a caesar shift of 47 over the alphabet of ASCII characters '!' to '~'."""
    alphabet = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, alphabet, 47)
4591
4592
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated KEY=value attribute list into a dict.

    Values may be quoted, in which case they can contain commas and the
    surrounding quotes are removed. A repeated key keeps its last value.
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return {
        name: raw[1:-1] if raw.startswith('"') else raw
        for name, raw in pairs
    }
1143535d
YCH
4600
4601
def urshift(val, n):
    """Shift val right by n bits, treating a negative val as unsigned 32-bit."""
    if val < 0:
        val += 0x100000000
    return val >> n
d3f8e038
YCH
4604
4605
4606# Based on png2str() written by @gdkchan and improved by @yokrysty
067aa17e 4607# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
d3f8e038
YCH
def decode_png(png_data):
    """Decode PNG image data into rows of raw channel values.

    Returns a tuple (width, height, pixels). pixels holds one list per image
    row, each containing width * 3 integers in the range 0-255.

    Only width and height are read from the IHDR chunk and every scanline is
    taken to be 3 bytes per pixel; other bit depths or colour types are not
    detected. Chunk CRCs are skipped, not verified.

    Raises OSError if the signature or the IHDR chunk is missing, or if the
    file contains no IDAT data.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    # The 8-byte signature must be followed directly by the IHDR chunk
    # (4 length bytes, then the chunk type)
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise OSError('Not a valid PNG file.')

    # Big-endian unsigned integer of 1, 2 or 4 bytes
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Split the remaining data into chunks: length, type, data, CRC
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # The first chunk was checked above to be IHDR
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''

    # The compressed image stream is the concatenation of all IDAT chunks
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise OSError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # Bytes per scanline, not counting the leading filter-type byte
    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        # idx is a flat index into the already reconstructed values
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Each scanline is stored as one filter-type byte followed by stride bytes
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        # Appended before it is filled so that _get_pixel can read values of
        # the row currently being reconstructed
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # Neighbours are the same channel of the pixel to the left (3 bytes
            # back) and of the pixel above; 0 where there is no such pixel
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            # Undo the scanline filter; filter type 0 leaves the value as is
            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                # c is the same channel of the upper-left pixel
                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Use whichever neighbour is closest to p, preferring a, then b
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
efa97bdc
YCH
4711
4712
def write_xattr(path, key, value):
    """Set the extended attribute key of the file at path to value.

    value is written as binary data; when an external tool has to be used it
    is decoded to text first.

    Three methods are tried in order: an NTFS alternate data stream when
    compat_os_name is 'nt', the xattr/pyxattr Python module, and finally the
    setfattr or xattr executable.

    Raises XAttrMetadataError if writing fails, and XAttrUnavailableError if
    neither a usable module nor an executable is found.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        # The stream is addressed as "path:key", so the key itself must not
        # contain a colon
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattr/pyxattr modules
    from .dependencies import xattr

    setxattr = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The value is passed as a command-line argument, so it has to be text
    value = value.decode()
    try:
        p = Popen(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    stderr = p.communicate_or_kill()[1].decode('utf-8', 'replace')
    # A non-zero exit status is reported together with the tool's stderr output
    if p.returncode:
        raise XAttrMetadataError(p.returncode, stderr)
0c265486
YCH
4764
4765
def random_birthday(year_field, month_field, day_field):
    """Pick a random date between 1950-01-01 and 1995-12-31 (both inclusive).

    Returns a dict mapping the three given field names to the year, month
    and day of that date, each as a string.
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    day_offset = random.randint(0, (latest - earliest).days)
    birthday = earliest + datetime.timedelta(day_offset)
    field_names = (year_field, month_field, day_field)
    components = (birthday.year, birthday.month, birthday.day)
    return {field: str(component) for field, component in zip(field_names, components)}
732044af 4776
c76eb41b 4777
# Templates for internet shortcut files, which are plain text files.
# Each template is a %-style format string with named placeholders:
# "%(url)s" in all of them, plus "%(filename)s" in the ".desktop" one.

# ".url" shortcut: an "[InternetShortcut]" section holding the URL
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# ".webloc" shortcut: an Apple property list (XML) with a single "URL" key
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# ".desktop" shortcut: a "[Desktop Entry]" of Type=Link
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps a link type name to its template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4809
732044af 4810
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    Raises ValueError for IPv6 hosts, which are not supported.
    """

    parts = compat_urllib_parse_urlparse(iri)

    # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
    if '[' in parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')

    # The `safe` sets below contain the characters that should not be percent-encoded.
    # Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding.
    # Everything already percent-encoded will be left as is ('%' is always safe).
    # Source for the `safe` sets: https://url.spec.whatwg.org/#percent-encoded-bytes.
    userinfo_safe = r"!$%&'()*+,~"
    path_safe = r"!$%&'()*+,/:;=@|~"
    # Unsure about this one, since `params` is a legacy way of handling parameters.
    params_safe = r"!$%&'()*+,/:;=@|~"
    # Not totally sure about this one, since the source does not explicitly mention the query URI component.
    query_safe = r"!$%&'()*+,/:;=?@{|}~"
    fragment_safe = r"!#$%&'()*+,/:;=?@{|}~"

    authority = []
    if parts.username:
        authority.append(urllib.parse.quote(parts.username, safe=userinfo_safe))
        if parts.password is not None:
            authority.extend((':', urllib.parse.quote(parts.password, safe=userinfo_safe)))
        authority.append('@')

    # Punycode for Unicode hostnames. The 'idna' encoding produces ASCII text.
    authority.append(parts.hostname.encode('idna').decode())

    # An explicit port 80 is dropped from the result
    port = parts.port
    if port is not None and port != 80:
        authority.extend((':', str(port)))

    return urllib.parse.urlunparse((
        parts.scheme,
        ''.join(authority),
        urllib.parse.quote_plus(parts.path, safe=path_safe),
        urllib.parse.quote_plus(parts.params, safe=params_safe),
        urllib.parse.quote_plus(parts.query, safe=query_safe),
        urllib.parse.quote_plus(parts.fragment, safe=fragment_safe),
    ))
4853
4854
def to_high_limit_path(path):
    """Return `path` in a form that is not subject to MAX_PATH on Windows.

    On win32/cygwin the absolute path is prefixed with `\\\\?\\`;
    on every other platform `path` is returned unchanged.
    """
    if sys.platform not in ('win32', 'cygwin'):
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    absolute = os.path.abspath(path)
    return '\\\\?\\' + absolute
76d321f6 4861
c76eb41b 4862
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    """Look up `field` in `obj` with traverse_obj and format it with `template`.

    @param field     Key or path of keys passed on to traverse_obj
    @param template  %-style template that the value is inserted into
    @param ignore    Values for which `default` is returned instead
    @param default   Returned when the value is in `ignore`
    @param func      Optional callable applied to the value before formatting
    """
    value = traverse_obj(obj, *variadic(field))
    if value in ignore:
        return default
    if func:
        value = func(value)
    return template % value
00dd0cd5 4868
4869
def clean_podcast_url(url):
    """Remove known tracking/analytics redirect prefixes from a podcast media URL."""
    tracking_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    # Every occurrence is removed, so chained prefixes are all stripped
    return re.sub(tracking_prefix_re, '', url)
ffcb8191
THD
4885
4886
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random version 4 UUID as a lowercase, hyphenated string.

    In the pattern below, every `x` becomes a random hex digit. The `y`
    carries the UUID variant and is therefore limited to 8, 9, a or b,
    as RFC 4122 requires for version 4 UUIDs.
    """
    def _random_digit(mobj):
        if mobj.group() == 'y':
            # Indices 8..11 of _HEX_TABLE are '8', '9', 'a' and 'b'
            return _HEX_TABLE[random.randint(8, 11)]
        return _HEX_TABLE[random.randint(0, 15)]

    return re.sub(r'[xy]', _random_digit, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 4892
4893
def make_dir(path, to_screen=None):
    """Create the parent directory of `path` if it does not exist yet.

    @param path       Path of a file whose directory should exist
    @param to_screen  Optional callable that receives an error message
                      when the directory could not be created
    @returns          True on success (or if there is nothing to create),
                      False if creating the directory failed
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except OSError as err:
        # Report only when a reporter was actually given; the default (None) must not be called
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
f74980cb 4904
4905
def get_executable_path():
    """Return the absolute form of the path reported by the updater's get_variant_and_executable_path()."""
    from .update import get_variant_and_executable_path

    # The path is the second item of the returned pair
    executable_path = get_variant_and_executable_path()[1]
    return os.path.abspath(executable_path)
f74980cb 4910
4911
def load_plugins(name, suffix, namespace):
    """Load plugin classes from "ytdlp_plugins/<name>/__init__.py" next to the executable.

    Every attribute of the plugin module whose name ends with `suffix` and
    is not already present in `namespace` is added to `namespace`.

    @returns  A dict of the newly added names and their objects.
              It is empty when the plugin file does not exist.
    """
    classes = {}
    with contextlib.suppress(FileNotFoundError):
        init_file = os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py')
        spec = importlib.util.spec_from_file_location(name, init_file)
        module = importlib.util.module_from_spec(spec)
        # Register the module before executing it
        sys.modules[spec.name] = module
        spec.loader.exec_module(module)
        for attr in dir(module):
            if attr in namespace or not attr.endswith(suffix):
                continue
            classes[attr] = namespace[attr] = getattr(module, attr)
    return classes
06167fbb 4928
4929
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a:
                              - None:     Do nothing
                              - string:   A dictionary key
                              - int:      An index into a list
                              - tuple:    A list of keys all of which will be traversed
                              - Ellipsis: Fetch all values in the object
                              - Function: Takes the key and value as arguments
                                          and returns whether the key matches or not
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    @returns                The value from the first path that gives a usable
                            (non-None, non-empty) result, else `default`
    # TODO: Write tests
    '''
    if not casesense:
        # Lower-case all string keys of the paths; dict keys are lower-cased on lookup
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        # `depth` records how many levels of list nesting the branching keys
        # (tuple, Ellipsis, function) introduced, so the result can be flattened
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                # Traverse each alternative, then treat the results like `...`
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    # Produce (index, character) pairs, like for any other sequence,
                    # so that the (key, value) unpacking below works
                    obj = enumerate(str(obj))
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    # User input is a string: either an index or a "start:stop:step" slice
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        # A bare ":" selects everything, same as `...`
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    # Normalize `expected_type` into a function returning the value or None
    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    elif expected_type is not None:
        type_test = expected_type
    else:
        type_test = lambda val: val

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # Flatten all but one level of nesting, dropping None entries
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
324ad820 5028
5029
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated wrapper around traverse_obj that treats `keys` as user input and traverses strings."""
    deprecation_notice = (
        'DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
        'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    write_string(deprecation_notice)
    return traverse_obj(
        dictn, keys,
        casesense=casesense, is_user_input=True, traverse_string=True)
6606817a 5034
5035
def get_first(obj, keys, **kwargs):
    """Return the first usable value found at `keys` among all the items of `obj`.

    Extra keyword arguments are passed on to traverse_obj.
    """
    # The leading Ellipsis makes traverse_obj branch over every item of `obj`
    path = (..., *variadic(keys))
    return traverse_obj(obj, path, **kwargs, get_all=False)
5038
5039
def variadic(x, allowed_types=(str, bytes, dict)):
    """Return `x` itself if it is an iterable, else wrap it in a 1-tuple.

    Iterables that are instances of `allowed_types` count as
    single values and get wrapped as well.
    """
    if not isinstance(x, collections.abc.Iterable):
        return (x,)
    if isinstance(x, allowed_types):
        return (x,)
    return x
bd50a52b
THD
5042
5043
3e9b66d7
LNO
def decode_base(value, digits):
    """Convert the base-x string `value` into an int.

    `digits` lists the symbols of the numeral system in ascending order,
    so the base is len(digits). A symbol not in `digits` raises KeyError.
    """
    radix = len(digits)
    position_of = {symbol: position for position, symbol in enumerate(digits)}
    number = 0
    for symbol in value:
        number = number * radix + position_of[symbol]
    return number
5053
5054
def time_seconds(**kwargs):
    """Return the POSIX timestamp of the current time.

    The keyword arguments are those of datetime.timedelta and give the UTC
    offset of the timezone in which the current time is obtained.
    """
    utc_offset = datetime.timedelta(**kwargs)
    now = datetime.datetime.now(datetime.timezone(utc_offset))
    return now.timestamp()
5058
5059
49fa4d9a
N
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    """Build a token of the form b'<header>.<payload>.<signature>'.

    @param payload_data  JSON-serializable payload
    @param key           Secret (str) for the HMAC-SHA256 signature
    @param headers       Extra header fields; they override 'alg' and 'typ'
    @returns             The token as bytes. All three segments are encoded
                         with standard base64 (base64.b64encode)
    """
    header_data = {'alg': 'HS256', 'typ': 'JWT'}
    if headers:
        header_data.update(headers)

    def _encode_segment(data):
        return base64.b64encode(json.dumps(data).encode())

    signing_input = b'.'.join((_encode_segment(header_data), _encode_segment(payload_data)))
    mac = hmac.new(key.encode(), signing_input, hashlib.sha256)
    return b'.'.join((signing_input, base64.b64encode(mac.digest())))
819e0531 5077
5078
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the decoded JSON payload of a JWT in compact serialization.

    Neither the header nor the signature is inspected or verified.

    @param jwt  Token (str) of the form '<header>.<payload>.<signature>'
    @raises ValueError  if the token does not consist of exactly three segments
    """
    header_b64, payload_b64, signature_b64 = jwt.split('.')
    # JWT segments are base64url *without* padding, but the decoder requires
    # the length to be a multiple of 4, so restore the stripped "="
    padding = '=' * (-len(payload_b64) % 4)
    payload_data = json.loads(base64.urlsafe_b64decode(payload_b64 + padding))
    return payload_data
5084
5085
# False on Windows until windows_enable_vt_mode() succeeds; None on other systems
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None


@functools.cache
def supports_terminal_sequences(stream):
    """Whether terminal escape sequences may be written to `stream`.

    The result is cached per stream. On Windows, VT mode must have been
    enabled and the Windows version be at least 10.0.10586; elsewhere the
    TERM environment variable must be set. In addition, the stream has to
    be a tty.
    """
    if compat_os_name != 'nt':
        if not os.getenv('TERM'):
            return False
    elif not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
        return False

    try:
        is_tty = stream.isatty()
    except BaseException:
        return False
    return is_tty
5100
5101
def windows_enable_vt_mode():  # TODO: Do this the proper way https://bugs.python.org/issue30075
    """Try to turn on VT mode on Windows; does nothing on other systems.

    On success, WINDOWS_VT_MODE is set to True and the cache of
    supports_terminal_sequences is cleared so that it gets re-evaluated.
    """
    global WINDOWS_VT_MODE
    if compat_os_name != 'nt':
        return

    startupinfo = subprocess.STARTUPINFO()
    startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    try:
        # Run an empty shell command and wait for it to finish
        subprocess.Popen('', shell=True, startupinfo=startupinfo).wait()
    except Exception:
        return
    else:
        WINDOWS_VT_MODE = True
        supports_terminal_sequences.cache_clear()
5115
5116
# ESC "[" followed by one or more non-"m" characters and a closing "m"
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with everything matching _terminal_sequences_re removed."""
    return re.sub(_terminal_sequences_re, '', string)
5122
5123
def number_of_digits(number):
    """Return the length of `number` formatted with '%d' (a minus sign is counted too)."""
    as_integer_text = '%d' % number
    return len(as_integer_text)
34921b43 5126
5127
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the str() of all truthy `values` with `delim`.

    If `from_dict` is given, `values` are keys that are first looked up in it.
    """
    if from_dict is not None:
        values = (from_dict.get(key) for key in values)
    return delim.join(str(value) for value in values if value)
06e57990 5132
5133
27231526
ZM
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand

    If no format has a width, `thumbnails` is returned as is.
    """
    _keys = ('width', 'height')
    # Tuples compare element-wise, so the width decides and the height breaks ties
    all_dimensions = [tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats]
    max_dimensions = max(all_dimensions, default=(0, 0))
    max_width = max_dimensions[0]
    if not max_width:
        return thumbnails

    scaled = []
    for thumbnail in thumbnails:
        scaled_url = re.sub(url_width_re, str(max_width), thumbnail['url'])
        scaled.append(merge_dicts(
            {'url': scaled_url},
            dict(zip(_keys, max_dimensions)), thumbnail))
    return scaled
5154
5155
93c8410d
LNO
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple.

    @returns  (start, end, total). `end` and `total` are None when absent;
              all three are None when the header is empty or not a bytes range.
    """
    match = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range) if range else None
    if not match:
        return None, None, None
    start, end, total = match.groups()
    return int(start), int_or_none(end), int_or_none(total)
5164
5165
class Config:
    """A set of command line arguments together with the configs it references.

    Parsing the arguments may reveal further config locations; each of those
    is read and stored as a nested Config in `configs`.
    """
    own_args = None
    parsed_args = None
    filename = None
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser = parser
        self.label = label
        # Real paths of the config files loaded so far; shared with nested configs
        self._loaded_paths = set()
        self.configs = []

    def init(self, args=None, filename=None):
        """Set the arguments of this config and load the config locations they name.

        @returns  False if `filename` was already loaded before, else True
        """
        assert not self.__initialized
        base_dir = ''
        if filename:
            real_path = os.path.realpath(filename)
            base_dir = os.path.dirname(real_path)
            if real_path in self._loaded_paths:
                return False
            self._loaded_paths.add(real_path)

        self.own_args, self.__initialized = args, True
        opts, _ = self.parser.parse_known_args(args)
        self.parsed_args, self.filename = args, filename

        for location in opts.config_locations or []:
            # Locations are taken relative to the directory of this config's file
            location = os.path.join(base_dir, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '', delim=' ')
        own_line = (
            self.own_args is not None
            and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}')
        # Each nested config is shown below, with its lines prefixed by "| "
        nested_lines = (f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs)
        return join_nonempty(own_line, *nested_lines, delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Return the shell-like split contents of `filename`, or `default` if it cannot be opened."""
        try:
            optionf = open(filename)
        except OSError:
            return default  # silently skip if file is not present
        with optionf:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            return shlex.split(optionf.read(), comments=True)

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of `opts` in which the values of login related options are replaced by 'PRIVATE'."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(option):
            # Handles the "--option=value" form
            mobj = eqre.match(option)
            return mobj.group('key') + '=PRIVATE' if mobj else option

        opts = [_scrub_eq(option) for option in opts]
        # Handles the "--option value" form
        for idx in range(len(opts) - 1):
            if opts[idx] in PRIVATE_OPTS:
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a nested config from `args` (as for init) and keep it, unless its file was already loaded."""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """All arguments: those of the nested configs (last one first), then this config's own."""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
da42679b
LNO
5257
5258
class WebSocketsWrapper():
    """Wraps websockets module to use in non-async scopes

    The connection runs on a private event loop; send() and recv() block
    until the corresponding coroutine has completed on that loop.
    Can be used as a context manager. __exit__ is also registered with
    atexit, so it may get called a second time at interpreter shutdown.
    """
    pool = None

    def __init__(self, url, headers=None, connect=True):
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        if self.loop.is_closed():
            # Already exited (e.g. "with" block followed by the atexit hook); nothing left to do
            return None
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # Pending tasks can only be cancelled while the loop is still open,
            # so cancel first and close the loop afterwards
            try:
                self._cancel_all_tasks(self.loop)
            finally:
                self.loop.close()

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """Run the coroutine `main` on `loop` until it completes and return its result."""
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel every unfinished task of `loop` and wait for them to finish."""
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # The "loop" argument of gather() is removed in python 3.10+.
        # It is not needed: all the tasks already belong to `loop`
        loop.run_until_complete(
            asyncio.gather(*to_cancel, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5328
5329
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for name, value in dict.items(headers):
            # Title-casing the names makes different spellings of a header collide
            merged[name.title()] = value
    return merged
28787f16 5333
5334
class classproperty:
    """classmethod(property(func)) that works in py < 3.9"""

    def __init__(self, func):
        # Take over the name, docstring etc. of the wrapped function
        functools.update_wrapper(self, func)
        self.func = func

    def __get__(self, instance, owner):
        # The wrapped function always gets the class, never the instance
        getter = self.func
        return getter(owner)
19a03940 5344
5345
class Namespace:
    """Immutable namespace

    The keyword arguments given on creation are readable as attributes.
    Iterating yields (name, value) pairs, while `in` tests against the values.
    """

    def __init__(self, **kwargs):
        self._dict = kwargs

    def __getattr__(self, attr):
        # Only called when normal attribute lookup fails. `_dict` is read through
        # __dict__ so that a not yet initialized instance (e.g. while being copied)
        # does not end up calling __getattr__ recursively
        try:
            return self.__dict__['_dict'][attr]
        except KeyError:
            # AttributeError is what hasattr() and getattr() with a default rely on
            raise AttributeError(
                f'{type(self).__name__!r} object has no attribute {attr!r}') from None

    def __contains__(self, item):
        return item in self._dict.values()

    def __iter__(self):
        return iter(self._dict.items())

    def __repr__(self):
        return f'{type(self).__name__}({", ".join(f"{k}={v}" for k, v in self)})'
9b8ee23b 5363
5364
# Deprecated
# Truthiness of the `certifi` and `websockets` objects imported from `.dependencies`
has_certifi = bool(certifi)
has_websockets = bool(websockets)