]> jfr.im git - yt-dlp.git/blame - yt_dlp/utils.py
[docs] Improvements
[yt-dlp.git] / yt_dlp / utils.py
CommitLineData
cc52de43 1#!/usr/bin/env python3
15dfb392 2import atexit
1e399778 3import base64
5bc880b9 4import binascii
912b38b4 5import calendar
676eb3f2 6import codecs
c380cc28 7import collections
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96 10import datetime
0c265486 11import email.header
f8271158 12import email.utils
f45c185f 13import errno
d77c3dfd 14import gzip
49fa4d9a
N
15import hashlib
16import hmac
019a94f7 17import importlib.util
03f9daab 18import io
79a2e94e 19import itertools
f4bfd65f 20import json
d77c3dfd 21import locale
02dbf93f 22import math
f8271158 23import mimetypes
347de493 24import operator
d77c3dfd 25import os
c496ca96 26import platform
773f291d 27import random
d77c3dfd 28import re
f8271158 29import shlex
c496ca96 30import socket
79a2e94e 31import ssl
1c088fa8 32import subprocess
d77c3dfd 33import sys
181c8655 34import tempfile
c380cc28 35import time
01951dda 36import traceback
64fa820c 37import types
f8271158 38import urllib.parse
bcf89ce6 39import xml.etree.ElementTree
d77c3dfd 40import zlib
d77c3dfd 41
c487cf00 42from .compat import asyncio, functools # isort: split
8c25f81b
PH
43from .compat import (
44 compat_chr,
1bab3437 45 compat_cookiejar,
36e6f62c 46 compat_etree_fromstring,
51098426 47 compat_expanduser,
8c25f81b 48 compat_html_entities,
55b2f099 49 compat_html_entities_html5,
f8271158 50 compat_HTMLParseError,
51 compat_HTMLParser,
be4a824d 52 compat_http_client,
f8271158 53 compat_HTTPError,
efa97bdc 54 compat_os_name,
8c25f81b 55 compat_parse_qs,
702ccf2d 56 compat_shlex_quote,
8c25f81b 57 compat_str,
edaa23f8 58 compat_struct_pack,
d3f8e038 59 compat_struct_unpack,
8c25f81b 60 compat_urllib_error,
f8271158 61 compat_urllib_parse_unquote_plus,
15707c7e 62 compat_urllib_parse_urlencode,
8c25f81b
PH
63 compat_urllib_parse_urlparse,
64 compat_urllib_request,
65 compat_urlparse,
66)
9b8ee23b 67from .dependencies import brotli, certifi, websockets
f8271158 68from .socks import ProxyType, sockssocket
71aff188 69
4644ac55 70
51fb4995
YCH
def register_socks_protocols():
    """Teach urllib's URL splitter that SOCKS schemes carry a netloc."""
    # urlsplit() only parses the netloc for schemes listed in uses_netloc
    # (see https://bugs.python.org/issue7904), so register ours once.
    known = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in known:
            known.append(scheme)
78
79
468e2e92
FV
# Type of a compiled regular expression; not exposed under a clear public
# name by the stdlib, so capture it here for isinstance() checks
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))
82
f7a147e3
S
83
def random_user_agent():
    """Return a Chrome-on-Windows User-Agent with a randomly picked version."""
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    # Real Chrome release versions; rotating them makes requests look less
    # uniform than a single hard-coded UA string
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    version = random.choice(_CHROME_VERSIONS)
    return _USER_AGENT_TPL % version
127
128
# Accept-Encoding values we can actually decode; brotli ('br') is offered
# only when the optional brotli/brotlicffi dependency imported successfully
SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

# Default HTTP headers sent with every request
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}
f427df17 141
5f6a1245 142
fb37eb25
S
# Alternative User-Agent strings extractors can opt into by name
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Sentinel meaning "no default was supplied" (None can be a legitimate default)
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names per language code, used when parsing localized dates
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# File extensions recognized as media containers/streams
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')
175
# needed for sanitizing filenames in restricted mode
# Maps each accented character to its ASCII replacement; multi-character
# replacements (e.g. 'Æ' -> 'AE', 'ß' -> 'ss') are passed as single-element
# lists so itertools.chain keeps them intact
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
c587cbb7 180
46f59e89
S
# strptime() patterns tried, in order, when parsing free-form dates
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# Variants for locales that write the day before the month...
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# ...and for locales that write the month first (e.g. en-US)
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Matches the argument list of a P.A.C.K.E.R.-obfuscated JS blob
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Extracts embedded JSON-LD metadata blocks from a webpage
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'

# Integer or decimal number
NUMBER_RE = r'\d+(?:\.\d+)?'
249
7105440c 250
@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() plus a sanity check, falling back
    to UTF-8 when the locale's answer is unusable.
    """
    encoding = 'UTF-8'
    with contextlib.suppress(Exception):
        candidate = locale.getpreferredencoding()
        # Verify the reported codec actually works before trusting it
        'TEST'.encode(candidate)
        encoding = candidate
    return encoding
d77c3dfd 265
f4bfd65f 266
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    # Write to a sibling temp file first so the final rename is atomic on
    # POSIX; delete=False because we rename (not close-delete) it ourselves
    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            # NamedTemporaryFile creates files with mode 0600; widen to the
            # usual umask-filtered 0666 so the result looks like open() output
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temp file, then propagate the error
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
291
292
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    # Only plain attribute names are supported (no quoting/escaping logic)
    assert re.match(r'^[a-zA-Z_-]+$', key)
    predicate = '[@%s]' % key if val is None else f"[@{key}='{val}']"
    return node.find(xpath + predicate)
59ae56fa 298
d7e66d39
JMF
299# On python2.6 the xml.etree.ElementTree.Element methods don't support
300# the namespace parameter
5f6a1245
JW
301
302
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into '{uri}tag' form using ns_map."""
    replaced = []
    for component in path.split('/'):
        parts = component.split(':')
        if len(parts) == 1:
            # No namespace prefix on this step
            replaced.append(parts[0])
        else:
            ns, tag = parts
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
313
d77c3dfd 314
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching xpath.

    @param xpath    A single XPath expression, or an iterable of expressions
                    tried in order until one matches
    @param name     Human-readable name used in the error message
    @param fatal    Raise ExtractorError on a miss instead of returning None
    @param default  Value to return on a miss (takes precedence over fatal)
    """
    def _find_xpath(xp):
        return node.find(xp)

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        # Fix: previously `n` was left unbound (NameError) when the iterable
        # of xpaths was empty; treat that case as "not found"
        n = None
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
336
337
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's text content."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        # Miss already handled (or default returned) by xpath_element
        return n
    if n.text is not None:
        return n.text
    # Element exists but has no text: apply default/fatal handling again
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
a41fb80c
S
351
352
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return the value of attribute `key` on the element matching xpath."""
    n = find_xpath_attr(node, xpath, key)
    if n is not None:
        return n.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = f'{xpath}[@{key}]' if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
bf0ff932
PH
364
365
def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document."""
    # An id lookup is simply an attribute lookup on 'id'
    return get_element_by_attribute('id', id, html, **kwargs)
43e8fafd 369
12ea2f30 370
def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document."""
    # Delegates to the generic attribute-based lookup with attribute 'id'
    return get_element_html_by_attribute('id', id, html, **kwargs)
6f32a0b5
ZM
374
375
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document."""
    matches = get_elements_by_class(class_name, html)
    return matches[0] if matches else None
380
381
6f32a0b5
ZM
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document."""
    matches = get_elements_html_by_class(class_name, html)
    return matches[0] if matches else None
386
387
def get_element_by_attribute(attribute, value, html, **kwargs):
    """Return the content of the first tag with the given attribute=value."""
    matches = get_elements_by_attribute(attribute, value, html, **kwargs)
    return matches[0] if matches else None
391
392
def get_element_html_by_attribute(attribute, value, html, **kargs):
    """Return the html of the first tag with the given attribute=value."""
    matches = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return matches[0] if matches else None
396
397
def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list."""
    # Match the class as a whole word anywhere inside the (possibly
    # multi-valued) class attribute value
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
403
404
6f32a0b5
ZM
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list."""
    # Same whole-word class matching as get_elements_by_class
    class_value_re = r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name)
    return get_elements_html_by_attribute('class', class_value_re, html, escape_value=False)
410
411
def get_elements_by_attribute(*args, **kwargs):
    """Return the text content of every tag matching the given attribute."""
    return [text for text, _whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
415
416
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the full HTML of every tag matching the given attribute."""
    return [whole_el for _text, whole_el in get_elements_text_and_html_by_attribute(*args, **kwargs)]
420
421
def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document (as a generator of 2-tuples)
    """

    # Values containing whitespace/quote/special chars must be quoted in the
    # document, so the quote character is then mandatory, not optional
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Verbose-mode regex matching the opening tag up to the target attribute;
    # (?-x:...) re-enables literal whitespace inside the interpolated value
    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        # Re-parse from the match start to find the element's true extent
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Strip one level of surrounding quotes from the content, if any
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
a921f407 445
c5229f39 446
6f32a0b5
ZM
class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        # Control-flow exception: signals "outermost tag just closed"
        pass

    def __init__(self):
        # Stack of currently-open tag names, in document order
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Pop until we find the matching opener (tolerates unclosed inner tags)
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            # The first opened tag has now been closed — stop parsing
            raise self.HTMLBreakOnClosingTagException()
487
488
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        # str.index that raises the supplied parse error instead of ValueError
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed just the opening tag so the parser's stack starts with it
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        # Feed up to each candidate closing tag until the parser signals that
        # the outermost tag closed (handles nested same-name tags correctly)
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
522
523
8bb56eee
BF
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.attrs = {}

    def handle_starttag(self, tag, attrs):
        # Only the last start tag seen is kept — callers feed one element
        self.attrs = dict(attrs)
533
c5229f39 534
73673ccf
FF
class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        # Record only top-level <li> elements of the fed fragment
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
550
551
8bb56eee
BF
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    # A malformed element should not abort extraction — return whatever
    # attributes were gathered before the parse error
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs
9e6dd238 571
c5229f39 572
73673ccf
FF
def parse_list(webpage):
    """Parse a snippet of HTML <li> elements and return a list containing
    each item's attribute dict."""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items
580
581
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Collapse whitespace, turn <br> and paragraph breaks into newlines,
    # then strip the remaining tags
    for pattern, replacement in (
        (r'\s+', ' '),
        (r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n'),
        (r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n'),
        ('<.*?>', ''),
    ):
        html = re.sub(pattern, replacement, html)
    # Replace html entities and trim surrounding whitespace
    return unescapeHTML(html).strip()
9e6dd238
FV
596
597
class LenientJSONDecoder(json.JSONDecoder):
    """JSONDecoder that can pre-transform its input and tolerate trailing data."""

    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        # Optional callable applied to the raw string before decoding
        self.transform_source = transform_source
        # When set, ignore any garbage after the first JSON value
        self.ignore_extra = ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        if not self.ignore_extra:
            return super().decode(s)
        obj, _end = self.raw_decode(s.lstrip())
        return obj
609
610
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            # Put stdout into binary mode so written media data isn't mangled
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    # First attempt uses the name as given; second retries with sanitize_path()
    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                # Locking failed/unsupported — fall back to a plain open()
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            # EACCES won't be fixed by renaming; give up immediately
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                # Sanitizing changed nothing, retrying would loop forever
                raise
d77c3dfd
FV
645
646
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Unparseable input mirrors the stdlib's None result
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 654
5f6a1245 655
def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        # NUL ('\0') marks substitute characters so later passes can
        # distinguish them from characters present in the original string
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = '(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
        # Drop the markers, keeping the substitute characters themselves
        result = result.replace('\0', '') or '_'

    if not is_id:
        # Collapse runs of underscores introduced by the substitutions
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
d77c3dfd 703
5f6a1245 704
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows.

    @param force  Also sanitize on non-Windows platforms (ignored on win32,
                  where sanitization always happens)
    """
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        # Nothing to do on POSIX unless explicitly forced
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # normpath leaves an empty first component after the drive was removed
        norm_path.pop(0)
    sanitized_path = [
        # Replace characters invalid in Windows path components (and trailing
        # dots/spaces); keep '.'/'..' traversal components untouched
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        # Preserve absoluteness of forced POSIX paths
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
726
727
def sanitize_url(url):
    """Normalize a URL: default protocol-less URLs to http and repair a few
    protocol typos seen in the wild. None passes through unchanged."""
    if url is None:
        return
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        fixed, n_subs = re.subn(mistake, fixup, url)
        if n_subs:
            return fixed
    return url
17bcc626
S
746
747
5435dcf9
HH
def extract_basic_auth(url):
    """Strip inline user:password credentials from url.

    Returns (clean_url, basic_auth_header_value_or_None)."""
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    netloc = (parts.hostname if parts.port is None
              else '%s:%d' % (parts.hostname, parts.port))
    url = compat_urlparse.urlunsplit(parts._replace(netloc=netloc))
    credentials = ('%s:%s' % (parts.username, parts.password or '')).encode()
    auth_payload = base64.b64encode(credentials)
    return url, f'Basic {auth_payload.decode()}'
5435dcf9
HH
758
759
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request from a sanitized/escaped URL, moving any inline
    basic-auth credentials into an Authorization header."""
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is None:
        return compat_urllib_request.Request(url, *args, **kwargs)
    # Positional arg 2 is the headers dict, if the caller passed one
    headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
    headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)
67dda517
S
766
767
51098426
S
def expand_path(s):
    """Expand environment variables ($VAR / %VAR%) and the user home (~) in s."""
    home_expanded = compat_expanduser(s)
    return os.path.expandvars(home_expanded)
771
772
def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable, preserving order."""
    def _dedupe():
        seen = []  # a list, not a set: the items may be unhashable
        for item in iterable:
            if item in seen:
                continue
            seen.append(item)
            yield item

    gen = _dedupe()
    return gen if lazy else list(gen)
d77c3dfd 783
912b38b4 784
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric character reference: decimal (#123) or hexadecimal (#x7B)
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr  # '0x...' form accepted by int()
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        # (out-of-range codepoints make chr() raise ValueError)
        with contextlib.suppress(ValueError):
            return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
4e408e47
PH
812
813
def unescapeHTML(s):
    """Resolve HTML character entities in s; None passes through unchanged."""
    if s is None:
        return None
    assert isinstance(s, str)

    def _replace_entity(mobj):
        return _htmlentity_transform(mobj.group(1))

    return re.sub(r'&([^&;]+;)', _replace_entity, s)
d77c3dfd 821
8bf48f23 822
def escapeHTML(text):
    """Escape &, <, >, and both quote characters for safe HTML embedding."""
    # '&' must be replaced first so the entities below aren't double-escaped
    for char, entity in (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&#39;'),
    ):
        text = text.replace(char, entity)
    return text
832
833
def process_communicate_or_kill(p, *args, **kwargs):
    # Deprecated shim kept for backward compatibility; delegates to
    # Popen.communicate_or_kill after emitting a deprecation warning
    write_string('DeprecationWarning: yt_dlp.utils.process_communicate_or_kill is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.Popen.communicate_or_kill instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)
f5b1bca9 838
839
class Popen(subprocess.Popen):
    """subprocess.Popen wrapper that hides console windows on Windows and
    adds kill-on-error/timeout conveniences."""
    if sys.platform == 'win32':
        # Prevent a console window from flashing up for spawned processes
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, text=False, **kwargs):
        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        # Like communicate(), but ensures the child is killed if anything
        # (including Ctrl-C) interrupts the wait
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            # Reap the child so it doesn't linger as a zombie
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, **kwargs):
        # Convenience one-shot runner: returns (stdout, stderr, returncode)
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill()
            return stdout or '', stderr or '', proc.returncode
871
d3c93ec2 872
aa49acd1
S
def get_subprocess_encoding():
    """Return the text encoding to use when talking to subprocesses."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    return sys.getfilesystemencoding() or 'utf-8'
883
884
def encodeFilename(s, for_subprocess=False):
    """Legacy no-op: on Python 3 filenames are handled as str throughout."""
    assert isinstance(s, str)
    return s
aa49acd1
S
888
889
def decodeFilename(b, for_subprocess=False):
    """Legacy no-op counterpart of encodeFilename: returns its input as-is."""
    return b
8bf48f23 892
f07b74fc
PH
893
def encodeArgument(s):
    """Return s as str, decoding legacy byte-string arguments as ASCII."""
    if isinstance(s, str):
        return s
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return s.decode('ascii')
f07b74fc
PH
899
900
def decodeArgument(b):
    """Legacy no-op: arguments are already str on Python 3."""
    return b
aa49acd1
S
903
904
8271226a
PH
def decodeOption(optval):
    """Decode a CLI option value to str; None passes through unchanged."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())
    assert isinstance(optval, compat_str)
    return optval
1c256f70 913
5f6a1245 914
# Lightweight record for a duration split into clock components
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a millisecond count into (hours, minutes, seconds, milliseconds)."""
    seconds, milliseconds = divmod(msec, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return _timetuple(hours, minutes, seconds, milliseconds)
923
924
def formatSeconds(secs, delim=':', msec=False):
    """Render a duration as S, M:SS or H:MM:SS, optionally with .mmm appended."""
    t = timetuple_from_msec(secs * 1000)
    if t.hours:
        base = '%d%s%02d%s%02d' % (t.hours, delim, t.minutes, delim, t.seconds)
    elif t.minutes:
        base = '%d%s%02d' % (t.minutes, delim, t.seconds)
    else:
        base = '%d' % t.seconds
    if not msec:
        return base
    return '%s.%03d' % (base, t.milliseconds)
4539dd30 934
a0ddb8a2 935
def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        # Keep only X.509 certs trusted for server authentication
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        # Store not readable by this user — silently skip it
        return
    for cert in certs:
        # A single bad certificate must not prevent loading the rest
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)
a2366922 947
77562778 948
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler with an SSLContext configured from *params*.

    Honours 'nocheckcertificate', 'legacyserverconnect', 'compat_opts',
    and the client-certificate options. Raises YoutubeDLError if the
    client certificate cannot be loaded.
    """
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        # Prefer certifi's CA bundle when available, unless disabled via compat_opts
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 990
732ea2f0 991
def bug_reports_message(before=';'):
    """Return the standard "please report this" blurb.

    *before* is the text that will precede the message; when it is empty or
    ends a sentence, the blurb starts with a capital letter.
    """
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , '
           'filling out the appropriate issue template. '
           'Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before[-1] in '.!?':
        # Starting a new sentence: capitalise the first word
        msg = msg[0].title() + msg[1:]

    if not before:
        return msg
    return f'{before} {msg}'
08f2a92c
JMF
1002
1003
bf5b9d85
PM
1004class YoutubeDLError(Exception):
1005 """Base exception for YoutubeDL errors."""
aa9369a2 1006 msg = None
1007
1008 def __init__(self, msg=None):
1009 if msg is not None:
1010 self.msg = msg
1011 elif self.msg is None:
1012 self.msg = type(self).__name__
1013 super().__init__(self.msg)
bf5b9d85
PM
1014
1015
# Exception types that indicate a (possibly transient) network problem rather
# than a programming error; ExtractorError marks these as "expected".
# socket.error is an alias of OSError on Python 3.
network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
# Guarded for environments where ssl.CertificateError may be missing
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)
1020
1021
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        cause is the underlying exception, video_id the affected video and ie the extractor name;
        all three are optional and only affect the rendered message.
        """
        # If we are currently handling a known network exception, the failure
        # is environmental, not a bug — force expected=True
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        # Message layout: "[ie] video_id: msg (caused by ...)" plus the
        # bug-report blurb for unexpected errors
        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        # Combine our own traceback with the cause's (header line stripped);
        # returns None when neither is available
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None
01951dda 1052
1c256f70 1053
416c7fcb
PH
1054class UnsupportedError(ExtractorError):
1055 def __init__(self, url):
86e5f3ed 1056 super().__init__(
416c7fcb
PH
1057 'Unsupported URL: %s' % url, expected=True)
1058 self.url = url
1059
1060
55b3e45b
JMF
1061class RegexNotFoundError(ExtractorError):
1062 """Error when a regex didn't match"""
1063 pass
1064
1065
773f291d
S
1066class GeoRestrictedError(ExtractorError):
1067 """Geographic restriction Error exception.
1068
1069 This exception may be thrown when a video is not available from your
1070 geographic location due to geographic restrictions imposed by a website.
1071 """
b6e0c7d2 1072
0db3bae8 1073 def __init__(self, msg, countries=None, **kwargs):
1074 kwargs['expected'] = True
86e5f3ed 1075 super().__init__(msg, **kwargs)
773f291d
S
1076 self.countries = countries
1077
1078
class DownloadError(YoutubeDLError):
    """Raised by FileDownloader objects when downloading fails and the
    downloader is not configured to continue on errors; carries the
    appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """``exc_info``, if given, is the original ``sys.exc_info()`` triple that caused the trouble."""
        self.exc_info = exc_info
        super().__init__(msg)
d77c3dfd
FV
1091
1092
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict.
    """
    msg = 'Entry not found in info'
498f5606 1100
1101
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            # Fix: interpolate the actual filename — the previous f-string had
            # no placeholder and always appended the literal "(unknown)"
            self.msg += f': {filename}'
        super().__init__(self.msg)
d77c3dfd
FV
1114
1115
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
5f6a1245 1122
5f6a1245 1123
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Subclasses override ``msg`` with the specific cancellation reason
    msg = 'The download was cancelled'
8b0d7497 1127
8b0d7497 1128
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1132
48f79687 1133
class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
51d9739f 1137
1138
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1142
1143
class ReExtractInfo(YoutubeDLError):
    """Signals that the video info needs to be extracted again."""

    def __init__(self, msg, expected=False):
        self.expected = expected
        super().__init__(msg)
1150
1151
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # expected=False — presumably so throttling is surfaced as an
        # abnormal condition rather than a normal outcome; confirm at call sites
        super().__init__(self.msg, expected=False)
f2ebc5c7 1158
d77c3dfd 1159
class UnavailableVideoError(YoutubeDLError):
    """Raised when a video is requested in a format that is not available
    for that video.
    """

    msg = 'Unable to download video'

    def __init__(self, err=None):
        # Append the underlying error, if any, to the default message
        if err is not None:
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
d77c3dfd
FV
1172
1173
class ContentTooShortError(YoutubeDLError):
    """Raised by FileDownloader objects when a downloaded file is smaller
    than the server announced, indicating the connection was probably
    interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both counts are in bytes
        self.downloaded = downloaded
        self.expected = expected
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
d77c3dfd 1187
5f6a1245 1188
class XAttrMetadataError(YoutubeDLError):
    """Raised when writing extended file attributes fails.

    ``reason`` classifies the failure as 'NO_SPACE', 'VALUE_TOO_LONG'
    or 'NOT_SUPPORTED', derived from the errno and/or the message text.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code, self.msg = code, msg

        # Derive a coarse, machine-readable reason from errno / message text
        if code in (errno.ENOSPC, errno.EDQUOT) or 'No space left' in msg or 'Disk quota exceeded' in msg:
            self.reason = 'NO_SPACE'
        elif code == errno.E2BIG or 'Argument list too long' in msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1203
1204
class XAttrUnavailableError(YoutubeDLError):
    # Presumably raised when no xattr mechanism is usable on this system —
    # name-based inference; confirm at call sites
    pass
1207
1208
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate *http_class* and, when a source_address is configured,
    patch its connection logic so only addresses of the matching IP family
    are attempted.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # Pick the address family from the *source* address ('.' => IPv4)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    # Remember the last failure and try the next address
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
1254
1255
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker header.

    When the marker is present, a copy of *headers* is returned with both
    the marker and any Accept-Encoding header removed; otherwise the
    original mapping is returned unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    cleaned = {name: value for name, value in headers.items() if name.lower() != 'accept-encoding'}
    del cleaned['Youtubedl-no-compression']
    return cleaned
87f0e62d
YCH
1264
1265
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        # Route through a SOCKS connection class when the internal
        # 'Ytdl-socks-proxy' marker header is present
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        # Try raw deflate first, then zlib-wrapped deflate
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        # `brotli` comes from .dependencies and may be None if not installed —
        # only reached when 'br' was advertised, i.e. presumably when available
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        # Strip the internal no-compression marker (and Accept-Encoding with it)
        req.headers = handle_youtubedl_headers(req.headers)

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = compat_urllib_request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
bf50b038 1394
5de90176 1395
71aff188
YCH
1396def make_socks_conn_class(base_class, socks_proxy):
1397 assert issubclass(base_class, (
1398 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
1399
1400 url_components = compat_urlparse.urlparse(socks_proxy)
1401 if url_components.scheme.lower() == 'socks5':
1402 socks_type = ProxyType.SOCKS5
1403 elif url_components.scheme.lower() in ('socks', 'socks4'):
1404 socks_type = ProxyType.SOCKS4
51fb4995
YCH
1405 elif url_components.scheme.lower() == 'socks4a':
1406 socks_type = ProxyType.SOCKS4A
71aff188 1407
cdd94c2e
YCH
1408 def unquote_if_non_empty(s):
1409 if not s:
1410 return s
1411 return compat_urllib_parse_unquote_plus(s)
1412
71aff188
YCH
1413 proxy_args = (
1414 socks_type,
1415 url_components.hostname, url_components.port or 1080,
1416 True, # Remote DNS
cdd94c2e
YCH
1417 unquote_if_non_empty(url_components.username),
1418 unquote_if_non_empty(url_components.password),
71aff188
YCH
1419 )
1420
1421 class SocksConnection(base_class):
1422 def connect(self):
1423 self.sock = sockssocket()
1424 self.sock.setproxy(*proxy_args)
19a03940 1425 if isinstance(self.timeout, (int, float)):
71aff188
YCH
1426 self.sock.settimeout(self.timeout)
1427 self.sock.connect((self.host, self.port))
1428
1429 if isinstance(self, compat_http_client.HTTPSConnection):
1430 if hasattr(self, '_context'): # Python > 2.6
1431 self.sock = self._context.wrap_socket(
1432 self.sock, server_hostname=self.host)
1433 else:
1434 self.sock = ssl.wrap_socket(self.sock)
1435
1436 return SocksConnection
1437
1438
be4a824d
PH
1439class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1440 def __init__(self, params, https_conn_class=None, *args, **kwargs):
1441 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1442 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1443 self._params = params
1444
1445 def https_open(self, req):
4f264c02 1446 kwargs = {}
71aff188
YCH
1447 conn_class = self._https_conn_class
1448
4f264c02
JMF
1449 if hasattr(self, '_context'): # python > 2.6
1450 kwargs['context'] = self._context
1451 if hasattr(self, '_check_hostname'): # python 3.x
1452 kwargs['check_hostname'] = self._check_hostname
71aff188
YCH
1453
1454 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1455 if socks_proxy:
1456 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1457 del req.headers['Ytdl-socks-proxy']
1458
4f28b537 1459 try:
1460 return self.do_open(
1461 functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
1462 except urllib.error.URLError as e:
1463 if (isinstance(e.reason, ssl.SSLError)
1464 and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1465 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1466 raise
be4a824d
PH
1467
1468
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    # Prefix marking HttpOnly cookies in Netscape-format files
    _HTTPONLY_PREFIX = '#HttpOnly_'
    # Number of tab-separated fields per cookie line
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        # Accept path-like filenames in addition to str/bytes
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def _true_or_false(cndn):
        # Netscape format spells booleans as TRUE/FALSE
        return 'TRUE' if cndn else 'FALSE'

    @staticmethod
    def is_path(file):
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        # NOTE: shadows the builtin open() as a method name; the builtin is
        # still reachable inside because attribute vs. global lookup differ
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            # Already a file-like object; truncate before writing
            if write:
                file.truncate(0)
            yield file

    def _really_save(self, f, ignore_discard=False, ignore_expires=False):
        """Write the jar's cookies to *f* in Netscape format."""
        now = time.time()
        for cookie in self:
            if (not ignore_discard and cookie.discard
                    or not ignore_expires and cookie.is_expired(now)):
                continue
            name, value = cookie.name, cookie.value
            if value is None:
                # cookies.txt regards 'Set-Cookie: foo' as a cookie
                # with no name, whereas http.cookiejar regards it as a
                # cookie with no value.
                name, value = '', name
            f.write('%s\n' % '\t'.join((
                cookie.domain,
                self._true_or_false(cookie.domain.startswith('.')),
                cookie.path,
                self._true_or_false(cookie.secure),
                str_or_none(cookie.expires, default=''),
                name, value
            )))

    def save(self, filename=None, *args, **kwargs):
        """
        Save cookies to a file.
        Code is taken from CPython 3.6
        https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """

        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            self._really_save(f, *args, **kwargs)

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            # Validate a single cookies.txt line; raises LoadError when malformed
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    # A leading [, { or " strongly suggests a JSON cookies export
                    if f'{line.strip()} '[0] in '[{"':
                        raise compat_cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1601
1602
a6420bf5
S
1603class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1604 def __init__(self, cookiejar=None):
1605 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1606
1607 def http_response(self, request, response):
a6420bf5
S
1608 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1609
f5fa042c 1610 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
a6420bf5
S
1611 https_response = http_response
1612
1613
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # Body-describing headers must not be forwarded to the redirected request
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and m != 'HEAD':
            m = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if code in (301, 302) and m == 'POST':
            m = 'GET'

        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=m)
fca6dba8
S
1674
1675
46f59e89
S
1676def extract_timezone(date_str):
1677 m = re.search(
f137e4c2 1678 r'''(?x)
1679 ^.{8,}? # >=8 char non-TZ prefix, if present
1680 (?P<tz>Z| # just the UTC Z, or
1681 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1682 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1683 [ ]? # optional space
1684 (?P<sign>\+|-) # +/-
1685 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1686 $)
1687 ''', date_str)
46f59e89
S
1688 if not m:
1689 timezone = datetime.timedelta()
1690 else:
1691 date_str = date_str[:-len(m.group('tz'))]
1692 if not m.group('sign'):
1693 timezone = datetime.timedelta()
1694 else:
1695 sign = 1 if m.group('sign') == '+' else -1
1696 timezone = datetime.timedelta(
1697 hours=sign * int(m.group('hours')),
1698 minutes=sign * int(m.group('minutes')))
1699 return timezone, date_str
1700
1701
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """Return a UNIX timestamp for an ISO 8601-style date string, or None.

    *delimiter* separates the date and time parts; when *timezone* is None
    the offset is extracted from the string itself.
    """
    if date_str is None:
        return None

    # strptime's %S cannot parse fractional seconds — drop them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        dt = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S') - timezone
    except ValueError:
        return None
    return calendar.timegm(dt.timetuple())
1717
1718
46f59e89
S
def date_formats(day_first=True):
    """Pick the strptime format table matching the expected day/month ordering."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1721
1722
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD (or None on failure)."""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    # Note: no break — every format is tried, so the LAST matching
    # expression wins if several formats parse the string
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Fall back to RFC 2822-style parsing (e.g. 'Tue, 01 Jan 2021 ...')
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
bf50b038 1745
5f6a1245 1746
46f59e89
S
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp for a free-form date string, or None.

    day_first selects whether ambiguous numeric dates are tried as
    DD-first (True) or MM-first (False) formats.
    """
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    # A "PM" marker means 12 hours must be added after parsing
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
    # Fallback: RFC 2822 dates; parsedate_tz already accounts for the offset
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
46f59e89
S
1776
1777
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to *default_ext*."""
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    stripped = guess.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
73e79f2a 1789
5f6a1245 1790
824fa511
S
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Derive the subtitle filename: media extension -> '<lang>.<format>'."""
    return replace_extension(filename, f'{sub_lang}.{sub_format}', expected_real_ext)
d4051a8e 1793
5f6a1245 1794
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format strftime format of DATE
    @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
                     auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        # Recurse on the base part, then apply the signed offset to it
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            # timedelta cannot express months/years; handle them separately
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            # 'auto': round to the unit the caller actually used
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1835
1836
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict Restrict allowed patterns to "YYYYMMDD" and
                  (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
        raise ValueError(f'Invalid date format "{date_str}"')
    # Full microsecond precision, then truncate to the calendar date
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1847
1848
def datetime_add_months(dt, months):
    """Shift *dt* by *months* (may be negative), clamping the day to the target month."""
    total = dt.month - 1 + months
    year = dt.year + total // 12
    month = total % 12 + 1
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return dt.replace(year, month, day)
1856
1857
def datetime_round(dt, precision='day'):
    """Round *dt* (treated as UTC) to the nearest whole *precision* unit."""
    if precision == 'microsecond':
        return dt

    seconds_per = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }[precision]
    timestamp = calendar.timegm(dt.timetuple())
    rounded = ((timestamp + seconds_per / 2) // seconds_per) * seconds_per
    return datetime.datetime.utcfromtimestamp(rounded)
5f6a1245
JW
1874
1875
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return '-'.join(match.groups()) if match else date_str
1884
5f6a1245 1885
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        self.start = (date_from_str(start, strict=True) if start is not None
                      else datetime.datetime.min.date())
        self.end = (date_from_str(end, strict=True) if end is not None
                    else datetime.datetime.max.date())
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return f'{self.start.isoformat()} - {self.end.isoformat()}'
c496ca96
PH
1915
1916
def platform_name():
    """ Returns the platform name as a str """
    # platform.platform() always returns str on Python 3; the old
    # bytes-decoding branch was dead code from the Python 2 era
    return platform.platform()
c257baff
PH
1925
1926
@functools.cache
def get_windows_version():
    ''' Get Windows version. returns () if it's not running on Windows '''
    if compat_os_name != 'nt':
        return ()
    return version_tuple(platform.win32_ver()[1])
49fa4d9a
N
1934
1935
def write_string(s, out=None, encoding=None):
    """Write *s* to *out* (default stderr), encoding only when needed."""
    assert isinstance(s, str)
    out = out or sys.stderr

    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        # Keep ANSI sequences working across line breaks on Windows terminals
        s = re.sub(r'([\r\n]+)', r' \1', s)

    target, codec = out, None
    if 'b' in getattr(out, 'mode', ''):
        codec = encoding or preferredencoding()
    elif hasattr(out, 'buffer'):
        # Write bytes straight to the underlying buffer of text streams
        target = out.buffer
        codec = encoding or getattr(out, 'encoding', None) or preferredencoding()

    if codec:
        target.write(s.encode(codec, 'ignore'))
    else:
        target.write(s)
    out.flush()
1952
1953
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the byte values of *bs* as a list of ints (accepts str too)."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # bytes/bytearray on Python 3
        return list(bs)
    return [ord(c) for c in bs]
1961
c257baff 1962
def intlist_to_bytes(xs):
    """Pack an iterable of byte values (0-255) into a bytes object."""
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)
c38b1e77
PH
1967
1968
class LockingUnsupportedError(OSError):
    """Raised when the platform provides no usable file-locking primitive."""
    msg = 'File locking is not supported'

    def __init__(self):
        super().__init__(self.msg)
1974
1975
c1c9a79c
PH
# Cross-platform file locking: defines _lock_file/_unlock_file using
# LockFileEx/UnlockFileEx via ctypes on Windows, fcntl elsewhere, and a
# raising stub when neither is available.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Matches the Win32 OVERLAPPED struct layout expected by LockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the largest possible byte range of the file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        # The OVERLAPPED pointer must stay alive for the matching unlock,
        # so it is stashed on the file object
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 = exclusive lock, 0x1 = fail immediately (non-blocking)
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            # NB: No argument form of "ctypes.FormatError" does not work on PyPy
            raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # Non-blocking lock is held by someone else; propagate as-is
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
c1c9a79c
PH
2060
2061
class locked_file:
    """File wrapper that holds an OS-level lock for the file's lifetime.

    Usable as a context manager; the lock is shared for read-only modes and
    exclusive for any writable mode. Attribute access is delegated to the
    underlying file object.
    """
    locked = False  # True while the OS lock is held

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        # Open via os.open so the exact flags are controlled; note that
        # 'w' deliberately does NOT pass O_TRUNC here (see __enter__)
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncate only after the lock is held, so concurrent readers
            # never observe a half-truncated file
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno not in (
                    errno.ESPIPE,  # Illegal seek - expected for FIFO
                    errno.EINVAL,  # Invalid argument - expected for /dev/null
                ):
                    raise
        return self

    def unlock(self):
        # Idempotent: safe to call when the lock is not held
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    # Allow use without the `with` statement
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Delegate read/write/seek/... to the real file object
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
a3125791 2125
4eb7f1d1 2126
@functools.cache
def get_filesystem_encoding():
    """Filesystem encoding, defaulting to utf-8 when the platform reports none."""
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
2131
2132
def shell_quote(args):
    """Return *args* individually quoted and joined for shell display."""
    encoding = get_filesystem_encoding()
    quoted = []
    for arg in args:
        if isinstance(arg, bytes):
            # We may get a filename encoded with 'encodeFilename'
            arg = arg.decode(encoding)
        quoted.append(compat_shlex_quote(arg))
    return ' '.join(quoted)
9d4660ca
PH
2142
2143
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge with any data already smuggled into the URL
    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    # compat_urllib_parse_urlencode is just urllib.parse.urlencode on py3;
    # use the stdlib name directly (urllib.parse is imported at file top)
    sdata = urllib.parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata
9d4660ca
PH
2152
2153
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url: return (url, data), or (url, default) if nothing was smuggled."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    # compat_parse_qs is urllib.parse.parse_qs on py3; use the stdlib directly
    jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
02dbf93f
PH
2161
2162
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal sufixes like K, M, etc """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    # Clamp the exponent to [0, len]: values in (0, 1) would otherwise give a
    # negative exponent and select a wrong suffix via negative list indexing
    exponent = 0 if num == 0 else max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        # Binary prefixes: k -> Ki, M -> Mi, ...
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
e0fd9573 2175
2176
def format_bytes(bytes):
    """Human-readable byte count like '1.27GiB'; 'N/A' when unknown."""
    return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
f53c966a 2179
1c088fa8 2180
fb47597b
S
def lookup_unit_table(unit_table, s):
    """Parse a leading '<number> <unit>' from *s* using *unit_table*; None if no unit matches."""
    units_re = '|'.join(map(re.escape, unit_table))
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if m is None:
        return None
    # Accept ',' as a decimal separator too
    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
2190
2191
be64b5b0
PH
def parse_filesize(s):
    """Parse a human file size like '5.6 MiB' or '1,2GB' into bytes, or None."""
    if s is None:
        return None

    # Build the unit table programmatically. The lower-case forms are of
    # course incorrect and unofficial, but we support those too:
    #   XiB -> 1024**n,  XB -> 1000**n,  xB -> 1024**n,  Xb/xb -> 1000**n,
    #   decimal name -> 1000**n, binary name -> 1024**n
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    prefixes = [
        ('k', 'kilobytes', 'kibibytes'),
        ('m', 'megabytes', 'mebibytes'),
        ('g', 'gigabytes', 'gibibytes'),
        ('t', 'terabytes', 'tebibytes'),
        ('p', 'petabytes', 'pebibytes'),
        ('e', 'exabytes', 'exbibytes'),
        ('z', 'zettabytes', 'zebibytes'),
        ('y', 'yottabytes', 'yobibytes'),
    ]
    for exponent, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        binary, decimal = 1024 ** exponent, 1000 ** exponent
        _UNIT_TABLE[letter.upper() + 'iB'] = binary
        _UNIT_TABLE[letter.upper() + 'B'] = decimal
        _UNIT_TABLE[letter + 'B'] = binary
        _UNIT_TABLE[letter.upper() + 'b'] = decimal
        _UNIT_TABLE[letter + 'b'] = decimal
        _UNIT_TABLE[decimal_name] = decimal
        _UNIT_TABLE[binary_name] = binary

    return lookup_unit_table(_UNIT_TABLE, s)
2261
2262
def parse_count(s):
    """Parse a view/like count like '1.2M views' into an int, or None."""
    if s is None:
        return None

    # Strip a leading non-numeric label ("Views 123" -> "123")
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    ret = lookup_unit_table({
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }, s)
    if ret is not None:
        return ret

    # Fall back to a bare number followed by other text
    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    if mobj:
        return str_to_int(mobj.group(1))
be64b5b0 2290
2f7ae819 2291
def parse_resolution(s, *, lenient=False):
    """Extract width/height from strings like '1920x1080', '720p' or '4k'."""
    if s is None:
        return {}

    # lenient mode drops the requirement that the dimensions stand alone
    pattern = (r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)' if lenient
               else r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])')
    mobj = re.search(pattern, s)
    if mobj:
        return {
            'width': int(mobj.group('w')),
            'height': int(mobj.group('h')),
        }

    mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if mobj:
        return {'height': int(mobj.group(1))}

    mobj = re.search(r'\b([48])[kK]\b', s)
    if mobj:
        return {'height': int(mobj.group(1)) * 540}

    return {}
2315
2316
0dc41787
S
def parse_bitrate(s):
    """Return the bitrate in kbps from strings like '128 kbps', else None."""
    if not isinstance(s, compat_str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    return int(mobj.group(1)) if mobj else None
2323
2324
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    try:
        return month_names.index(name) + 1
    except ValueError:
        # Unknown month name
        return None
2334
2335
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [s[:3] for s in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
18258362
JMF
2344
2345
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML, leaving existing entities intact."""
    # The negative lookahead skips '&' that already starts a known entity
    # or a numeric/hex character reference
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;', xml_str)
e3946f98
PH
2352
2353
def setproctitle(title):
    """Best effort: rename the process via glibc prctl(PR_SET_NAME)."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    encoded = title.encode()
    buf = ctypes.create_string_buffer(len(encoded))
    buf.value = encoded
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
2378
2379
def remove_start(s, start):
    """Strip the prefix *start* from *s* when present; None passes through."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
29eb5174
PH
2382
2383
def remove_end(s, end):
    """Strip the suffix *end* from *s* when present; None passes through."""
    # Guard against empty *end*: s[:-len('')] is s[:0], which wiped the
    # whole string in the previous implementation
    if s is not None and end and s.endswith(end):
        return s[:-len(end)]
    return s
2b9faf55
PH
2386
2387
31b2051e
S
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
2395
2396
b6e0c7d2
U
def get_domain(url):
    """Return the domain part ('sub.example.com') of *url*, or None."""
    m = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    return m.group('domain') if m else None
2400
2401
def url_basename(url):
    """Return the last path component of *url* (query/fragment excluded)."""
    # compat_urlparse is urllib.parse on py3; use the stdlib module directly
    # (urllib.parse is imported at the top of this file)
    path = urllib.parse.urlparse(url).path
    return path.strip('/').split('/')[-1]
aa94a6d3
PH
2405
2406
02dc0a36
S
def base_url(url):
    """Everything up to and including the last '/' before any query/fragment."""
    # Deliberately raises AttributeError when the URL has no path separator
    return re.match(r'https?://[^?#&]+/', url).group()
2409
2410
def urljoin(base, path):
    """Join *path* onto *base* like urllib.parse.urljoin, but return None for
    unusable inputs instead of raising. bytes inputs are decoded as UTF-8."""
    # compat_str/compat_urlparse are plain str/urllib.parse on py3;
    # use the stdlib names directly (urllib.parse is imported at file top)
    if isinstance(path, bytes):
        path = path.decode()
    if not isinstance(path, str) or not path:
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        # Already absolute (possibly protocol-relative)
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
2424
2425
aa94a6d3
PH
class HEADRequest(compat_urllib_request.Request):
    """A Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
2429
2430
95cf60e8
S
class PUTRequest(compat_urllib_request.Request):
    """A Request whose HTTP method is always PUT."""

    def get_method(self):
        return 'PUT'
2434
2435
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce *v* (optionally via attribute *get_attr*) to a scaled int,
    returning *default* on any conversion failure."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
9732d77e 2443
9572013d 2444
40a90862
JMF
def str_or_none(v, default=None):
    """Coerce *v* to str, returning *default* when it is None."""
    # compat_str is plain str on py3
    return default if v is None else str(v)
2447
9732d77e
PH
2448
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    if isinstance(int_str, compat_str):
        # Drop thousands separators and stray '+' signs before converting
        return int_or_none(re.sub(r'[,\.\+]', '', int_str))
608d11f5
PH
2456
2457
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce *v* to a scaled float, returning *default* on failure."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
43f775e4
PH
2465
2466
c7e327c4
S
def bool_or_none(v, default=None):
    """Return *v* only when it is a genuine bool; otherwise *default*."""
    if isinstance(v, bool):
        return v
    return default
2469
2470
53cd37ba
S
def strip_or_none(v, default=None):
    """Return *v* stripped of surrounding whitespace when it is a str; otherwise *default*."""
    # compat_str is plain str on py3
    return v.strip() if isinstance(v, str) else default
b72b4431
S
2473
2474
af03000a
S
def url_or_none(url):
    """Return the stripped URL if it looks like a supported-scheme URL, else None."""
    # compat_str is plain str on py3
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    # http(s), rtmp variants, rtsp, mms, ftp(s) and protocol-relative URLs
    return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
2480
2481
3e9b66d7
LNO
def request_to_url(req):
    """Accept either a Request object or a plain URL string; return the URL."""
    if isinstance(req, compat_urllib_request.Request):
        return req.get_full_url()
    return req
2487
2488
def strftime_or_none(timestamp, date_format, default=None):
    """Format a unix timestamp (int/float) or 'YYYYMMDD' string with
    *date_format*; return *default* on any failure."""
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            # Unsupported type (the old code reached this via AttributeError on None)
            return default
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2499
2500
def parse_duration(s):
    """Parse a duration ('1:23:45', 'PT1H2M', '2h30m', '90 min', ...) into seconds (float), or None."""
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days = hours = mins = secs = ms = None
    # 1) Colon-separated clock format: [[DD:]HH:]MM:SS[.ms]
    mobj = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if mobj:
        days, hours, mins, secs, ms = mobj.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # 2) ISO 8601 style / verbose units ('P1DT2H', '1 day 2 hours', ...)
        mobj = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if mobj:
            days, hours, mins, secs, ms = mobj.groups()
        else:
            # 3) Fractional hours/minutes only ('1.5 hours', '90 min')
            mobj = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if not mobj:
                return None
            hours, mins = mobj.groups()

    if ms:
        ms = ms.replace(':', '.')
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
91d7d0b3
JMF
2555
2556
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the filename's real extension, e.g.
    ('a.mp4', 'temp') -> 'a.temp.mp4'.

    If *expected_real_ext* is given and does not match the actual extension,
    *ext* is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    return (
        f'{name}.{ext}{real_ext}'
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        # Bug fix: the mismatch branch previously returned the literal
        # '(unknown).{ext}', throwing the original filename away
        else f'{filename}.{ext}')
d70ad093
PH
2563
2564
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the filename's extension for *ext*; if *expected_real_ext* is
    given and doesn't match, append *ext* to the whole filename instead."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return f'{name}.{ext}'
2570
2571
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE: the mutable default is safe here — args is never mutated
    try:
        Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        return False
    return exe
b7ab0590
PH
2580
2581
def _get_exe_version_output(exe, args, *, to_screen=None):
    """Run *exe* with *args*; return its combined output, or False if it can't be run."""
    if to_screen:
        to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
                                 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    except OSError:
        return False
    return stdout
cae97f65
PH
2594
2595
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's output.

    Falls back to *unrecognized* when *version_re* (default: 'version <ver>')
    does not match.
    """
    # compat_str is plain str on py3
    assert isinstance(output, str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
2605
2606
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    out = _get_exe_version_output(exe, args)
    if not out:
        return False
    return detect_exe_version(out, version_re, unrecognized)
2613
2614
def frange(start=0, stop=None, step=1):
    """Float range"""
    if stop is None:
        start, stop = 0, start
    if not step:
        # Zero step yields nothing (mirrors the sign-based guard)
        return
    sign = 1 if step > 0 else -1
    while sign * start < sign * stop:
        yield start
        start += step
2623
2624
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        # Subclassing the builtin lets callers catch LazyList.IndexError
        # specifically while remaining compatible with plain IndexError
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # _cache may be shared between copies / reversed views of one list
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        yield from self._cache
        for item in self._iterable:
            self._cache.append(item)
            yield item

    def _exhaust(self):
        # Drain the iterable into the cache (forward order)
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # Map an index of the reversed view onto the underlying cache
        return None if x is None else -(x + 1)

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Otherwise only pull as many items as the access actually needs
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        try:
            # Probe the cheapest end of the list for this orientation
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # Shares the cache with the original; only the view direction flips
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2712
483336e7 2713
class PagedList:
    """Base class for paginated lists; subclasses implement _getslice()."""

    class IndexError(IndexError):
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        # pagefunc: callable(pagenum) -> iterable of entries for that page
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        # Upper bound on page count; may be lowered once a fetch fails
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        page_results = self._cache.get(pagenum)
        if page_results is None:
            # Pages beyond the known page count are empty by definition
            page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        # Indexing repeatedly without a cache would refetch pages each time
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]
55575225 2752
9c44d242
PH
2753
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offset of the requested slice within this page (0 if the
            # slice started on an earlier page)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            # End offset within this page, or None if the slice extends past it
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember that this page failed so later getpage() calls
                # treat anything from here on as empty
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
81c2f20b
PH
2793
2794
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        # end is exclusive; clamp to the known page count
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        # Number of leading items to drop from the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Total number of items still wanted (None = unbounded)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    yield from page_results[:only_more]
                    break
            yield from page_results
9c44d242
PH
2819
2820
class PlaylistEntries:
    """Resolve --playlist-items style selections against a playlist info dict."""

    # Sentinel for an entry that was not extracted (incomplete playlists)
    MissingEntry = object()
    # Whether the full extent of the playlist is known
    is_exhausted = False

    def __init__(self, ydl, info_dict):
        self.ydl = ydl

        # _entries must be assigned now since infodict can change during iteration
        entries = info_dict.get('entries')
        if entries is None:
            raise EntryNotInPlaylist('There are no entries')
        elif isinstance(entries, list):
            self.is_exhausted = True

        requested_entries = info_dict.get('requested_entries')
        self.is_incomplete = bool(requested_entries)
        if self.is_incomplete:
            assert self.is_exhausted
            # requested_entries holds 1-based indices; build a sparse list
            self._entries = [self.MissingEntry] * max(requested_entries)
            for i, entry in zip(requested_entries, entries):
                self._entries[i - 1] = entry
        elif isinstance(entries, (list, PagedList, LazyList)):
            self._entries = entries
        else:
            self._entries = LazyList(entries)

    # Matches "START", "START:END", "START:END:STEP" (also with "-" as separator)
    PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
        (?P<start>[+-]?\d+)?
        (?P<range>[:-]
            (?P<end>[+-]?\d+|inf(?:inite)?)?
            (?::(?P<step>[+-]?\d+))?
        )?''')

    @classmethod
    def parse_playlist_items(cls, string):
        # Yields ints for single items and slices for ranges
        for segment in string.split(','):
            if not segment:
                raise ValueError('There is two or more consecutive commas')
            mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
            if not mobj:
                raise ValueError(f'{segment!r} is not a valid specification')
            start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
            if int_or_none(step) == 0:
                raise ValueError(f'Step in {segment!r} cannot be zero')
            yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)

    def get_requested_items(self):
        # Yields (1-based playlist index, entry) pairs per the user's selection
        playlist_items = self.ydl.params.get('playlist_items')
        playlist_start = self.ydl.params.get('playliststart', 1)
        playlist_end = self.ydl.params.get('playlistend')
        # For backwards compatibility, interpret -1 as whole list
        if playlist_end in (-1, None):
            playlist_end = ''
        if not playlist_items:
            playlist_items = f'{playlist_start}:{playlist_end}'
        elif playlist_start != 1 or playlist_end:
            self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)

        for index in self.parse_playlist_items(playlist_items):
            for i, entry in self[index]:
                yield i, entry
                try:
                    # TODO: Add auto-generated fields
                    self.ydl._match_entry(entry, incomplete=True, silent=True)
                except (ExistingVideoReached, RejectedVideoReached):
                    return

    def get_full_count(self):
        # Total entry count, or None when it cannot be determined cheaply
        if self.is_exhausted and not self.is_incomplete:
            return len(self)
        elif isinstance(self._entries, InAdvancePagedList):
            if self._entries._pagesize == 1:
                return self._entries._pagecount

    @functools.cached_property
    def _getter(self):
        # Returns a function fetching the entry at 0-based index i,
        # raising self.IndexError past the end
        if isinstance(self._entries, list):
            def get_entry(i):
                try:
                    entry = self._entries[i]
                except IndexError:
                    entry = self.MissingEntry
                    if not self.is_incomplete:
                        raise self.IndexError()
                if entry is self.MissingEntry:
                    raise EntryNotInPlaylist(f'Entry {i} cannot be found')
                return entry
        else:
            def get_entry(i):
                try:
                    # Route extraction errors through the ydl error handler
                    return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
                except (LazyList.IndexError, PagedList.IndexError):
                    raise self.IndexError()
        return get_entry

    def __getitem__(self, idx):
        # Yields (1-based index, entry); idx uses 1-based, inclusive semantics
        if isinstance(idx, int):
            idx = slice(idx, idx)

        # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
        step = 1 if idx.step is None else idx.step
        if idx.start is None:
            start = 0 if step > 0 else len(self) - 1
        else:
            start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start

        # NB: Do not call len(self) when idx == [:]
        if idx.stop is None:
            stop = 0 if step < 0 else float('inf')
        else:
            stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
        stop += [-1, 1][step > 0]

        for i in frange(start, stop, step):
            if i < 0:
                continue
            try:
                entry = self._getter(i)
            except self.IndexError:
                self.is_exhausted = True
                if step > 0:
                    break
                continue
            yield i + 1, entry

    def __len__(self):
        return len(tuple(self[:]))

    class IndexError(IndexError):
        pass
2951
2952
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escape sequences embedded in `s`."""
    decoder = codecs.getdecoder('unicode_escape')

    def decode_match(m):
        return decoder(m.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', decode_match, s)
0fe2ff78
YCH
2959
2960
def lowercase_escape(s):
    """Decode literal \\uXXXX escape sequences embedded in `s`."""
    decoder = codecs.getdecoder('unicode_escape')

    def decode_match(m):
        return decoder(m.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', decode_match, s)
b53466e1 2967
d05cfe06
S
2968
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # All RFC 3986 reserved/unreserved ASCII characters stay intact;
    # everything else gets percent-encoded
    SAFE_CHARS = b"%/;:@&=+$,!~*'()?#[]"
    return urllib.parse.quote(s, SAFE_CHARS)
d05cfe06
S
2972
2973
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parsed = compat_urllib_parse_urlparse(url)
    # The host is IDNA-encoded; every other component is percent-escaped
    return parsed._replace(
        netloc=parsed.netloc.encode('idna').decode('ascii'),
        **{part: escape_rfc3986(getattr(parsed, part))
           for part in ('path', 'params', 'query', 'fragment')},
    ).geturl()
2984
62e609ab 2985
def parse_qs(url):
    # Parse the query string of `url` into a dict mapping keys to lists of values
    return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2988
2989
62e609ab
PH
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, skipping blank lines and comments."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # Strip a UTF-8 BOM, whether decoded to '\ufeff' or left as raw bytes
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        return re.split(r'\s#', url, 1)[0].rstrip()

    with contextlib.closing(batch_fd) as fh:
        return [url for url in map(fixup, fh) if url]
b74fa8cd
JMF
3007
3008
def urlencode_postdata(*args, **kargs):
    # URL-encode and convert to bytes, as required for POST request bodies
    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
3011
3012
def update_url_query(url, query):
    """Return `url` with the key/value pairs of `query` merged into its query string."""
    if not query:
        return url
    parsed = compat_urlparse.urlparse(url)
    merged_qs = compat_parse_qs(parsed.query)
    merged_qs.update(query)
    new_query = compat_urllib_parse_urlencode(merged_qs, True)
    return compat_urlparse.urlunparse(parsed._replace(query=new_query))
16392824 3021
8e60dc75 3022
ed0291d1
S
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Return a copy of `req` with the given url/data/headers/query applied."""
    new_headers = req.headers.copy()
    new_headers.update(headers)
    # Preserve the HTTP method of the original request
    method = req.get_method()
    if method == 'HEAD':
        req_type = HEADRequest
    elif method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        update_url_query(url or req.get_full_url(), query),
        data=data or req.data, headers=new_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
3041
3042
def _multipart_encode_impl(data, boundary):
    """Encode `data` as a multipart/form-data body using `boundary`.

    Raises ValueError if the boundary occurs inside any field content.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    boundary_bytes = boundary.encode('ascii')

    out = b''
    for name, value in data.items():
        out += b'--' + boundary_bytes + b'\r\n'
        if isinstance(name, compat_str):
            name = name.encode()
        if isinstance(value, compat_str):
            value = value.encode()
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + name + b'"\r\n\r\n' + value + b'\r\n'
        if boundary_bytes in content:
            raise ValueError('Boundary overlaps with data')
        out += content

    out += b'--' + boundary_bytes + b'--\r\n'

    return out, content_type
3063
3064
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    has_specified_boundary = boundary is not None

    while True:
        if boundary is None:
            # Random boundary; retried if it collides with the payload
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))

        try:
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            # A caller-supplied boundary cannot be regenerated
            if has_specified_boundary:
                raise
            boundary = None
3093
3094
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Return the first usable value in `d` for the given key(s), else `default`."""
    for key in variadic(key_or_keys):
        value = d.get(key)
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
cbecc9b9
S
3100
3101
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each function in turn; return the first result that succeeds
    (and matches `expected_type`, if given). Returns None otherwise."""
    for fn in funcs:
        try:
            result = fn(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
3111
3112
def try_get(src, getter, expected_type=None):
    # Apply one or more getters to `src`, suppressing common lookup errors;
    # returns the first result matching `expected_type` (if given)
    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
329ca3be
S
3115
3116
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a copy of `dct` keeping only items for which cndn(key, value) holds."""
    result = {}
    for key, value in dct.items():
        if cndn(key, value):
            result[key] = value
    return result
3119
3120
6cc62232
S
def merge_dicts(*dicts):
    """Merge dicts left to right; the earliest non-None value wins.

    Exception: a value that is the empty string may be replaced by a
    string value from a later dict.
    """
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if v is None:
                continue
            if k not in merged or (isinstance(v, str) and merged[k] == ''):
                merged[k] = v
    return merged
3129
3130
8e60dc75
S
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    # Decode `string` to str if it is not one already.
    # NOTE: the default `encoding` is evaluated once, at import time
    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3133
16392824 3134
a1a530b0
PH
# US MPAA-style content ratings mapped to minimum viewer ages
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
3142
3143
# US TV Parental Guidelines ratings mapped to minimum viewer ages
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3152
3153
def parse_age_limit(s):
    """Parse an age limit (int, '18', '18+', US rating or TV guideline) to an int age."""
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:  # noqa: E721
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    # Accept 'TV14', 'TV-14' and 'TV_14' variants
    m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)] if m else None
146c80e2
S
3170
3171
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, returning the raw JSON payload."""
    pattern = r'''(?sx)^
        (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
        (?:\s*&&\s*(?P=func_name))?
        \s*\(\s*(?P<callback_data>.*)\);?
        \s*?(?://[^\n]*)*$'''
    return re.sub(pattern, r'\g<callback_data>', code)
478c2c61
PH
3180
3181
def js_to_json(code, vars={}):
    """Convert a JavaScript object/value literal into parseable JSON text."""
    # vars is a dict of var, val pairs to substitute
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # Hex and octal integers, optionally followed by ':' (object key position)
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def fix_kv(m):
        # Rewrite one token matched by the big regex below into valid JSON
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ""

        if v[0] in ("'", '"'):
            # Normalize quoting/escapes inside string literals
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    # As a key, the integer must become a quoted JSON key
                    return '"%d":' % i if v.endswith(':') else '%d' % i

            if v in vars:
                return vars[v]

        # Bare identifiers become JSON strings
        return '"%s"' % v

    def create_map(mobj):
        # Convert `new Map([[k, v], ...])` into a JSON object
        return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))

    code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
    code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
e05f6939
PH
3234
3235
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Unknown qualities sort below all known ones
        if qid not in quality_ids:
            return -1
        return quality_ids.index(qid)
    return q
3244
acd69589 3245
# Valid values for the "when" of postprocessors, listed in execution order
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
1e43a6f7 3247
3248
# Default output filename templates (used when no -o template is given)
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types. NOTE(review): the non-None values appear to be
# default filename suffixes for that type — confirm against YoutubeDL usage
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
0a871f68 3266
143db31d 3267# As of [1] format syntax is:
3268# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3269# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# Template regex for %-style format specifiers; {0} is the key pattern,
# {1} the allowed conversion-type characters
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# Conversion type characters accepted by %-style formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3285
7d1eb38a 3286
a020a0dc
PH
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
3295
3296
def version_tuple(v):
    """Convert a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
48844745
PH
3299
3300
def is_outdated_version(version, limit, assume_new=True):
    """Return True if `version` is older than `limit`.

    When the version is missing or unparsable, return `not assume_new`
    (i.e. assume up-to-date by default).
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
732ea2f0
PH
3308
3309
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported locally — presumably to avoid a circular import at load time; confirm
    from .update import is_non_updateable

    return not is_non_updateable()
7d4111ed
PH
3316
3317
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(map(compat_shlex_quote, args))
2ccd1b10
PH
3321
3322
def error_to_compat_str(err):
    # Kept for backward compatibility; simply stringifies the exception
    return str(err)
fdae2358
S
3325
3326
def error_to_str(err):
    """Format an exception as 'TypeName: message'."""
    return '{}: {}'.format(type(err).__name__, err)
3329
3330
def mimetype2ext(mt):
    """Guess the file extension for a MIME type, or None for a None input."""
    if mt is None:
        return None

    # Drop parameters ("; charset=...") and surrounding whitespace
    mt = mt.partition(';')[0].strip()

    # Exact full-type matches take priority
    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }
    if mt in FULL_MAP:
        return FULL_MAP[mt]

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }
    subtype = mt.rpartition('/')[2]
    if subtype.lower() in SUBTYPE_MAP:
        return SUBTYPE_MAP[subtype.lower()]

    # Fall back to the "+suffix" part of structured syntax subtypes
    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }
    suffix = subtype.partition('+')[2]
    if suffix in SUFFIX_MAP:
        return SUFFIX_MAP[suffix]

    return subtype.replace('+', '.')
c460bdd5
PH
3393
3394
2814f12b
THD
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a bare extension or a URL/filename."""
    if not ext_or_url:
        return None
    # A bare extension is turned into a fake filename so guess_type works
    name = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    return mimetypes.guess_type(name)[0]
3401
3402
def parse_codecs(codecs_str):
    """Parse an RFC 6381 codecs string into vcodec/acodec/dynamic_range (and scodec)."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec, scodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        parts = full_codec.split('.')
        # Leading zeros are insignificant in the 4CC (e.g. 'hev1' vs 'hev01')
        codec = parts[0].replace('0', '')
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                     'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            if not vcodec:
                # For vp9/av1/hvc1 only the first 4 dotted fields are kept
                vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
                if codec in ('dvh1', 'dvhe'):
                    hdr = 'DV'
                elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
                    hdr = 'HDR10'
                elif full_codec.replace('0', '').startswith('vp9.2'):
                    hdr = 'HDR10'
        elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        elif codec in ('stpp', 'wvtt',):
            # Subtitle codecs
            if not scodec:
                scodec = full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')
    if vcodec or acodec or scodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'scodec': scodec} if scodec is not None else {}),
        }
    elif len(split_codecs) == 2:
        # Two unrecognized codecs: assume video + audio, in that order
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3444
3445
def urlhandle_detect_ext(url_handle):
    """Guess the file extension of a response, preferring Content-Disposition."""
    headers = url_handle.headers

    cd = headers.get('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            ext = determine_ext(m.group('filename'), default_ext=None)
            if ext:
                return ext

    # Fall back to the declared MIME type
    return mimetype2ext(headers.get('Content-Type'))
05900629
PH
3458
3459
1e399778
YCH
def encode_data_uri(data, mime_type):
    """Build a base64 data: URI for the given bytes."""
    encoded = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, encoded)
3462
3463
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # No limit set, or content available for everyone
        return False
    return age_limit < content_limit
61ca9a80
PH
3472
3473
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]

    # Strip any (possibly repeated) BOM, remembering the encoding it implies
    encoding = 'utf-8'
    for bom, bom_encoding in BOMS:
        while first_bytes.startswith(bom):
            encoding = bom_encoding
            first_bytes = first_bytes[len(bom):]

    text = first_bytes.decode(encoding, 'replace')
    return re.match(r'^\s*<', text)
a055469f
PH
3491
3492
def determine_protocol(info_dict):
    """Infer the download protocol for an info dict."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    # Streaming schemes are recognized by URL prefix
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    # Manifest formats are recognized by extension
    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
3513
3514
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned """
    def width(string):
        # Visible width: ignore terminal escape sequences and the tab marker
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Keep only the columns whose filter entry is truthy (missing = keep)
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # hide_empty drops columns whose data cells are all empty (max width 0)
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        # Insert a separator row of `delim` characters after the header
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # Pad before the text so the part after \t ends up right-aligned
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
347de493
PH
3545
3546
8f18aca8 3547def _match_one(filter_part, dct, incomplete):
77b87f05 3548 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3549 STRING_OPERATORS = {
3550 '*=': operator.contains,
3551 '^=': lambda attr, value: attr.startswith(value),
3552 '$=': lambda attr, value: attr.endswith(value),
3553 '~=': lambda attr, value: re.search(value, attr),
3554 }
347de493 3555 COMPARISON_OPERATORS = {
a047eeb6 3556 **STRING_OPERATORS,
3557 '<=': operator.le, # "<=" must be defined above "<"
347de493 3558 '<': operator.lt,
347de493 3559 '>=': operator.ge,
a047eeb6 3560 '>': operator.gt,
347de493 3561 '=': operator.eq,
347de493 3562 }
a047eeb6 3563
6db9c4d5 3564 if isinstance(incomplete, bool):
3565 is_incomplete = lambda _: incomplete
3566 else:
3567 is_incomplete = lambda k: k in incomplete
3568
64fa820c 3569 operator_rex = re.compile(r'''(?x)
347de493 3570 (?P<key>[a-z_]+)
77b87f05 3571 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3572 (?:
a047eeb6 3573 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3574 (?P<strval>.+?)
347de493 3575 )
347de493 3576 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3577 m = operator_rex.fullmatch(filter_part.strip())
347de493 3578 if m:
18f96d12 3579 m = m.groupdict()
3580 unnegated_op = COMPARISON_OPERATORS[m['op']]
3581 if m['negation']:
77b87f05
MT
3582 op = lambda attr, value: not unnegated_op(attr, value)
3583 else:
3584 op = unnegated_op
18f96d12 3585 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3586 if m['quote']:
3587 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3588 actual_value = dct.get(m['key'])
3589 numeric_comparison = None
f9934b96 3590 if isinstance(actual_value, (int, float)):
e5a088dc
S
3591 # If the original field is a string and matching comparisonvalue is
3592 # a number we should respect the origin of the original field
3593 # and process comparison value as a string (see
18f96d12 3594 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3595 try:
18f96d12 3596 numeric_comparison = int(comparison_value)
347de493 3597 except ValueError:
18f96d12 3598 numeric_comparison = parse_filesize(comparison_value)
3599 if numeric_comparison is None:
3600 numeric_comparison = parse_filesize(f'{comparison_value}B')
3601 if numeric_comparison is None:
3602 numeric_comparison = parse_duration(comparison_value)
3603 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3604 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3605 if actual_value is None:
6db9c4d5 3606 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3607 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3608
3609 UNARY_OPERATORS = {
1cc47c66
S
3610 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3611 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3612 }
64fa820c 3613 operator_rex = re.compile(r'''(?x)
347de493 3614 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3615 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3616 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3617 if m:
3618 op = UNARY_OPERATORS[m.group('op')]
3619 actual_value = dct.get(m.group('key'))
6db9c4d5 3620 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3621 return True
347de493
PH
3622 return op(actual_value)
3623
3624 raise ValueError('Invalid filter part %r' % filter_part)
3625
3626
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns Whether the filter passes
    @param incomplete Set of keys that is expected to be missing from dct.
    Can be True/False to indicate all/none of the keys may be missing.
    All conditions on incomplete keys pass if the key is missing
    """
    # Conditions are joined with '&'; a literal '&' inside a condition is
    # escaped as '\&' and restored before evaluation
    conditions = re.split(r'(?<!\\)&', filter_str)
    return all(
        _match_one(condition.replace(r'\&', '&'), dct, incomplete)
        for condition in conditions)
347de493
PH
3637
3638
def match_filter_func(filters):
    """Build a --match-filter callable from one or more filter strings.

    Returns None when no filters are given. The returned callable yields
    None for videos that pass, NO_DEFAULT for the interactive ('-') mode,
    or a skip message string for videos that fail.
    """
    if not filters:
        return None
    filter_set = set(variadic(filters))

    # A lone '-' requests interactive confirmation instead of auto-skip
    interactive = '-' in filter_set
    filter_set.discard('-')

    def _match_func(info_dict, incomplete=False):
        passed = not filter_set or any(
            match_str(expr, info_dict, incomplete) for expr in filter_set)
        if passed:
            return NO_DEFAULT if interactive and not incomplete else None
        name = info_dict.get('title') or info_dict.get('id') or 'video'
        joined = ') | ('.join(expr.strip() for expr in filter_set)
        return f'{name} does not pass filter ({joined}), skipping ..'
    return _match_func
91410c9b
PH
3656
3657
def download_range_func(chapters, ranges):
    """Build a --download-sections callable yielding the sections to download.

    @param chapters  Iterable of regexes matched against chapter titles
    @param ranges    Iterable of (start, end) time pairs
    """
    def inner(info_dict, ydl):
        available_chapters = info_dict.get('chapters') or []
        warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
                   else 'Cannot match chapters since chapter information is unavailable')
        for pattern in chapters or []:
            for idx, chapter in enumerate(available_chapters):
                if re.search(pattern, chapter['title']):
                    warning = None  # at least one chapter matched
                    yield {**chapter, 'index': idx}
        if chapters and warning:
            ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')

        # Explicit time ranges are always emitted, after any chapter matches
        for start, end in ranges or []:
            yield {'start_time': start, 'end_time': end}

    return inner
3673
3674
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float).

    Supports plain offsets ('12.5', '12.5s') and clock times
    ('HH:MM:SS', 'HH:MM:SS.mmm', 'HH:MM:SS:fff'); returns None otherwise.
    """
    if not time_expr:
        return None

    offset_match = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, secs = clock_match.groups()
        # A ':' before the fraction (frames notation) is treated as '.'
        return 3600 * int(hours) + 60 * int(minutes) + float(secs.replace(':', '.'))
bf6427d2
YCH
3686
3687
def srt_subtitles_timecode(seconds):
    """Format a duration in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    hours, minutes, secs, msecs = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, msecs)
3690
3691
def ass_subtitles_timecode(seconds):
    """Format a duration in seconds as an ASS/SSA timecode (H:MM:SS.cc)."""
    hours, minutes, secs, msecs = timetuple_from_msec(seconds * 1000)
    # ASS uses centiseconds, not milliseconds
    return '%01d:%02d:%02d.%02d' % (hours, minutes, secs, msecs / 10)
bf6427d2
YCH
3695
3696
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitle data to SRT.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no <p> cue elements
    '''
    # Namespaces used by old TTAF drafts; rewritten as raw bytes before XML
    # parsing so a single set of XPath expressions covers all variants
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # The only TTML style properties translated into SRT/HTML-style tags
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}  # style id -> dict of resolved style properties
    default_style = {}  # document-wide style inherited from <body>/<div>

    class TTMLPElementParser:
        # NOTE(review): class-level attributes. _out is rebound per instance
        # by '+=', but the two lists are shared across instances; this stays
        # balanced in practice because end() pops whatever start() pushed.
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                # Effective style: document default, then the element's named
                # style, then inline tts:* attributes (highest priority)
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already in effect on the parent
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close the tags opened by the matching start(), innermost first
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Serialize the <p> subtree and re-parse it through TTMLPElementParser
        # to produce the SRT-compatible markup for a single cue
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    # Normalize legacy namespaces in the raw bytes before parsing
    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve named styles; re-run the loop until every parent style referenced
    # via the 'style' attribute has been resolved (handles forward references)
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # A style attached to <body> or <div> becomes the document-wide default
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # No explicit end: fall back to begin + dur, else drop the cue
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3859
3860
def cli_option(params, command_option, param, separator=None):
    """Build the argv fragment for a valued CLI option.

    Returns [] when the param is unset; ['--opt', 'value'] without a
    separator; or ['--opt<sep>value'] when a separator is given.
    """
    value = params.get(param)
    if value is None:
        return []
    if separator is None:
        return [command_option, str(value)]
    return [f'{command_option}{separator}{value}']
66e289ba
S
3866
3867
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the argv fragment for a boolean CLI option.

    Maps True/False to true_value/false_value and delegates the formatting
    (including the optional separator) to cli_option; None yields [].
    """
    flag = params.get(param)
    assert flag in (True, False, None)
    # Reuse cli_option by looking the flag up in a {True: ..., False: ...} map
    return cli_option({True: true_value, False: false_value}, command_option, flag, separator)
66e289ba
S
3872
3873
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
66e289ba
S
3876
3877
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Select configuration args from argdict for the first key (or key group)
    that has any entry.

    @param argdict  Dict mapping lower-cased keys to lists of args, a plain
                    list/tuple (legacy form), or None
    @param keys     List of keys/key-tuples to try, in priority order
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)
    assert isinstance(keys, (list, tuple))

    for key_list in keys:
        # Gather all non-None entries for this key (or group of keys)
        matched = [args for args in
                   (argdict.get(key.lower()) for key in variadic(key_list))
                   if args is not None]
        if matched:
            # Flatten the matched argument lists, preserving order
            return [arg for args in matched for arg in args]
    return default
66e289ba 3896
6251555f 3897
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Resolve the configuration-args lookup keys for an external program.

    Builds '<main_key>+<exe>' (or just '<exe>' when they coincide) prefixed
    keys and delegates to cli_configuration_args.
    """
    main_key, exe = main_key.lower(), exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    lookup_keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key in lookup_keys:
        # The bare root key is being queried: also fall back to the plain
        # (main_key, exe) pair and finally the global 'default' entry
        if main_key != exe:
            lookup_keys.append((main_key, exe))
        lookup_keys.append('default')
    else:
        use_compat = False
    return cli_configuration_args(argdict, lookup_keys, default, use_compat)
3909
66e289ba 3910
class ISO639Utils:
    """Translate between ISO 639-1 (two-letter) and ISO 639-2/T (three-letter)
    language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are considered, so regional variants
        # like 'en-US' also resolve; returns None for unknown codes
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Linear reverse lookup; returns None (implicitly) for unknown codes
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
4114
4115
class ISO3166Utils:
    """Map ISO 3166-1 alpha-2 country codes to full country names."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
        # Not ISO 3166 codes, but used for IP blocks
        'AP': 'Asia/Pacific Region',
        'EU': 'Europe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive lookup; returns None for unknown codes
        return cls._country_map.get(code.upper())
4377
4378
class GeoUtils:
    """Helpers for picking a plausible IPv4 address inside a per-country
    address block (used to fake a geographic location for geo-bypass)."""
    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random IPv4 address (dotted-quad string) from the given
        two-letter country code or CIDR block; None for an unknown code."""
        if len(code_or_block) == 2:
            # Two characters: interpret as a country code (case-insensitive)
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        else:
            block = code_or_block
        addr, preflen = block.split('/')
        # Network base address as an unsigned 32-bit integer (big-endian)
        addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
        # Highest address in the block: set all host bits to 1
        addr_max = addr_min | (0xffffffff >> int(preflen))
        return compat_str(socket.inet_ntoa(
            compat_struct_pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
4637
4638
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that allows overriding the proxy per request via the
    internal 'Ytdl-request-proxy' header ('__noproxy__' disables proxying)."""

    def __init__(self, proxies=None):
        # Install http/https handlers that funnel into proxy_open. The lambda
        # default arguments deliberately bind the loop variables at definition
        # time (late-binding closure pitfall workaround).
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy (passed via the Ytdl-request-proxy header)
        # takes precedence over the handler-level proxy; the header is
        # stripped so it is never sent over the wire
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # SOCKS proxies are handled by yt-dlp's own http/https handlers,
            # which wrap the socket themselves; returning None defers to them
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5bc880b9
YCH
4662
4663
0a5445dd
YCH
4664# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4665# released into Public Domain
4666# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4667
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n <= 0:
        # Matches the historical behaviour of the hand-rolled loop: n == 0
        # (and negative n) produced a single zero byte
        s = b'\000'
    else:
        # int.to_bytes supersedes the manual 32-bit chunk-and-strip loop
        # that this function (adapted from PyCrypto) used on Python 2
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4696
4697
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # int.from_bytes supersedes the manual zero-pad + 32-bit unpack loop
    # that this function (adapted from PyCrypto) used on Python 2;
    # it also handles the empty string (returns 0) identically
    return int.from_bytes(s, 'big')
4713
4714
5bc880b9
YCH
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The bytes are interpreted little-endian, hence the reversal before hexlify
    payload = int(binascii.hexlify(data[::-1]), 16)
    return '%x' % pow(payload, exponent, modulus)
81bdc8fd
YCH
4730
4731
f48409c7
YCH
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # RFC 8017 (EME-PKCS1-v1_5) requires the padding string PS to consist of
    # NONZERO octets - randint(0, 254) could emit a 0, which would make the
    # decrypter truncate the message at the stray separator byte
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4745
4746
5eb6bdce 4747def encode_base_n(num, n, table=None):
59f898b7 4748 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
59f898b7
YCH
4749 if not table:
4750 table = FULL_TABLE[:n]
4751
5eb6bdce
YCH
4752 if n > len(table):
4753 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4754
4755 if num == 0:
4756 return table[0]
4757
81bdc8fd
YCH
4758 ret = ''
4759 while num:
4760 ret = table[num % n] + ret
4761 num = num // n
4762 return ret
f52354a8
YCH
4763
4764
def decode_packed_codes(code):
    """Decode JavaScript code obfuscated with the common 'p,a,c,k,e,d' packer."""
    packed, base, count, symbols = re.search(PACKED_CODES_RE, code).groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    # Map each base-N token back to its original symbol
    # (empty symbol slots mean the token stands for itself)
    symbol_table = {}
    for index in range(count - 1, -1, -1):
        token = encode_base_n(index, base)
        symbol_table[token] = symbols[index] or token

    return re.sub(
        r'\b(\w+)\b', lambda m: symbol_table[m.group(0)], packed)
e154c651 4781
4782
1ced2221
S
def caesar(s, alphabet, shift):
    """Caesar-shift every character of *s* that occurs in *alphabet* by
    *shift* positions (wrapping); other characters pass through unchanged."""
    if shift == 0:
        return s
    size = len(alphabet)
    out = []
    for ch in s:
        if ch in alphabet:
            out.append(alphabet[(alphabet.index(ch) + shift) % size])
        else:
            out.append(ch)
    return ''.join(out)
4790
4791
def rot47(s):
    """Apply the ROT47 cipher: Caesar shift of 47 over the printable ASCII range."""
    ascii_printables = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, ascii_printables, 47)
4794
4795
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list (KEY=value pairs, values optionally quoted)
    into a dict; quotes are stripped from quoted values."""
    return {
        key: val[1:-1] if val.startswith('"') else val
        for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)}
1143535d
YCH
4803
4804
def urshift(val, n):
    """Unsigned 32-bit right shift (JavaScript's >>> operator)."""
    if val >= 0:
        return val >> n
    # Reinterpret the negative value as unsigned 32-bit before shifting
    return (val + 0x100000000) >> n
d3f8e038
YCH
4807
4808
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a PNG image into (width, height, pixels).

    pixels is a list of rows; each row is a flat list of byte values
    (3 bytes per pixel, as implied by the stride calculation below).
    Raises OSError for invalid or data-less PNG input.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    # Validate the 8-byte PNG signature and that the first chunk is IHDR
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise OSError('Not a valid PNG file.')

    # Big-endian integer readers keyed by byte width
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Walk the chunk stream: 4-byte length, 4-byte type, payload, 4-byte CRC
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # IHDR is guaranteed to be the first chunk by the check above
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Image data may be split across multiple IDAT chunks; concatenate them
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise OSError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # 3 bytes per pixel; each scanline is 1 filter-type byte + stride data bytes
    # NOTE(review): assumes 8-bit RGB without interlacing - other color
    # types/bit depths are not handled here
    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        # Look up an already-decoded byte by its flat index into the image
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # Neighbours used by the PNG filters: 'left' is the same channel
            # 3 bytes back (one pixel), 'up' is the same byte one row above
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            # Reverse the per-scanline filter (PNG spec section 9)
            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Paeth predictor: pick the neighbour closest to p
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
efa97bdc
YCH
4914
4915
def write_xattr(path, key, value):
    """Write the extended attribute *key* = *value* (bytes) on the file *path*.

    Tries, in order: NTFS Alternate Data Streams (Windows), the xattr/pyxattr
    Python modules, then the setfattr/xattr command-line tools.
    Raises XAttrMetadataError on write failure and XAttrUnavailableError when
    no mechanism is available.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules
    from .dependencies import xattr

    setxattr = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The CLI tools take the value as a text argument, so it must decode cleanly
    value = value.decode()
    try:
        _, stderr, returncode = Popen.run(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    if returncode:
        raise XAttrMetadataError(returncode, stderr)
0c265486
YCH
4966
4967
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to the string components
    of a random date between 1950-01-01 and 1995-12-31."""
    first = datetime.date(1950, 1, 1)
    last = datetime.date(1995, 12, 31)
    picked = first + datetime.timedelta(random.randint(0, (last - first).days))
    return {
        year_field: str(picked.year),
        month_field: str(picked.month),
        day_field: str(picked.day),
    }
732044af 4978
c76eb41b 4979
# Templates for internet shortcut files, which are plain text files.
# .url is the Windows InternetShortcut format, .webloc the macOS plist
# format, and .desktop the freedesktop.org (Linux) link format.
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Link-file kind -> template, keyed by the short format names
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
5011
732044af 5012
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
    """
    # Use urllib.parse directly for consistency with the quote/unparse calls below
    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
         net_location,

         urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

         # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
         urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

         # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
         urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

         urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5055
5056
def to_high_limit_path(path):
    """On Windows/Cygwin, prefix the absolute path with \\\\?\\ to bypass the
    MAX_PATH limit; on other platforms return *path* unchanged."""
    if sys.platform not in ('win32', 'cygwin'):
        return path
    # Work around MAX_PATH limitation on Windows. The maximum allowed length
    # for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
76d321f6 5063
c76eb41b 5064
def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=None):
    """Format the value at *field* inside *obj* using *template*; return
    *default* when the value is missing/falsy (or contained in *ignore*)."""
    val = traverse_obj(obj, *variadic(field))
    if ignore is NO_DEFAULT:
        # Default behaviour: skip falsy values, but keep a literal 0
        skip = not val and val != 0
    else:
        skip = val in ignore
    if skip:
        return default
    return template % (func(val) if func else val)
00dd0cd5 5070
5071
def clean_podcast_url(url):
    """Strip known podcast tracking/analytics redirect prefixes from *url*."""
    tracking_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(tracking_prefix_re, '', url)
ffcb8191
THD
5087
5088
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Generate a random UUIDv4-shaped string (xxxxxxxx-xxxx-4xxx-yxxx-...)."""
    template = 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'
    return re.sub(r'[xy]', lambda _: _HEX_TABLE[random.randint(0, 15)], template)
0202b52a 5094
5095
def make_dir(path, to_screen=None):
    """Ensure the parent directory of *path* exists.

    @param path       File path whose parent directory should be created
    @param to_screen  Optional callable used to report a failure message
    @returns          True on success (or nothing to create), False on OSError
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except OSError as err:
        # Was `callable(to_screen) is not None`, which is always truthy
        # (callable() returns a bool); check callability directly
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
f74980cb 5106
5107
def get_executable_path():
    """Absolute directory containing the running executable."""
    from .update import _get_variant_and_executable_path

    exe_path = _get_variant_and_executable_path()[1]
    return os.path.dirname(os.path.abspath(exe_path))
f74980cb 5112
5113
def load_plugins(name, suffix, namespace):
    """Load plugin classes from ytdlp_plugins/<name>/__init__.py (located next
    to the executable) into *namespace*.

    @param name       Plugin package name to load (e.g. 'extractor')
    @param suffix     Only attributes whose name ends with this are loaded
    @param namespace  Dict the loaded classes are inserted into
                      (existing keys are never overwritten)
    @returns          Dict of just the newly loaded classes
    """
    classes = {}
    with contextlib.suppress(FileNotFoundError):
        plugins_spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        plugins = importlib.util.module_from_spec(plugins_spec)
        sys.modules[plugins_spec.name] = plugins
        plugins_spec.loader.exec_module(plugins)
        # NB: the loop variable previously reused (and shadowed) the `name` parameter
        for attr_name in dir(plugins):
            if attr_name in namespace:
                continue
            if not attr_name.endswith(suffix):
                continue
            klass = getattr(plugins, attr_name)
            classes[attr_name] = namespace[attr_name] = klass
    return classes
06167fbb 5130
5131
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a:
                            - None:     Do nothing
                            - string:   A dictionary key
                            - int:      An index into a list
                            - tuple:    A list of keys all of which will be traversed
                            - Ellipsis: Fetch all values in the object
                            - Function: Takes the key and value as arguments
                                        and returns whether the key matches or not
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    # TODO: Write tests
    '''
    if not casesense:
        # Lower-case every string key in every path up front
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        # `depth` (nonlocal) records how many branching levels (Ellipsis /
        # function / tuple keys) were taken, so the caller knows how many
        # levels of nested lists to flatten afterwards
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                # Branch into each sub-key, then treat the results like `...`
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                # Filter (key, value) pairs through the user-supplied predicate
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    obj = str(obj)
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                # Plain dict lookup; falls back to case-insensitive scan
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    # Convert user-supplied strings into int indices or slices
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        # ':' over a sequence is equivalent to Ellipsis
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    # Normalize expected_type into a filtering callable
    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    elif expected_type is not None:
        type_test = expected_type
    else:
        type_test = lambda val: val

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # Branching occurred: flatten the nested result lists
                # (one chain per extra depth level), dropping Nones
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
324ad820 5230
5231
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated wrapper around traverse_obj (kept for backward compatibility)."""
    write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
                 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
6606817a 5236
5237
def get_first(obj, keys, **kwargs):
    """Return the first value matching *keys* inside any element of *obj*."""
    path = (..., *variadic(keys))
    return traverse_obj(obj, path, **kwargs, get_all=False)
5240
5241
def variadic(x, allowed_types=(str, bytes, dict)):
    """Return *x* itself if it is a (non-excluded) iterable, else wrap it in a
    1-tuple. Types in *allowed_types* are treated as scalars even though they
    are iterable."""
    if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types):
        return x
    return (x,)
bd50a52b
THD
5244
5245
3e9b66d7
LNO
def decode_base(value, digits):
    """Convert the base-N string *value* to an integer, where *digits* is the
    digit alphabet (its length defines the base).

    Raises KeyError if *value* contains a character not in *digits*.
    """
    table = {char: index for index, char in enumerate(digits)}
    result = 0
    base = len(digits)
    # NB: the loop variable was previously named `chr`, shadowing the builtin
    for char in value:
        result *= base
        result += table[char]
    return result
5255
5256
def time_seconds(**kwargs):
    """Current UNIX timestamp, taken in the fixed-offset timezone described by
    timedelta keyword args (e.g. hours=9 for JST)."""
    tz = datetime.timezone(datetime.timedelta(**kwargs))
    return datetime.datetime.now(tz).timestamp()
5260
5261
49fa4d9a
N
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create an HS256-signed JWT (JWS Compact Serialization) as bytes.

    @param payload_data  JSON-serializable payload (claims)
    @param key           Signing key (str)
    @param headers       Optional extra/override header fields
    """
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    # NB: previously used a mutable default argument (headers={})
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
    return token
819e0531 5279
5280
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Decode the payload of an HS256 JWT (no signature verification)."""
    header_b64, payload_b64, signature_b64 = jwt.split('.')
    # JWTs use unpadded base64url; urlsafe_b64decode requires '=' padding,
    # so restore it before decoding
    payload_data = json.loads(base64.urlsafe_b64decode(
        f'{payload_b64}{"=" * (-len(payload_b64) % 4)}'))
    return payload_data
5286
5287
# On Windows this starts False and is flipped by windows_enable_vt_mode();
# elsewhere it is None (not applicable)
WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5289
5290
@functools.cache
def supports_terminal_sequences(stream):
    """Best-effort check whether *stream* can render ANSI escape sequences."""
    if compat_os_name == 'nt':
        # On Windows, VT processing must have been enabled first
        supported = bool(WINDOWS_VT_MODE)
    else:
        supported = bool(os.getenv('TERM'))
    if not supported:
        return False
    try:
        return stream.isatty()
    except BaseException:
        return False
5302
5303
def windows_enable_vt_mode():  # TODO: Do this the proper way https://bugs.python.org/issue30075
    """Enable virtual-terminal (ANSI) processing on supported Windows versions
    by spawning a shell, then record success in WINDOWS_VT_MODE."""
    global WINDOWS_VT_MODE
    if get_windows_version() < (10, 0, 10586):
        return
    try:
        Popen.run('', shell=True)
    except Exception:
        return
    WINDOWS_VT_MODE = True
    # Invalidate cached answers now that VT mode is available
    supports_terminal_sequences.cache_clear()
5315
5316
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Strip ANSI color/formatting escape sequences from *string*."""
    return re.sub(_terminal_sequences_re, '', string)
5322
5323
def number_of_digits(number):
    """Length of the decimal ('%d') representation of *number*, sign included."""
    rendered = '%d' % number
    return len(rendered)
34921b43 5326
5327
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the string forms of all truthy *values* with *delim*.
    If *from_dict* is given, each value is first looked up in it."""
    if from_dict is not None:
        values = map(from_dict.get, values)
    return delim.join(str(val) for val in values if val)
06e57990 5332
5333
27231526
ZM
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    max_dimensions = max(
        (tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats),
        default=(0, 0))
    if not max_dimensions[0]:
        return thumbnails
    scaled = []
    for thumbnail in thumbnails:
        scaled.append(merge_dicts(
            {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
            dict(zip(_keys, max_dimensions)), thumbnail))
    return scaled
5354
5355
93c8410d
LNO
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    m = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range or '')
    if not m:
        return None, None, None
    start, end, total = m.groups()
    return int(start), int_or_none(end), int_or_none(total)
5364
5365
def read_stdin(what):
    """Announce that *what* will be read from STDIN and return sys.stdin."""
    eof_key = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
    write_string(f'Reading {what} from STDIN - EOF ({eof_key}) to end:\n')
    return sys.stdin
5370
5371
class Config:
    """One parsed options source (CLI args, a config file, or stdin) plus any
    nested configs it pulls in via --config-locations."""
    own_args = None      # raw argument list this config was initialized with
    parsed_args = None   # NOTE(review): also holds the raw args (see init), despite the name
    filename = None      # path of the config file, when read from one
    __initialized = False

    def __init__(self, parser, label=None):
        self.parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Parse *args* (optionally sourced from *filename*) and recursively
        load any referenced config locations.

        @returns False if *filename* was already loaded (cycle guard), else True
        """
        assert not self.__initialized
        directory = ''
        if filename:
            location = os.path.realpath(filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.own_args, self.__initialized = args, True
        opts, _ = self.parser.parse_known_args(args)
        self.parsed_args, self.filename = args, filename

        for location in opts.config_locations or []:
            if location == '-':
                self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
                continue
            # Relative locations are resolved against the referencing file
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self.parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read and shlex-split a config file; returns *default* if it is absent."""
        try:
            optionf = open(filename)
        except OSError:
            return default  # silently skip if file is not present
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read()
            res = shlex.split(contents, comments=True)
        except Exception as err:
            # Name the offending file in the error (was a literal "(unknown)")
            raise ValueError(f'Unable to parse "{filename}": {err}')
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of *opts* with credential option values replaced by 'PRIVATE'."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            # Handle the --option=value form
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        # Handle the two-token form: scrub the value following a private option
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a child config (sharing the loaded-path set) and keep it if it loads."""
        config = type(self)(self.parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        # Nested configs first (in reverse), then this config's own args,
        # so later (outer) options take precedence when re-parsed
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.parsed_args or []

    def parse_known_args(self, **kwargs):
        return self.parser.parse_known_args(self.all_args, **kwargs)

    def parse_args(self):
        return self.parser.parse_args(self.all_args)
da42679b
LNO
5468
5469
class WebSocketsWrapper():
    """Wraps websockets module to use in non-async scopes"""
    # The connected websocket protocol object (set by __enter__)
    pool = None

    def __init__(self, url, headers=None, connect=True):
        # A private event loop lets sync callers drive the async API
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        # Ensure the connection is torn down even if the caller never exits
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        # Synchronous send over the wrapped connection
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        # Synchronous receive over the wrapped connection
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            # NOTE(review): the loop is closed *before* _cancel_all_tasks runs
            # run_until_complete on it - confirm whether this ordering is
            # intentional (cancellation would fail on a closed loop if any
            # tasks were still pending)
            self.loop.close()
            self._cancel_all_tasks(self.loop)

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        # Run a coroutine to completion on *loop*, then drain async
        # generators/executors (mirrors asyncio.run's shutdown sequence)
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        # Cancel every outstanding task and surface any exceptions they raised
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: "loop" is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5539
5540
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for key, value in headers.items():
            # Title-casing the key makes e.g. 'accept' and 'Accept' collide
            merged[key.title()] = value
    return merged
28787f16 5544
5545
class classproperty:
    """classmethod(property(func)) that works in py < 3.9"""

    def __init__(self, func):
        # Copy name/doc metadata so the descriptor looks like the wrapped function
        functools.update_wrapper(self, func)
        self.func = func

    def __get__(self, instance, cls):
        # Always invoked with the class, regardless of instance access
        return self.func(cls)
19a03940 5555
5556
class Namespace(types.SimpleNamespace):
    """Immutable namespace"""

    def __iter__(self):
        # Iteration yields the stored values, not the attribute names
        return iter(vars(self).values())

    @property
    def items_(self):
        # (name, value) view of the attributes; trailing underscore avoids
        # clashing with an attribute literally named 'items'
        return vars(self).items()
9b8ee23b 5566
5567
# Deprecated
# (kept as module-level booleans for backward compatibility; prefer checking
# the `certifi`/`websockets` objects from .dependencies directly)
has_certifi = bool(certifi)
has_websockets = bool(websockets)