cc52de43 1#!/usr/bin/env python3
15dfb392 2import atexit
1e399778 3import base64
5bc880b9 4import binascii
912b38b4 5import calendar
676eb3f2 6import codecs
c380cc28 7import collections
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96 10import datetime
0c265486 11import email.header
f8271158 12import email.utils
f45c185f 13import errno
be4a824d 14import functools
d77c3dfd 15import gzip
16import hashlib
17import hmac
019a94f7 18import importlib.util
03f9daab 19import io
79a2e94e 20import itertools
f4bfd65f 21import json
d77c3dfd 22import locale
02dbf93f 23import math
f8271158 24import mimetypes
347de493 25import operator
d77c3dfd 26import os
c496ca96 27import platform
773f291d 28import random
d77c3dfd 29import re
f8271158 30import shlex
c496ca96 31import socket
79a2e94e 32import ssl
1c088fa8 33import subprocess
d77c3dfd 34import sys
181c8655 35import tempfile
c380cc28 36import time
01951dda 37import traceback
f8271158 38import urllib.parse
bcf89ce6 39import xml.etree.ElementTree
d77c3dfd 40import zlib
d77c3dfd 41
8c25f81b 42from .compat import (
1e9969f4 43 asyncio,
8c25f81b 44 compat_chr,
1bab3437 45 compat_cookiejar,
36e6f62c 46 compat_etree_fromstring,
51098426 47 compat_expanduser,
8c25f81b 48 compat_html_entities,
55b2f099 49 compat_html_entities_html5,
f8271158 50 compat_HTMLParseError,
51 compat_HTMLParser,
be4a824d 52 compat_http_client,
f8271158 53 compat_HTTPError,
efa97bdc 54 compat_os_name,
8c25f81b 55 compat_parse_qs,
702ccf2d 56 compat_shlex_quote,
8c25f81b 57 compat_str,
edaa23f8 58 compat_struct_pack,
d3f8e038 59 compat_struct_unpack,
8c25f81b 60 compat_urllib_error,
f8271158 61 compat_urllib_parse_unquote_plus,
15707c7e 62 compat_urllib_parse_urlencode,
63 compat_urllib_parse_urlparse,
64 compat_urllib_request,
65 compat_urlparse,
66)
9b8ee23b 67from .dependencies import brotli, certifi, websockets
f8271158 68from .socks import ProxyType, sockssocket
71aff188 69
4644ac55 70
71def register_socks_protocols():
72 # "Register" SOCKS protocols
73 # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
74 # URLs with protocols not in urlparse.uses_netloc are not handled correctly
75 for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
76 if scheme not in compat_urlparse.uses_netloc:
77 compat_urlparse.uses_netloc.append(scheme)
78
79
80# This is not clearly defined otherwise
81compiled_regex_type = type(re.compile(''))
82
83
84def random_user_agent():
85 _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
86 _CHROME_VERSIONS = (
19b4c74d 87 '90.0.4430.212',
88 '90.0.4430.24',
89 '90.0.4430.70',
90 '90.0.4430.72',
91 '90.0.4430.85',
92 '90.0.4430.93',
93 '91.0.4472.101',
94 '91.0.4472.106',
95 '91.0.4472.114',
96 '91.0.4472.124',
97 '91.0.4472.164',
98 '91.0.4472.19',
99 '91.0.4472.77',
100 '92.0.4515.107',
101 '92.0.4515.115',
102 '92.0.4515.131',
103 '92.0.4515.159',
104 '92.0.4515.43',
105 '93.0.4556.0',
106 '93.0.4577.15',
107 '93.0.4577.63',
108 '93.0.4577.82',
109 '94.0.4606.41',
110 '94.0.4606.54',
111 '94.0.4606.61',
112 '94.0.4606.71',
113 '94.0.4606.81',
114 '94.0.4606.85',
115 '95.0.4638.17',
116 '95.0.4638.50',
117 '95.0.4638.54',
118 '95.0.4638.69',
119 '95.0.4638.74',
120 '96.0.4664.18',
121 '96.0.4664.45',
122 '96.0.4664.55',
123 '96.0.4664.93',
124 '97.0.4692.20',
125 )
126 return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)
127
128
4390d5ec 129SUPPORTED_ENCODINGS = [
130 'gzip', 'deflate'
131]
9b8ee23b 132if brotli:
4390d5ec 133 SUPPORTED_ENCODINGS.append('br')
134
3e669f36 135std_headers = {
f7a147e3 136 'User-Agent': random_user_agent(),
59ae15a5 137 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
59ae15a5 138 'Accept-Language': 'en-us,en;q=0.5',
b1156c1e 139 'Sec-Fetch-Mode': 'navigate',
3e669f36 140}
f427df17 141
5f6a1245 142
143USER_AGENTS = {
144 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
145}
146
147
148NO_DEFAULT = object()
149
150ENGLISH_MONTH_NAMES = [
151 'January', 'February', 'March', 'April', 'May', 'June',
152 'July', 'August', 'September', 'October', 'November', 'December']
153
154MONTH_NAMES = {
155 'en': ENGLISH_MONTH_NAMES,
156 'fr': [
157 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
158 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
f6717dec 159}
a942d6cb 160
161KNOWN_EXTENSIONS = (
162 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
163 'flv', 'f4v', 'f4a', 'f4b',
164 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
165 'mkv', 'mka', 'mk3d',
166 'avi', 'divx',
167 'mov',
168 'asf', 'wmv', 'wma',
169 '3gp', '3g2',
170 'mp3',
171 'flac',
172 'ape',
173 'wav',
174 'f4f', 'f4m', 'm3u8', 'smil')
175
c587cbb7 176# needed for sanitizing filenames in restricted mode
c8827027 177ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
178 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
179 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
c587cbb7 180
181DATE_FORMATS = (
182 '%d %B %Y',
183 '%d %b %Y',
184 '%B %d %Y',
185 '%B %dst %Y',
186 '%B %dnd %Y',
9d30c213 187 '%B %drd %Y',
cb655f34 188 '%B %dth %Y',
46f59e89 189 '%b %d %Y',
190 '%b %dst %Y',
191 '%b %dnd %Y',
9d30c213 192 '%b %drd %Y',
cb655f34 193 '%b %dth %Y',
194 '%b %dst %Y %I:%M',
195 '%b %dnd %Y %I:%M',
9d30c213 196 '%b %drd %Y %I:%M',
197 '%b %dth %Y %I:%M',
198 '%Y %m %d',
199 '%Y-%m-%d',
bccdbd22 200 '%Y.%m.%d.',
46f59e89 201 '%Y/%m/%d',
81c13222 202 '%Y/%m/%d %H:%M',
46f59e89 203 '%Y/%m/%d %H:%M:%S',
204 '%Y%m%d%H%M',
205 '%Y%m%d%H%M%S',
4f3fa23e 206 '%Y%m%d',
0c1c6f4b 207 '%Y-%m-%d %H:%M',
208 '%Y-%m-%d %H:%M:%S',
209 '%Y-%m-%d %H:%M:%S.%f',
5014558a 210 '%Y-%m-%d %H:%M:%S:%f',
211 '%d.%m.%Y %H:%M',
212 '%d.%m.%Y %H.%M',
213 '%Y-%m-%dT%H:%M:%SZ',
214 '%Y-%m-%dT%H:%M:%S.%fZ',
215 '%Y-%m-%dT%H:%M:%S.%f0Z',
216 '%Y-%m-%dT%H:%M:%S',
217 '%Y-%m-%dT%H:%M:%S.%f',
218 '%Y-%m-%dT%H:%M',
219 '%b %d %Y at %H:%M',
220 '%b %d %Y at %H:%M:%S',
221 '%B %d %Y at %H:%M',
222 '%B %d %Y at %H:%M:%S',
a63d9bd0 223 '%H:%M %d-%b-%Y',
224)
225
226DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
227DATE_FORMATS_DAY_FIRST.extend([
228 '%d-%m-%Y',
229 '%d.%m.%Y',
230 '%d.%m.%y',
231 '%d/%m/%Y',
232 '%d/%m/%y',
233 '%d/%m/%Y %H:%M:%S',
234])
235
236DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
237DATE_FORMATS_MONTH_FIRST.extend([
238 '%m-%d-%Y',
239 '%m.%d.%Y',
240 '%m/%d/%Y',
241 '%m/%d/%y',
242 '%m/%d/%Y %H:%M:%S',
243])
244
06b3fe29 245PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
22f5f5c6 246JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
06b3fe29 247
1d485a1a 248NUMBER_RE = r'\d+(?:\.\d+)?'
249
7105440c 250
d77c3dfd 251def preferredencoding():
59ae15a5 252 """Get preferred encoding.
d77c3dfd 253
254 Returns the best encoding scheme for the system, based on
255 locale.getpreferredencoding() and some further tweaks.
256 """
257 try:
258 pref = locale.getpreferredencoding()
28e614de 259 'TEST'.encode(pref)
70a1165b 260 except Exception:
59ae15a5 261 pref = 'UTF-8'
bae611f2 262
59ae15a5 263 return pref
d77c3dfd 264
f4bfd65f 265
181c8655 266def write_json_file(obj, fn):
1394646a 267 """ Encode obj as JSON and write it to fn, atomically if possible """
181c8655 268
cfb0511d 269 tf = tempfile.NamedTemporaryFile(
270 prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
271 suffix='.tmp', delete=False, mode='w', encoding='utf-8')
272
273 try:
274 with tf:
45d86abe 275 json.dump(obj, tf, ensure_ascii=False)
276 if sys.platform == 'win32':
277 # Need to remove existing file on Windows, else os.rename raises
278 # WindowsError or FileExistsError.
19a03940 279 with contextlib.suppress(OSError):
1394646a 280 os.unlink(fn)
19a03940 281 with contextlib.suppress(OSError):
282 mask = os.umask(0)
283 os.umask(mask)
284 os.chmod(tf.name, 0o666 & ~mask)
181c8655 285 os.rename(tf.name, fn)
70a1165b 286 except Exception:
19a03940 287 with contextlib.suppress(OSError):
181c8655 288 os.remove(tf.name)
289 raise
290
291
cfb0511d 292def find_xpath_attr(node, xpath, key, val=None):
293 """ Find the xpath xpath[@key=val] """
294 assert re.match(r'^[a-zA-Z_-]+$', key)
86e5f3ed 295 expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
cfb0511d 296 return node.find(expr)
59ae56fa 297
298# On python2.6 the xml.etree.ElementTree.Element methods don't support
299# the namespace parameter
300
301
302def xpath_with_ns(path, ns_map):
303 components = [c.split(':') for c in path.split('/')]
304 replaced = []
305 for c in components:
306 if len(c) == 1:
307 replaced.append(c[0])
308 else:
309 ns, tag = c
310 replaced.append('{%s}%s' % (ns_map[ns], tag))
311 return '/'.join(replaced)
312
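# Illustrative usage (a sketch inferred from the code above, not from upstream docs):
# xpath_with_ns expands namespace prefixes into the Clark notation that ElementTree expects, e.g.
#   >>> xpath_with_ns('media:group/media:title', {'media': 'http://search.yahoo.com/mrss/'})
#   '{http://search.yahoo.com/mrss/}group/{http://search.yahoo.com/mrss/}title'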
d77c3dfd 313
a41fb80c 314def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
578c0745 315 def _find_xpath(xpath):
f9934b96 316 return node.find(xpath)
317
318 if isinstance(xpath, (str, compat_str)):
319 n = _find_xpath(xpath)
320 else:
321 for xp in xpath:
322 n = _find_xpath(xp)
323 if n is not None:
324 break
d74bebd5 325
8e636da4 326 if n is None:
327 if default is not NO_DEFAULT:
328 return default
329 elif fatal:
330 name = xpath if name is None else name
331 raise ExtractorError('Could not find XML element %s' % name)
332 else:
333 return None
334 return n
335
336
337def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
338 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
339 if n is None or n == default:
340 return n
341 if n.text is None:
342 if default is not NO_DEFAULT:
343 return default
344 elif fatal:
345 name = xpath if name is None else name
346 raise ExtractorError('Could not find XML element\'s text %s' % name)
347 else:
348 return None
349 return n.text
350
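# Illustrative usage (a sketch inferred from the code above; `node` is a hypothetical element):
#   >>> node = compat_etree_fromstring('<root><title>foo</title></root>')
#   >>> xpath_text(node, 'title')                    # -> 'foo'
#   >>> xpath_text(node, 'missing', default=None)    # -> None instead of raising
# With fatal=True and no default, a missing element raises ExtractorError instead.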
351
352def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
353 n = find_xpath_attr(node, xpath, key)
354 if n is None:
355 if default is not NO_DEFAULT:
356 return default
357 elif fatal:
86e5f3ed 358 name = f'{xpath}[@{key}]' if name is None else name
359 raise ExtractorError('Could not find XML attribute %s' % name)
360 else:
361 return None
362 return n.attrib[key]
363
364
9e6dd238 365def get_element_by_id(id, html):
43e8fafd 366 """Return the content of the tag with the specified ID in the passed HTML document"""
611c1dd9 367 return get_element_by_attribute('id', id, html)
43e8fafd 368
12ea2f30 369
370def get_element_html_by_id(id, html):
371 """Return the html of the tag with the specified ID in the passed HTML document"""
372 return get_element_html_by_attribute('id', id, html)
373
374
84c237fb 375def get_element_by_class(class_name, html):
376 """Return the content of the first tag with the specified class in the passed HTML document"""
377 retval = get_elements_by_class(class_name, html)
378 return retval[0] if retval else None
379
380
381def get_element_html_by_class(class_name, html):
382 """Return the html of the first tag with the specified class in the passed HTML document"""
383 retval = get_elements_html_by_class(class_name, html)
384 return retval[0] if retval else None
385
386
387def get_element_by_attribute(attribute, value, html, escape_value=True):
388 retval = get_elements_by_attribute(attribute, value, html, escape_value)
389 return retval[0] if retval else None
390
391
392def get_element_html_by_attribute(attribute, value, html, escape_value=True):
393 retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
394 return retval[0] if retval else None
395
396
397def get_elements_by_class(class_name, html):
398 """Return the content of all tags with the specified class in the passed HTML document as a list"""
399 return get_elements_by_attribute(
400 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
401 html, escape_value=False)
402
403
404def get_elements_html_by_class(class_name, html):
405 """Return the html of all tags with the specified class in the passed HTML document as a list"""
406 return get_elements_html_by_attribute(
407 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
408 html, escape_value=False)
409
410
411def get_elements_by_attribute(*args, **kwargs):
43e8fafd 412 """Return the content of all tags with the specified attribute in the passed HTML document as a list"""
413 return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
414
415
416def get_elements_html_by_attribute(*args, **kwargs):
417 """Return the html of all tags with the specified attribute in the passed HTML document as a list"""
418 return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
419
420
421def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
422 """
423 Return the text (content) and the html (whole) of the tag with the specified
424 attribute in the passed HTML document
425 """
9e6dd238 426
86e5f3ed 427 quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'
0254f162 428
429 value = re.escape(value) if escape_value else value
430
86e5f3ed 431 partial_element_re = rf'''(?x)
6f32a0b5 432 <(?P<tag>[a-zA-Z0-9:._-]+)
0254f162 433 (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
86e5f3ed 434 \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
435 '''
38285056 436
437 for m in re.finditer(partial_element_re, html):
438 content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])
a921f407 439
440 yield (
441 unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
442 whole
443 )
a921f407 444
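# Illustrative usage (a sketch inferred from the helpers above, not from upstream docs):
#   >>> list(get_elements_text_and_html_by_attribute('class', 'foo', '<div class="foo">bar</div>'))
#   [('bar', '<div class="foo">bar</div>')]
#   >>> get_element_by_class('foo', '<div class="foo">bar</div>')
#   'bar'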
c5229f39 445
446class HTMLBreakOnClosingTagParser(compat_HTMLParser):
447 """
448 HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
449 closing tag for the first opening tag it has encountered, and can be used
450 as a context manager
451 """
452
453 class HTMLBreakOnClosingTagException(Exception):
454 pass
455
456 def __init__(self):
457 self.tagstack = collections.deque()
458 compat_HTMLParser.__init__(self)
459
460 def __enter__(self):
461 return self
462
463 def __exit__(self, *_):
464 self.close()
465
466 def close(self):
467 # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
468 # so data remains buffered; we no longer have any interest in it, thus
469 # override this method to discard it
470 pass
471
472 def handle_starttag(self, tag, _):
473 self.tagstack.append(tag)
474
475 def handle_endtag(self, tag):
476 if not self.tagstack:
477 raise compat_HTMLParseError('no tags in the stack')
478 while self.tagstack:
479 inner_tag = self.tagstack.pop()
480 if inner_tag == tag:
481 break
482 else:
483 raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
484 if not self.tagstack:
485 raise self.HTMLBreakOnClosingTagException()
486
487
488def get_element_text_and_html_by_tag(tag, html):
489 """
490 For the first element with the specified tag in the passed HTML document
491 return its content (text) and the whole element (html)
492 """
493 def find_or_raise(haystack, needle, exc):
494 try:
495 return haystack.index(needle)
496 except ValueError:
497 raise exc
498 closing_tag = f'</{tag}>'
499 whole_start = find_or_raise(
500 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
501 content_start = find_or_raise(
502 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
503 content_start += whole_start + 1
504 with HTMLBreakOnClosingTagParser() as parser:
505 parser.feed(html[whole_start:content_start])
506 if not parser.tagstack or parser.tagstack[0] != tag:
507 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
508 offset = content_start
509 while offset < len(html):
510 next_closing_tag_start = find_or_raise(
511 html[offset:], closing_tag,
512 compat_HTMLParseError(f'closing {tag} tag not found'))
513 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
514 try:
515 parser.feed(html[offset:offset + next_closing_tag_end])
516 offset += next_closing_tag_end
517 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
518 return html[content_start:offset + next_closing_tag_start], \
519 html[whole_start:offset + next_closing_tag_end]
520 raise compat_HTMLParseError('unexpected end of html')
521
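# Illustrative usage (a sketch inferred from the parser above): nested tags with the
# same name are matched correctly, e.g.
#   >>> get_element_text_and_html_by_tag('span', '<div><span>a<span>b</span></span></div>')
#   ('a<span>b</span>', '<span>a<span>b</span></span>')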
522
523class HTMLAttributeParser(compat_HTMLParser):
524 """Trivial HTML parser to gather the attributes for a single element"""
b6e0c7d2 525
8bb56eee 526 def __init__(self):
c5229f39 527 self.attrs = {}
528 compat_HTMLParser.__init__(self)
529
530 def handle_starttag(self, tag, attrs):
531 self.attrs = dict(attrs)
532
c5229f39 533
534class HTMLListAttrsParser(compat_HTMLParser):
535 """HTML parser to gather the attributes for the elements of a list"""
536
537 def __init__(self):
538 compat_HTMLParser.__init__(self)
539 self.items = []
540 self._level = 0
541
542 def handle_starttag(self, tag, attrs):
543 if tag == 'li' and self._level == 0:
544 self.items.append(dict(attrs))
545 self._level += 1
546
547 def handle_endtag(self, tag):
548 self._level -= 1
549
550
551def extract_attributes(html_element):
552 """Given a string for an HTML element such as
553 <el
554 a="foo" B="bar" c="&98;az" d=boz
555 empty= noval entity="&amp;"
556 sq='"' dq="'"
557 >
558 Decode and return a dictionary of attributes.
559 {
560 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
561 'empty': '', 'noval': None, 'entity': '&',
562 'sq': '"', 'dq': '\''
563 }.
564 """
565 parser = HTMLAttributeParser()
19a03940 566 with contextlib.suppress(compat_HTMLParseError):
567 parser.feed(html_element)
568 parser.close()
8bb56eee 569 return parser.attrs
9e6dd238 570
c5229f39 571
572def parse_list(webpage):
573 """Given a string for a series of HTML <li> elements,
574 return a list of dictionaries of their attributes"""
575 parser = HTMLListAttrsParser()
576 parser.feed(webpage)
577 parser.close()
578 return parser.items
579
580
9e6dd238 581def clean_html(html):
59ae15a5 582 """Clean an HTML snippet into a readable string"""
583
584 if html is None: # Convenience for sanitizing descriptions etc.
585 return html
586
49185227 587 html = re.sub(r'\s+', ' ', html)
588 html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
589 html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
590 # Strip html tags
591 html = re.sub('<.*?>', '', html)
592 # Replace html entities
593 html = unescapeHTML(html)
7decf895 594 return html.strip()
595
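# Illustrative usage (a sketch inferred from the code above): <br> and </p><p> become
# newlines, remaining tags are stripped and entities are decoded, e.g.
#   >>> clean_html('<p>foo<br/>bar &amp; baz</p>')
#   'foo\nbar & baz'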
596
d77c3dfd 597def sanitize_open(filename, open_mode):
598 """Try to open the given filename, and slightly tweak it if this fails.
599
600 Attempts to open the given filename. If this fails, it tries to change
601 the filename slightly, step by step, until it's either able to open it
602 or it fails and raises a final exception, like the standard open()
603 function.
604
605 It returns the tuple (stream, definitive_file_name).
606 """
0edb3e33 607 if filename == '-':
608 if sys.platform == 'win32':
609 import msvcrt
610 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
611 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
59ae15a5 612
0edb3e33 613 for attempt in range(2):
614 try:
615 try:
89737671 616 if sys.platform == 'win32':
b506289f 617 # FIXME: An exclusive lock also locks the file from being read.
618 # Since windows locks are mandatory, don't lock the file on windows (for now).
619 # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
89737671 620 raise LockingUnsupportedError()
0edb3e33 621 stream = locked_file(filename, open_mode, block=False).__enter__()
622 except LockingUnsupportedError:
623 stream = open(filename, open_mode)
624 return (stream, filename)
86e5f3ed 625 except OSError as err:
0edb3e33 626 if attempt or err.errno in (errno.EACCES,):
627 raise
628 old_filename, filename = filename, sanitize_path(filename)
629 if old_filename == filename:
630 raise
631
632
633def timeconvert(timestr):
634 """Convert RFC 2822 defined time string into system timestamp"""
635 timestamp = None
636 timetuple = email.utils.parsedate_tz(timestr)
637 if timetuple is not None:
638 timestamp = email.utils.mktime_tz(timetuple)
639 return timestamp
1c469a94 640
5f6a1245 641
5c3895ff 642def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
59ae15a5 643 """Sanitizes a string so it could be used as part of a filename.
5c3895ff 644 @param restricted Use a stricter subset of allowed characters
645 @param is_id Whether this is an ID that should be kept unchanged if possible.
646 If unset, yt-dlp's new sanitization rules are in effect
59ae15a5 647 """
5c3895ff 648 if s == '':
649 return ''
650
59ae15a5 651 def replace_insane(char):
652 if restricted and char in ACCENT_CHARS:
653 return ACCENT_CHARS[char]
91dd88b9 654 elif not restricted and char == '\n':
5c3895ff 655 return '\0 '
91dd88b9 656 elif char == '?' or ord(char) < 32 or ord(char) == 127:
657 return ''
658 elif char == '"':
659 return '' if restricted else '\''
660 elif char == ':':
5c3895ff 661 return '\0_\0-' if restricted else '\0 \0-'
59ae15a5 662 elif char in '\\/|*<>':
5c3895ff 663 return '\0_'
664 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
665 return '\0_'
666 return char
667
5c3895ff 668 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps
28e614de 669 result = ''.join(map(replace_insane, s))
5c3895ff 670 if is_id is NO_DEFAULT:
671 result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result) # Remove repeated substitute chars
672 STRIP_RE = '(?:\0.|[ _-])*'
673 result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end
674 result = result.replace('\0', '') or '_'
675
676 if not is_id:
677 while '__' in result:
678 result = result.replace('__', '_')
679 result = result.strip('_')
680 # Common case of "Foreign band name - English song title"
681 if restricted and result.startswith('-_'):
682 result = result[2:]
683 if result.startswith('-'):
684 result = '_' + result[len('-'):]
a7440261 685 result = result.lstrip('.')
686 if not result:
687 result = '_'
59ae15a5 688 return result
d77c3dfd 689
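# Illustrative usage (a sketch; exact output follows the substitution rules above and
# may differ across versions):
#   >>> sanitize_filename('A/B\\C', restricted=True)
#   'A_B_C'
#   >>> sanitize_filename('foo: bar')
#   'foo - bar'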
5f6a1245 690
c2934512 691def sanitize_path(s, force=False):
a2aaf4db 692 """Sanitizes and normalizes path on Windows"""
c2934512 693 if sys.platform == 'win32':
c4218ac3 694 force = False
c2934512 695 drive_or_unc, _ = os.path.splitdrive(s)
c2934512 696 elif force:
697 drive_or_unc = ''
698 else:
a2aaf4db 699 return s
c2934512 700
701 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
702 if drive_or_unc:
703 norm_path.pop(0)
704 sanitized_path = [
ec85ded8 705 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
a2aaf4db 706 for path_part in norm_path]
707 if drive_or_unc:
708 sanitized_path.insert(0, drive_or_unc + os.path.sep)
4abea8ca 709 elif force and s and s[0] == os.path.sep:
c4218ac3 710 sanitized_path.insert(0, os.path.sep)
711 return os.path.join(*sanitized_path)
712
713
17bcc626 714def sanitize_url(url):
715 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
716 # the number of unwanted failures due to missing protocol
21633673 717 if url is None:
718 return
719 elif url.startswith('//'):
720 return 'http:%s' % url
721 # Fix some common typos seen so far
722 COMMON_TYPOS = (
067aa17e 723 # https://github.com/ytdl-org/youtube-dl/issues/15649
befa4708
S
724 (r'^httpss://', r'https://'),
725 # https://bx1.be/lives/direct-tv/
726 (r'^rmtp([es]?)://', r'rtmp\1://'),
727 )
728 for mistake, fixup in COMMON_TYPOS:
729 if re.match(mistake, url):
730 return re.sub(mistake, fixup, url)
bc6b9bcd 731 return url
732
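# Illustrative usage (a sketch inferred from the typo table above):
#   >>> sanitize_url('//example.com/watch')    # -> 'http://example.com/watch'
#   >>> sanitize_url('httpss://example.com/')  # -> 'https://example.com/'
#   >>> sanitize_url('rmtp://host/live')       # -> 'rtmp://host/live'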
733
734def extract_basic_auth(url):
735 parts = compat_urlparse.urlsplit(url)
736 if parts.username is None:
737 return url, None
738 url = compat_urlparse.urlunsplit(parts._replace(netloc=(
739 parts.hostname if parts.port is None
740 else '%s:%d' % (parts.hostname, parts.port))))
741 auth_payload = base64.b64encode(
0f06bcd7 742 ('%s:%s' % (parts.username, parts.password or '')).encode())
743 return url, f'Basic {auth_payload.decode()}'
744
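# Illustrative usage (a sketch inferred from the code above): credentials embedded in
# the URL are moved into a Basic auth header, e.g.
#   >>> extract_basic_auth('http://user:pass@example.com/x')
#   ('http://example.com/x', 'Basic dXNlcjpwYXNz')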
745
67dda517 746def sanitized_Request(url, *args, **kwargs):
bc6b9bcd 747 url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
748 if auth_header is not None:
749 headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
750 headers['Authorization'] = auth_header
751 return compat_urllib_request.Request(url, *args, **kwargs)
752
753
754def expand_path(s):
755 """Expand shell variables and ~"""
756 return os.path.expandvars(compat_expanduser(s))
757
758
d77c3dfd 759def orderedSet(iterable):
760 """ Remove all duplicates from the input iterable """
761 res = []
762 for el in iterable:
763 if el not in res:
764 res.append(el)
765 return res
d77c3dfd 766
912b38b4 767
55b2f099 768def _htmlentity_transform(entity_with_semicolon):
4e408e47 769 """Transforms an HTML entity to a character."""
770 entity = entity_with_semicolon[:-1]
771
772 # Known non-numeric HTML entity
773 if entity in compat_html_entities.name2codepoint:
774 return compat_chr(compat_html_entities.name2codepoint[entity])
775
776 # TODO: HTML5 allows entities without a semicolon. For example,
777 # '&Eacuteric' should be decoded as 'Éric'.
778 if entity_with_semicolon in compat_html_entities_html5:
779 return compat_html_entities_html5[entity_with_semicolon]
780
91757b0f 781 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
782 if mobj is not None:
783 numstr = mobj.group(1)
28e614de 784 if numstr.startswith('x'):
4e408e47 785 base = 16
28e614de 786 numstr = '0%s' % numstr
787 else:
788 base = 10
067aa17e 789 # See https://github.com/ytdl-org/youtube-dl/issues/7518
19a03940 790 with contextlib.suppress(ValueError):
7aefc49c 791 return compat_chr(int(numstr, base))
792
793 # Unknown entity in name, return its literal representation
7a3f0c00 794 return '&%s;' % entity
795
796
d77c3dfd 797def unescapeHTML(s):
798 if s is None:
799 return None
19a03940 800 assert isinstance(s, str)
d77c3dfd 801
4e408e47 802 return re.sub(
95f3f7c2 803 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 804
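# Illustrative usage (a sketch inferred from _htmlentity_transform above): named,
# decimal and hex entities are decoded, unknown ones are left as-is, e.g.
#   >>> unescapeHTML('&amp; &#38; &#x26; &bogus;')
#   '& & & &bogus;'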
8bf48f23 805
cdb19aa4 806def escapeHTML(text):
807 return (
808 text
809 .replace('&', '&amp;')
810 .replace('<', '&lt;')
811 .replace('>', '&gt;')
812 .replace('"', '&quot;')
813 .replace("'", '&#39;')
814 )
815
816
f5b1bca9 817def process_communicate_or_kill(p, *args, **kwargs):
818 try:
819 return p.communicate(*args, **kwargs)
820 except BaseException: # Including KeyboardInterrupt
821 p.kill()
822 p.wait()
823 raise
824
825
d3c93ec2 826class Popen(subprocess.Popen):
827 if sys.platform == 'win32':
828 _startupinfo = subprocess.STARTUPINFO()
829 _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
830 else:
831 _startupinfo = None
832
833 def __init__(self, *args, **kwargs):
86e5f3ed 834 super().__init__(*args, **kwargs, startupinfo=self._startupinfo)
d3c93ec2 835
836 def communicate_or_kill(self, *args, **kwargs):
837 return process_communicate_or_kill(self, *args, **kwargs)
838
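# Illustrative usage (a sketch; the command shown is hypothetical): Popen is a drop-in
# subprocess.Popen that suppresses the console window on Windows, e.g.
#   >>> proc = Popen(['ffprobe', '-version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
#   >>> stdout, stderr = proc.communicate_or_kill()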
839
840def get_subprocess_encoding():
841 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
842 # For subprocess calls, encode with locale encoding
843 # Refer to http://stackoverflow.com/a/9951851/35070
844 encoding = preferredencoding()
845 else:
846 encoding = sys.getfilesystemencoding()
847 if encoding is None:
848 encoding = 'utf-8'
849 return encoding
850
851
8bf48f23 852def encodeFilename(s, for_subprocess=False):
19a03940 853 assert isinstance(s, str)
cfb0511d 854 return s
855
856
857def decodeFilename(b, for_subprocess=False):
cfb0511d 858 return b
8bf48f23 859
860
861def encodeArgument(s):
cfb0511d 862 # Legacy code that uses byte strings
863 # Uncomment the following line after fixing all post processors
864 # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
865 return s if isinstance(s, str) else s.decode('ascii')
866
867
aa49acd1 868def decodeArgument(b):
cfb0511d 869 return b
870
871
872def decodeOption(optval):
873 if optval is None:
874 return optval
875 if isinstance(optval, bytes):
876 optval = optval.decode(preferredencoding())
877
878 assert isinstance(optval, compat_str)
879 return optval
1c256f70 880
5f6a1245 881
aa7785f8 882_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
883
884
885def timetuple_from_msec(msec):
886 secs, msec = divmod(msec, 1000)
887 mins, secs = divmod(secs, 60)
888 hrs, mins = divmod(mins, 60)
889 return _timetuple(hrs, mins, secs, msec)
890
891
cdb19aa4 892def formatSeconds(secs, delim=':', msec=False):
aa7785f8 893 time = timetuple_from_msec(secs * 1000)
894 if time.hours:
895 ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
896 elif time.minutes:
897 ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
4539dd30 898 else:
aa7785f8 899 ret = '%d' % time.seconds
900 return '%s.%03d' % (ret, time.milliseconds) if msec else ret
4539dd30 901
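# Illustrative usage (a sketch inferred from the code above):
#   >>> timetuple_from_msec(123456)
#   Time(hours=0, minutes=2, seconds=3, milliseconds=456)
#   >>> formatSeconds(3661)            # -> '1:01:01'
#   >>> formatSeconds(75, msec=True)   # -> '1:15.000'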
a0ddb8a2 902
77562778 903def _ssl_load_windows_store_certs(ssl_context, storename):
904 # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
905 try:
906 certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
907 if encoding == 'x509_asn' and (
908 trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
909 except PermissionError:
910 return
911 for cert in certs:
19a03940 912 with contextlib.suppress(ssl.SSLError):
77562778 913 ssl_context.load_verify_locations(cadata=cert)
a2366922 914
77562778 915
916def make_HTTPS_handler(params, **kwargs):
917 opts_check_certificate = not params.get('nocheckcertificate')
918 context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
919 context.check_hostname = opts_check_certificate
f81c62a6 920 if params.get('legacyserverconnect'):
921 context.options |= 4 # SSL_OP_LEGACY_SERVER_CONNECT
4f28b537 922 # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
923 context.set_ciphers('DEFAULT')
77562778 924 context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
925 if opts_check_certificate:
d5820461 926 if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
927 context.load_verify_locations(cafile=certifi.where())
928 else:
929 try:
930 context.load_default_certs()
931 # Work around the issue in load_default_certs when there are bad certificates. See:
932 # https://github.com/yt-dlp/yt-dlp/issues/1060,
933 # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
934 except ssl.SSLError:
935 # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
936 if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
d5820461 937 for storename in ('CA', 'ROOT'):
938 _ssl_load_windows_store_certs(context, storename)
939 context.set_default_verify_paths()
bb58c9ed 940 client_certfile = params.get('client_certificate')
941 if client_certfile:
942 try:
943 context.load_cert_chain(
944 client_certfile, keyfile=params.get('client_certificate_key'),
945 password=params.get('client_certificate_password'))
946 except ssl.SSLError:
947 raise YoutubeDLError('Unable to load client certificate')
77562778 948 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 949
732ea2f0 950
5873d4cc 951def bug_reports_message(before=';'):
a44ca5a4 952 msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , '
592b7485 953 'filling out the appropriate issue template. '
08d30158 954 'Confirm you are on the latest version using yt-dlp -U')
955
956 before = before.rstrip()
957 if not before or before.endswith(('.', '!', '?')):
958 msg = msg[0].title() + msg[1:]
959
960 return (before + ' ' if before else '') + msg
961
962
963class YoutubeDLError(Exception):
964 """Base exception for YoutubeDL errors."""
aa9369a2 965 msg = None
966
967 def __init__(self, msg=None):
968 if msg is not None:
969 self.msg = msg
970 elif self.msg is None:
971 self.msg = type(self).__name__
972 super().__init__(self.msg)
973
974
3158150c 975network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
976if hasattr(ssl, 'CertificateError'):
977 network_exceptions.append(ssl.CertificateError)
978network_exceptions = tuple(network_exceptions)
979
980
bf5b9d85 981class ExtractorError(YoutubeDLError):
1c256f70 982 """Error during info extraction."""
5f6a1245 983
1151c407 984 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
9a82b238 985 """ tb, if given, is the original traceback (so that it can be printed out).
7a5c1cfe 986 If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
9a82b238 987 """
3158150c 988 if sys.exc_info()[0] in network_exceptions:
9a82b238 989 expected = True
d5979c5d 990
7265a219 991 self.orig_msg = str(msg)
1c256f70 992 self.traceback = tb
1151c407 993 self.expected = expected
2eabb802 994 self.cause = cause
d11271dd 995 self.video_id = video_id
1151c407 996 self.ie = ie
997 self.exc_info = sys.exc_info() # preserve original exception
998
86e5f3ed 999 super().__init__(''.join((
1151c407 1000 format_field(ie, template='[%s] '),
1001 format_field(video_id, template='%s: '),
7265a219 1002 msg,
1151c407 1003 format_field(cause, template=' (caused by %r)'),
1004 '' if expected else bug_reports_message())))
1c256f70 1005
01951dda 1006 def format_traceback(self):
497d2fab 1007 return join_nonempty(
1008 self.traceback and ''.join(traceback.format_tb(self.traceback)),
e491d06d 1009 self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
497d2fab 1010 delim='\n') or None
01951dda 1011
1c256f70 1012
1013class UnsupportedError(ExtractorError):
1014 def __init__(self, url):
86e5f3ed 1015 super().__init__(
1016 'Unsupported URL: %s' % url, expected=True)
1017 self.url = url
1018
1019
1020class RegexNotFoundError(ExtractorError):
1021 """Error when a regex didn't match"""
1022 pass
1023
1024
1025class GeoRestrictedError(ExtractorError):
1026 """Geographic restriction Error exception.
1027
1028 This exception may be thrown when a video is not available from your
1029 geographic location due to geographic restrictions imposed by a website.
1030 """
b6e0c7d2 1031
0db3bae8 1032 def __init__(self, msg, countries=None, **kwargs):
1033 kwargs['expected'] = True
86e5f3ed 1034 super().__init__(msg, **kwargs)
1035 self.countries = countries
1036
1037
bf5b9d85 1038class DownloadError(YoutubeDLError):
59ae15a5 1039 """Download Error exception.
d77c3dfd 1040
1041 This exception may be thrown by FileDownloader objects if they are not
1042 configured to continue on errors. They will contain the appropriate
1043 error message.
1044 """
5f6a1245 1045
1046 def __init__(self, msg, exc_info=None):
1047 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
86e5f3ed 1048 super().__init__(msg)
8cc83b8d 1049 self.exc_info = exc_info
1050
1051
498f5606 1052class EntryNotInPlaylist(YoutubeDLError):
1053 """Entry not in playlist exception.
1054
1055 This exception will be thrown by YoutubeDL when a requested entry
1056 is not found in the playlist info_dict
1057 """
aa9369a2 1058 msg = 'Entry not found in info'
498f5606 1059
1060
bf5b9d85 1061class SameFileError(YoutubeDLError):
59ae15a5 1062 """Same File exception.
d77c3dfd 1063
1064 This exception will be thrown by FileDownloader objects if they detect
1065 multiple files would have to be downloaded to the same file on disk.
1066 """
aa9369a2 1067 msg = 'Fixed output name but more than one file to download'
1068
1069 def __init__(self, filename=None):
1070 if filename is not None:
1071 self.msg += f': {filename}'
1072 super().__init__(self.msg)
1073
1074
bf5b9d85 1075class PostProcessingError(YoutubeDLError):
59ae15a5 1076 """Post Processing exception.
d77c3dfd 1077
1078 This exception may be raised by PostProcessor's .run() method to
1079 indicate an error in the postprocessing task.
1080 """
5f6a1245 1081
5f6a1245 1082
48f79687 1083class DownloadCancelled(YoutubeDLError):
1084 """ Exception raised when the download queue should be interrupted """
1085 msg = 'The download was cancelled'
8b0d7497 1086
8b0d7497 1087
48f79687 1088class ExistingVideoReached(DownloadCancelled):
1089 """ --break-on-existing triggered """
1090 msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1091
48f79687 1092
1093class RejectedVideoReached(DownloadCancelled):
1094 """ --break-on-reject triggered """
1095 msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
51d9739f 1096
1097
48f79687 1098class MaxDownloadsReached(DownloadCancelled):
59ae15a5 1099 """ --max-downloads limit has been reached. """
48f79687 1100 msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1101
1102
f2ebc5c7 1103class ReExtractInfo(YoutubeDLError):
1104 """ Video info needs to be re-extracted. """
1105
1106 def __init__(self, msg, expected=False):
1107 super().__init__(msg)
1108 self.expected = expected
1109
1110
1111class ThrottledDownload(ReExtractInfo):
48f79687 1112 """ Download speed below --throttled-rate. """
aa9369a2 1113 msg = 'The download speed is below throttle limit'
d77c3dfd 1114
43b22906 1115 def __init__(self):
1116 super().__init__(self.msg, expected=False)
f2ebc5c7 1117
d77c3dfd 1118
bf5b9d85 1119class UnavailableVideoError(YoutubeDLError):
59ae15a5 1120 """Unavailable Format exception.
d77c3dfd 1121
1122 This exception will be thrown when a video is requested
1123 in a format that is not available for that video.
1124 """
aa9369a2 1125 msg = 'Unable to download video'
1126
1127 def __init__(self, err=None):
1128 if err is not None:
1129 self.msg += f': {err}'
1130 super().__init__(self.msg)
1131
1132
bf5b9d85 1133class ContentTooShortError(YoutubeDLError):
59ae15a5 1134 """Content Too Short exception.
d77c3dfd 1135
1136 This exception may be raised by FileDownloader objects when a file they
1137 download is too small for what the server announced first, indicating
1138 the connection was probably interrupted.
1139 """
d77c3dfd 1140
59ae15a5 1141 def __init__(self, downloaded, expected):
86e5f3ed 1142 super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
2c7ed247 1143 # Both in bytes
1144 self.downloaded = downloaded
1145 self.expected = expected
d77c3dfd 1146
5f6a1245 1147
bf5b9d85 1148class XAttrMetadataError(YoutubeDLError):
efa97bdc 1149 def __init__(self, code=None, msg='Unknown error'):
86e5f3ed 1150 super().__init__(msg)
efa97bdc 1151 self.code = code
bd264412 1152 self.msg = msg
1153
1154 # Parsing code and msg
3089bc74 1155 if (self.code in (errno.ENOSPC, errno.EDQUOT)
a0566bbf 1156 or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
1157 self.reason = 'NO_SPACE'
1158 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
1159 self.reason = 'VALUE_TOO_LONG'
1160 else:
1161 self.reason = 'NOT_SUPPORTED'
1162
1163
bf5b9d85 1164class XAttrUnavailableError(YoutubeDLError):
1165 pass
1166
1167
c5a59d93 1168def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
f9934b96 1169 hc = http_class(*args, **kwargs)
be4a824d 1170 source_address = ydl_handler._params.get('source_address')
8959018a 1171
be4a824d 1172 if source_address is not None:
1173 # This is to workaround _create_connection() from socket where it will try all
1174 # address data from getaddrinfo() including IPv6. This filters the result from
1175 # getaddrinfo() based on the source_address value.
1176 # This is based on the cpython socket.create_connection() function.
1177 # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
1178 def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
1179 host, port = address
1180 err = None
1181 addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
1182 af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
1183 ip_addrs = [addr for addr in addrs if addr[0] == af]
1184 if addrs and not ip_addrs:
1185 ip_version = 'v4' if af == socket.AF_INET else 'v6'
86e5f3ed 1186 raise OSError(
1187 "No remote IP%s addresses available for connect, can't use '%s' as source address"
1188 % (ip_version, source_address[0]))
1189 for res in ip_addrs:
1190 af, socktype, proto, canonname, sa = res
1191 sock = None
1192 try:
1193 sock = socket.socket(af, socktype, proto)
1194 if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
1195 sock.settimeout(timeout)
1196 sock.bind(source_address)
1197 sock.connect(sa)
1198 err = None # Explicitly break reference cycle
1199 return sock
86e5f3ed 1200 except OSError as _:
1201 err = _
1202 if sock is not None:
1203 sock.close()
1204 if err is not None:
1205 raise err
1206 else:
86e5f3ed 1207 raise OSError('getaddrinfo returns an empty list')
1208 if hasattr(hc, '_create_connection'):
1209 hc._create_connection = _create_connection
cfb0511d 1210 hc.source_address = (source_address, 0)
1211
1212 return hc
1213
1214
87f0e62d 1215def handle_youtubedl_headers(headers):
1216 filtered_headers = headers
1217
1218 if 'Youtubedl-no-compression' in filtered_headers:
86e5f3ed 1219 filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
87f0e62d 1220 del filtered_headers['Youtubedl-no-compression']
87f0e62d 1221
992fc9d6 1222 return filtered_headers
1223
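# Illustrative usage (a sketch inferred from the code above): the internal
# 'Youtubedl-no-compression' marker drops Accept-Encoding and is itself removed, e.g.
#   >>> handle_youtubedl_headers({'Youtubedl-no-compression': '1', 'Accept-Encoding': 'gzip'})
#   {}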
1224
acebc9cd 1225class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
1226 """Handler for HTTP requests and responses.
1227
1228 This class, when installed with an OpenerDirector, automatically adds
1229 the standard headers to every HTTP request and handles gzipped and
1230 deflated responses from web servers. If compression is to be avoided in
1231 a particular request, the original request in the program code only has
0424ec30 1232 to include the HTTP header "Youtubedl-no-compression", which will be
1233 removed before making the real request.
1234
1235 Part of this code was copied from:
1236
1237 http://techknack.net/python-urllib2-handlers/
1238
1239 Andrew Rowls, the author of that code, agreed to release it to the
1240 public domain.
1241 """
1242
1243 def __init__(self, params, *args, **kwargs):
1244 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
1245 self._params = params
1246
1247 def http_open(self, req):
1248 conn_class = compat_http_client.HTTPConnection
1249
1250 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1251 if socks_proxy:
1252 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1253 del req.headers['Ytdl-socks-proxy']
1254
be4a824d 1255 return self.do_open(functools.partial(
71aff188 1256 _create_http_connection, self, conn_class, False),
1257 req)
1258
1259 @staticmethod
1260 def deflate(data):
fc2119f2 1261 if not data:
1262 return data
1263 try:
1264 return zlib.decompress(data, -zlib.MAX_WBITS)
1265 except zlib.error:
1266 return zlib.decompress(data)
1267
4390d5ec 1268 @staticmethod
1269 def brotli(data):
1270 if not data:
1271 return data
9b8ee23b 1272 return brotli.decompress(data)
4390d5ec 1273
acebc9cd 1274 def http_request(self, req):
1275 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
1276 # always respected by websites, some tend to give out URLs with non percent-encoded
1277 # non-ASCII characters (see telemb.py, ard.py [#3412])
1278 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
1279 # To work around aforementioned issue we will replace request's original URL with
1280 # percent-encoded one
1281 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
1282 # the code of this workaround has been moved here from YoutubeDL.urlopen()
1283 url = req.get_full_url()
1284 url_escaped = escape_url(url)
1285
1286 # Substitute URL if any change after escaping
1287 if url != url_escaped:
15d260eb 1288 req = update_Request(req, url=url_escaped)
51f267d9 1289
8b7539d2 1290 for h, v in self._params.get('http_headers', std_headers).items():
1291 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
1292 # The dict keys are capitalized because of this bug by urllib
1293 if h.capitalize() not in req.headers:
33ac271b 1294 req.add_header(h, v)
87f0e62d 1295
af14914b 1296 if 'Accept-encoding' not in req.headers:
1297 req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))
1298
87f0e62d 1299 req.headers = handle_youtubedl_headers(req.headers)
989b4b2b 1300
1301 return req
1302
acebc9cd 1303 def http_response(self, req, resp):
1304 old_resp = resp
1305 # gzip
1306 if resp.headers.get('Content-encoding', '') == 'gzip':
1307 content = resp.read()
1308 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
1309 try:
1310 uncompressed = io.BytesIO(gz.read())
86e5f3ed 1311 except OSError as original_ioerror:
1312 # There may be junk at the end of the file
1313 # See http://stackoverflow.com/q/4928560/35070 for details
1314 for i in range(1, 1024):
1315 try:
1316 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
1317 uncompressed = io.BytesIO(gz.read())
86e5f3ed 1318 except OSError:
1319 continue
1320 break
1321 else:
1322 raise original_ioerror
b407d853 1323 resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 1324 resp.msg = old_resp.msg
c047270c 1325 del resp.headers['Content-encoding']
1326 # deflate
1327 if resp.headers.get('Content-encoding', '') == 'deflate':
1328 gz = io.BytesIO(self.deflate(resp.read()))
b407d853 1329 resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 1330 resp.msg = old_resp.msg
c047270c 1331 del resp.headers['Content-encoding']
4390d5ec 1332 # brotli
1333 if resp.headers.get('Content-encoding', '') == 'br':
1334 resp = compat_urllib_request.addinfourl(
1335 io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
1336 resp.msg = old_resp.msg
1337 del resp.headers['Content-encoding']
ad729172 1338 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
067aa17e 1339 # https://github.com/ytdl-org/youtube-dl/issues/6457).
1340 if 300 <= resp.code < 400:
1341 location = resp.headers.get('Location')
1342 if location:
1343 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
0f06bcd7 1344 location = location.encode('iso-8859-1').decode()
1345 location_escaped = escape_url(location)
1346 if location != location_escaped:
1347 del resp.headers['Location']
1348 resp.headers['Location'] = location_escaped
59ae15a5 1349 return resp
0f8d03f8 1350
1351 https_request = http_request
1352 https_response = http_response
bf50b038 1353
5de90176 1354
1355def make_socks_conn_class(base_class, socks_proxy):
1356 assert issubclass(base_class, (
1357 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
1358
1359 url_components = compat_urlparse.urlparse(socks_proxy)
1360 if url_components.scheme.lower() == 'socks5':
1361 socks_type = ProxyType.SOCKS5
1362 elif url_components.scheme.lower() in ('socks', 'socks4'):
1363 socks_type = ProxyType.SOCKS4
1364 elif url_components.scheme.lower() == 'socks4a':
1365 socks_type = ProxyType.SOCKS4A
71aff188 1366
1367 def unquote_if_non_empty(s):
1368 if not s:
1369 return s
1370 return compat_urllib_parse_unquote_plus(s)
1371
1372 proxy_args = (
1373 socks_type,
1374 url_components.hostname, url_components.port or 1080,
1375 True, # Remote DNS
1376 unquote_if_non_empty(url_components.username),
1377 unquote_if_non_empty(url_components.password),
1378 )
1379
1380 class SocksConnection(base_class):
1381 def connect(self):
1382 self.sock = sockssocket()
1383 self.sock.setproxy(*proxy_args)
19a03940 1384 if isinstance(self.timeout, (int, float)):
1385 self.sock.settimeout(self.timeout)
1386 self.sock.connect((self.host, self.port))
1387
1388 if isinstance(self, compat_http_client.HTTPSConnection):
1389 if hasattr(self, '_context'): # Python > 2.6
1390 self.sock = self._context.wrap_socket(
1391 self.sock, server_hostname=self.host)
1392 else:
1393 self.sock = ssl.wrap_socket(self.sock)
1394
1395 return SocksConnection
1396
1397
1398class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1399 def __init__(self, params, https_conn_class=None, *args, **kwargs):
1400 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1401 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1402 self._params = params
1403
1404 def https_open(self, req):
4f264c02 1405 kwargs = {}
1406 conn_class = self._https_conn_class
1407
1408 if hasattr(self, '_context'): # python > 2.6
1409 kwargs['context'] = self._context
1410 if hasattr(self, '_check_hostname'): # python 3.x
1411 kwargs['check_hostname'] = self._check_hostname
1412
1413 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1414 if socks_proxy:
1415 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1416 del req.headers['Ytdl-socks-proxy']
1417
4f28b537 1418 try:
1419 return self.do_open(
1420 functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
1421 except urllib.error.URLError as e:
1422 if (isinstance(e.reason, ssl.SSLError)
1423 and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
1424 raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
1425 raise
1426
1427
1bab3437 1428class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
1429 """
1430 See [1] for cookie file format.
1431
1432 1. https://curl.haxx.se/docs/http-cookies.html
1433 """
e7e62441 1434 _HTTPONLY_PREFIX = '#HttpOnly_'
1435 _ENTRY_LEN = 7
1436 _HEADER = '''# Netscape HTTP Cookie File
7a5c1cfe 1437# This file is generated by yt-dlp. Do not edit.
1438
1439'''
1440 _CookieFileEntry = collections.namedtuple(
1441 'CookieFileEntry',
1442 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
e7e62441 1443
d76fa1f3 1444 def __init__(self, filename=None, *args, **kwargs):
1445 super().__init__(None, *args, **kwargs)
1446 if self.is_path(filename):
1447 filename = os.fspath(filename)
1448 self.filename = filename
1449
24146491 1450 @staticmethod
1451 def _true_or_false(cndn):
1452 return 'TRUE' if cndn else 'FALSE'
1453
d76fa1f3 1454 @staticmethod
1455 def is_path(file):
1456 return isinstance(file, (str, bytes, os.PathLike))
1457
1458 @contextlib.contextmanager
1459 def open(self, file, *, write=False):
1460 if self.is_path(file):
1461 with open(file, 'w' if write else 'r', encoding='utf-8') as f:
1462 yield f
1463 else:
1464 if write:
1465 file.truncate(0)
1466 yield file
1467
24146491 1468 def _really_save(self, f, ignore_discard=False, ignore_expires=False):
1469 now = time.time()
1470 for cookie in self:
1471 if (not ignore_discard and cookie.discard
1472 or not ignore_expires and cookie.is_expired(now)):
1473 continue
1474 name, value = cookie.name, cookie.value
1475 if value is None:
1476 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1477 # with no name, whereas http.cookiejar regards it as a
1478 # cookie with no value.
1479 name, value = '', name
1480 f.write('%s\n' % '\t'.join((
1481 cookie.domain,
1482 self._true_or_false(cookie.domain.startswith('.')),
1483 cookie.path,
1484 self._true_or_false(cookie.secure),
1485 str_or_none(cookie.expires, default=''),
1486 name, value
1487 )))
1488
1489 def save(self, filename=None, *args, **kwargs):
1490 """
1491 Save cookies to a file.
24146491 1492 Code is taken from CPython 3.6
1493 https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
c380cc28 1494
1495 if filename is None:
1496 if self.filename is not None:
1497 filename = self.filename
1498 else:
1499 raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
1500
24146491 1501 # Store session cookies with `expires` set to 0 instead of an empty string
1502 for cookie in self:
1503 if cookie.expires is None:
1504 cookie.expires = 0
c380cc28 1505
d76fa1f3 1506 with self.open(filename, write=True) as f:
c380cc28 1507 f.write(self._HEADER)
24146491 1508 self._really_save(f, *args, **kwargs)
1509
1510 def load(self, filename=None, ignore_discard=False, ignore_expires=False):
e7e62441 1511 """Load cookies from a file."""
1512 if filename is None:
1513 if self.filename is not None:
1514 filename = self.filename
1515 else:
1516 raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
1517
1518 def prepare_line(line):
1519 if line.startswith(self._HTTPONLY_PREFIX):
1520 line = line[len(self._HTTPONLY_PREFIX):]
1521 # comments and empty lines are fine
1522 if line.startswith('#') or not line.strip():
1523 return line
1524 cookie_list = line.split('\t')
1525 if len(cookie_list) != self._ENTRY_LEN:
1526 raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
1527 cookie = self._CookieFileEntry(*cookie_list)
1528 if cookie.expires_at and not cookie.expires_at.isdigit():
1529 raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
1530 return line
1531
e7e62441 1532 cf = io.StringIO()
d76fa1f3 1533 with self.open(filename) as f:
e7e62441 1534 for line in f:
1535 try:
1536 cf.write(prepare_line(line))
1537 except compat_cookiejar.LoadError as e:
1538 if f'{line.strip()} '[0] in '[{"':
1539 raise compat_cookiejar.LoadError(
1540 'Cookies file must be Netscape formatted, not JSON. See '
1541 'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
19a03940 1542 write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
c380cc28 1543 continue
e7e62441 1544 cf.seek(0)
1545 self._really_load(cf, filename, ignore_discard, ignore_expires)
1546 # Session cookies are denoted by either `expires` field set to
1547 # an empty string or 0. MozillaCookieJar only recognizes the former
1548 # (see [1]). So we need to force the latter to be recognized as session
1549 # cookies on our own.
1550 # Session cookies may be important for cookies-based authentication,
1551 # e.g. usually, when user does not check 'Remember me' check box while
1552 # logging in on a site, some important cookies are stored as session
1553 # cookies so that not recognizing them will result in failed login.
1554 # 1. https://bugs.python.org/issue17164
1555 for cookie in self:
1556 # Treat `expires=0` cookies as session cookies
1557 if cookie.expires == 0:
1558 cookie.expires = None
1559 cookie.discard = True
1560
1561
1562class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1563 def __init__(self, cookiejar=None):
1564 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1565
1566 def http_response(self, request, response):
a6420bf5
S
1567 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1568
f5fa042c 1569 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
a6420bf5
S
1570 https_response = http_response
1571
1572
fca6dba8 1573class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
201c1459 1574 """YoutubeDL redirect handler
1575
1576 The code is based on HTTPRedirectHandler implementation from CPython [1].
1577
1578 This redirect handler solves two issues:
1579 - ensures redirect URL is always unicode under python 2
1580 - introduces support for experimental HTTP response status code
1581 308 Permanent Redirect [2] used by some sites [3]
1582
1583 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1584 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1585 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1586 """
1587
1588 http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302
1589
1590 def redirect_request(self, req, fp, code, msg, headers, newurl):
1591 """Return a Request or None in response to a redirect.
1592
1593 This is called by the http_error_30x methods when a
1594 redirection response is received. If a redirection should
1595 take place, return a new Request to allow http_error_30x to
1596 perform the redirect. Otherwise, raise HTTPError if no-one
1597 else should try to handle this url. Return None if you can't
1598 but another Handler might.
1599 """
1600 m = req.get_method()
1601 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1602 or code in (301, 302, 303) and m == "POST")):
1603 raise compat_HTTPError(req.full_url, code, msg, headers, fp)
1604 # Strictly (according to RFC 2616), 301 or 302 in response to
1605 # a POST MUST NOT cause a redirection without confirmation
1606 # from the user (of urllib.request, in this case). In practice,
1607 # essentially all clients do redirect in this case, so we do
1608 # the same.
1609
201c1459 1610 # Be conciliant with URIs containing a space. This is mainly
1611 # redundant with the more complete encoding done in http_error_302(),
1612 # but it is kept for compatibility with other callers.
1613 newurl = newurl.replace(' ', '%20')
1614
1615 CONTENT_HEADERS = ("content-length", "content-type")
1616 # Strip Content-Length/Content-Type so they are not carried over to the new request
86e5f3ed 1617 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
afac4caa 1618
1619 # A 303 must either use GET or HEAD for subsequent request
1620 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1621 if code == 303 and m != 'HEAD':
1622 m = 'GET'
1623 # 301 and 302 redirects are commonly turned into a GET from a POST
1624 # for subsequent requests by browsers, so we'll do the same.
1625 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1626 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1627 if code in (301, 302) and m == 'POST':
1628 m = 'GET'
1629
201c1459 1630 return compat_urllib_request.Request(
1631 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
afac4caa 1632 unverifiable=True, method=m)
fca6dba8
S
1633
1634
46f59e89
S
1635def extract_timezone(date_str):
1636 m = re.search(
f137e4c2 1637 r'''(?x)
1638 ^.{8,}? # >=8 char non-TZ prefix, if present
1639 (?P<tz>Z| # just the UTC Z, or
1640 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1641 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1642 [ ]? # optional space
1643 (?P<sign>\+|-) # +/-
1644 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1645 $)
1646 ''', date_str)
46f59e89
S
1647 if not m:
1648 timezone = datetime.timedelta()
1649 else:
1650 date_str = date_str[:-len(m.group('tz'))]
1651 if not m.group('sign'):
1652 timezone = datetime.timedelta()
1653 else:
1654 sign = 1 if m.group('sign') == '+' else -1
1655 timezone = datetime.timedelta(
1656 hours=sign * int(m.group('hours')),
1657 minutes=sign * int(m.group('minutes')))
1658 return timezone, date_str
1659
1660
08b38d54 1661def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1662 """ Return a UNIX timestamp from the given date """
1663
1664 if date_str is None:
1665 return None
1666
52c3a6e4
S
1667 date_str = re.sub(r'\.[0-9]+', '', date_str)
1668
08b38d54 1669 if timezone is None:
46f59e89
S
1670 timezone, date_str = extract_timezone(date_str)
1671
19a03940 1672 with contextlib.suppress(ValueError):
86e5f3ed 1673 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
52c3a6e4
S
1674 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1675 return calendar.timegm(dt.timetuple())
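# Illustrative usage of parse_iso8601 (not part of the original source; values follow from the logic above):
#   parse_iso8601('2014-01-01T00:00:00+00:00')            -> 1388534400
#   parse_iso8601('2014-01-01T05:30:00+05:30')            -> 1388534400   # the UTC offset is subtracted
#   parse_iso8601('2014-01-01 00:00:00Z', delimiter=' ')  -> 1388534400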
912b38b4
PH
1676
1677
46f59e89
S
1678def date_formats(day_first=True):
1679 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1680
1681
42bdd9d0 1682def unified_strdate(date_str, day_first=True):
bf50b038 1683 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1684
1685 if date_str is None:
1686 return None
bf50b038 1687 upload_date = None
5f6a1245 1688 # Replace commas
026fcc04 1689 date_str = date_str.replace(',', ' ')
42bdd9d0 1690 # Remove AM/PM + timezone
9bb8e0a3 1691 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1692 _, date_str = extract_timezone(date_str)
42bdd9d0 1693
46f59e89 1694 for expression in date_formats(day_first):
19a03940 1695 with contextlib.suppress(ValueError):
bf50b038 1696 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
42393ce2
PH
1697 if upload_date is None:
1698 timetuple = email.utils.parsedate_tz(date_str)
1699 if timetuple:
19a03940 1700 with contextlib.suppress(ValueError):
c6b9cf05 1701 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
6a750402
JMF
1702 if upload_date is not None:
1703 return compat_str(upload_date)
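# Illustrative usage of unified_strdate (not part of the original source; assumes the
# DATE_FORMATS tables defined earlier in this module include the common '%Y-%m-%d' pattern):
#   unified_strdate('2014-11-26')                        -> '20141126'
#   unified_strdate('Wed, 26 Nov 2014 00:00:00 +0000')   -> '20141126'   # via the email.utils fallback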
bf50b038 1704
5f6a1245 1705
46f59e89
S
1706def unified_timestamp(date_str, day_first=True):
1707 if date_str is None:
1708 return None
1709
2ae2ffda 1710 date_str = re.sub(r'[,|]', '', date_str)
46f59e89 1711
7dc2a74e 1712 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1713 timezone, date_str = extract_timezone(date_str)
1714
1715 # Remove AM/PM + timezone
1716 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1717
deef3195
S
1718 # Remove unrecognized timezones from ISO 8601 alike timestamps
1719 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1720 if m:
1721 date_str = date_str[:-len(m.group('tz'))]
1722
f226880c
PH
1723 # Python only supports microseconds, so remove nanoseconds
1724 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1725 if m:
1726 date_str = m.group(1)
1727
46f59e89 1728 for expression in date_formats(day_first):
19a03940 1729 with contextlib.suppress(ValueError):
7dc2a74e 1730 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89 1731 return calendar.timegm(dt.timetuple())
46f59e89
S
1732 timetuple = email.utils.parsedate_tz(date_str)
1733 if timetuple:
7dc2a74e 1734 return calendar.timegm(timetuple) + pm_delta * 3600
46f59e89
S
1735
1736
28e614de 1737def determine_ext(url, default_ext='unknown_video'):
85750f89 1738 if url is None or '.' not in url:
f4776371 1739 return default_ext
9cb9a5df 1740 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1741 if re.match(r'^[A-Za-z0-9]+$', guess):
1742 return guess
a7aaa398
S
1743 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1744 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1745 return guess.rstrip('/')
73e79f2a 1746 else:
cbdbb766 1747 return default_ext
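# Illustrative usage of determine_ext (not part of the original source; the second example
# assumes 'm3u8' is listed in KNOWN_EXTENSIONS, which is defined elsewhere in this module):
#   determine_ext('http://example.com/video.mp4?download=1')  -> 'mp4'
#   determine_ext('http://example.com/foo/bar.m3u8/?x')       -> 'm3u8'
#   determine_ext('http://example.com/page')                  -> 'unknown_video'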
73e79f2a 1748
5f6a1245 1749
824fa511
S
1750def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1751 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
d4051a8e 1752
5f6a1245 1753
9e62f283 1754def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
3d38b2d6 1755 R"""
1756 Return a datetime object from a string.
1757 Supported format:
1758 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1759
1760 @param format strftime format of DATE
1761 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1762 auto: round to the unit provided in date_str (if applicable).
9e62f283 1763 """
1764 auto_precision = False
1765 if precision == 'auto':
1766 auto_precision = True
1767 precision = 'microsecond'
396a76f7 1768 today = datetime_round(datetime.datetime.utcnow(), precision)
f8795e10 1769 if date_str in ('now', 'today'):
37254abc 1770 return today
f8795e10
PH
1771 if date_str == 'yesterday':
1772 return today - datetime.timedelta(days=1)
9e62f283 1773 match = re.match(
3d38b2d6 1774 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
9e62f283 1775 date_str)
37254abc 1776 if match is not None:
9e62f283 1777 start_time = datetime_from_str(match.group('start'), precision, format)
1778 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
37254abc 1779 unit = match.group('unit')
9e62f283 1780 if unit == 'month' or unit == 'year':
1781 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
37254abc 1782 unit = 'day'
9e62f283 1783 else:
1784 if unit == 'week':
1785 unit = 'day'
1786 time *= 7
1787 delta = datetime.timedelta(**{unit + 's': time})
1788 new_date = start_time + delta
1789 if auto_precision:
1790 return datetime_round(new_date, unit)
1791 return new_date
1792
1793 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
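# Illustrative usage of datetime_from_str (not part of the original source):
#   datetime_from_str('now-1day')      -> one day before now, rounded to the nearest day (auto precision)
#   datetime_from_str('today+2weeks')  -> 14 days from now, rounded to the nearest day
#   datetime_from_str('20200229', precision='day')  -> datetime.datetime(2020, 2, 29, 0, 0)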
1794
1795
d49f8db3 1796def date_from_str(date_str, format='%Y%m%d', strict=False):
3d38b2d6 1797 R"""
1798 Return a date object from a string using datetime_from_str
9e62f283 1799
3d38b2d6 1800 @param strict Restrict allowed patterns to "YYYYMMDD" and
1801 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
9e62f283 1802 """
3d38b2d6 1803 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1804 raise ValueError(f'Invalid date format "{date_str}"')
9e62f283 1805 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1806
1807
1808def datetime_add_months(dt, months):
1809 """Increment/Decrement a datetime object by months."""
1810 month = dt.month + months - 1
1811 year = dt.year + month // 12
1812 month = month % 12 + 1
1813 day = min(dt.day, calendar.monthrange(year, month)[1])
1814 return dt.replace(year, month, day)
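# Illustrative usage of datetime_add_months (not part of the original source); the day
# is clamped to the last valid day of the target month:
#   datetime_add_months(datetime.datetime(2020, 1, 31), 1)   -> datetime.datetime(2020, 2, 29, 0, 0)
#   datetime_add_months(datetime.datetime(2020, 3, 15), -3)  -> datetime.datetime(2019, 12, 15, 0, 0)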
1815
1816
1817def datetime_round(dt, precision='day'):
1818 """
1819 Round a datetime object's time to a specific precision
1820 """
1821 if precision == 'microsecond':
1822 return dt
1823
1824 unit_seconds = {
1825 'day': 86400,
1826 'hour': 3600,
1827 'minute': 60,
1828 'second': 1,
1829 }
1830 roundto = lambda x, n: ((x + n / 2) // n) * n
1831 timestamp = calendar.timegm(dt.timetuple())
1832 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
5f6a1245
JW
1833
1834
e63fc1be 1835def hyphenate_date(date_str):
1836 """
1837 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1838 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1839 if match is not None:
1840 return '-'.join(match.groups())
1841 else:
1842 return date_str
1843
5f6a1245 1844
86e5f3ed 1845class DateRange:
bd558525 1846 """Represents a time interval between two dates"""
5f6a1245 1847
bd558525
JMF
1848 def __init__(self, start=None, end=None):
1849 """start and end must be strings in the format accepted by date"""
1850 if start is not None:
d49f8db3 1851 self.start = date_from_str(start, strict=True)
bd558525
JMF
1852 else:
1853 self.start = datetime.datetime.min.date()
1854 if end is not None:
d49f8db3 1855 self.end = date_from_str(end, strict=True)
bd558525
JMF
1856 else:
1857 self.end = datetime.datetime.max.date()
37254abc 1858 if self.start > self.end:
bd558525 1859 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
5f6a1245 1860
bd558525
JMF
1861 @classmethod
1862 def day(cls, day):
1863 """Returns a range that only contains the given day"""
5f6a1245
JW
1864 return cls(day, day)
1865
bd558525
JMF
1866 def __contains__(self, date):
1867 """Check if the date is in the range"""
37254abc
JMF
1868 if not isinstance(date, datetime.date):
1869 date = date_from_str(date)
1870 return self.start <= date <= self.end
5f6a1245 1871
bd558525 1872 def __str__(self):
86e5f3ed 1873 return f'{self.start.isoformat()} - {self.end.isoformat()}'
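# Illustrative usage of DateRange (not part of the original source):
#   '20220315' in DateRange('20220101', '20220630')  -> True
#   '20230101' in DateRange('20220101', '20220630')  -> False
#   DateRange.day('20220101')                         -> a range containing only 2022-01-01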
c496ca96
PH
1874
1875
1876def platform_name():
1877 """ Returns the platform name as a compat_str """
1878 res = platform.platform()
1879 if isinstance(res, bytes):
1880 res = res.decode(preferredencoding())
1881
1882 assert isinstance(res, compat_str)
1883 return res
c257baff
PH
1884
1885
49fa4d9a
N
1886def get_windows_version():
1887 ''' Get Windows version. None if it's not running on Windows '''
1888 if compat_os_name == 'nt':
1889 return version_tuple(platform.win32_ver()[1])
1890 else:
1891 return None
1892
1893
734f90bb 1894def write_string(s, out=None, encoding=None):
19a03940 1895 assert isinstance(s, str)
1896 out = out or sys.stderr
7459e3a2 1897
fe1daad3 1898 if compat_os_name == 'nt' and supports_terminal_sequences(out):
3fe75fdc 1899 s = re.sub(r'([\r\n]+)', r' \1', s)
59f943cd 1900
cfb0511d 1901 if 'b' in getattr(out, 'mode', ''):
104aa738
PH
1902 byt = s.encode(encoding or preferredencoding(), 'ignore')
1903 out.write(byt)
1904 elif hasattr(out, 'buffer'):
1905 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1906 byt = s.encode(enc, 'ignore')
1907 out.buffer.write(byt)
1908 else:
8bf48f23 1909 out.write(s)
7459e3a2
PH
1910 out.flush()
1911
1912
48ea9cea
PH
1913def bytes_to_intlist(bs):
1914 if not bs:
1915 return []
1916 if isinstance(bs[0], int): # Python 3
1917 return list(bs)
1918 else:
1919 return [ord(c) for c in bs]
1920
c257baff 1921
cba892fa 1922def intlist_to_bytes(xs):
1923 if not xs:
1924 return b''
edaa23f8 1925 return compat_struct_pack('%dB' % len(xs), *xs)
c38b1e77
PH
1926
1927
0edb3e33 1928class LockingUnsupportedError(IOError):
1929 msg = 'File locking is not supported on this platform'
1930
1931 def __init__(self):
1932 super().__init__(self.msg)
1933
1934
c1c9a79c
PH
1935# Cross-platform file locking
1936if sys.platform == 'win32':
1937 import ctypes.wintypes
1938 import msvcrt
1939
1940 class OVERLAPPED(ctypes.Structure):
1941 _fields_ = [
1942 ('Internal', ctypes.wintypes.LPVOID),
1943 ('InternalHigh', ctypes.wintypes.LPVOID),
1944 ('Offset', ctypes.wintypes.DWORD),
1945 ('OffsetHigh', ctypes.wintypes.DWORD),
1946 ('hEvent', ctypes.wintypes.HANDLE),
1947 ]
1948
1949 kernel32 = ctypes.windll.kernel32
1950 LockFileEx = kernel32.LockFileEx
1951 LockFileEx.argtypes = [
1952 ctypes.wintypes.HANDLE, # hFile
1953 ctypes.wintypes.DWORD, # dwFlags
1954 ctypes.wintypes.DWORD, # dwReserved
1955 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1956 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1957 ctypes.POINTER(OVERLAPPED) # Overlapped
1958 ]
1959 LockFileEx.restype = ctypes.wintypes.BOOL
1960 UnlockFileEx = kernel32.UnlockFileEx
1961 UnlockFileEx.argtypes = [
1962 ctypes.wintypes.HANDLE, # hFile
1963 ctypes.wintypes.DWORD, # dwReserved
1964 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1965 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1966 ctypes.POINTER(OVERLAPPED) # Overlapped
1967 ]
1968 UnlockFileEx.restype = ctypes.wintypes.BOOL
1969 whole_low = 0xffffffff
1970 whole_high = 0x7fffffff
1971
747c0bd1 1972 def _lock_file(f, exclusive, block):
c1c9a79c
PH
1973 overlapped = OVERLAPPED()
1974 overlapped.Offset = 0
1975 overlapped.OffsetHigh = 0
1976 overlapped.hEvent = 0
1977 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
747c0bd1 1978
1979 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
1980 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
1981 0, whole_low, whole_high, f._lock_file_overlapped_p):
1982 raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())
c1c9a79c
PH
1983
1984 def _unlock_file(f):
1985 assert f._lock_file_overlapped_p
1986 handle = msvcrt.get_osfhandle(f.fileno())
747c0bd1 1987 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
c1c9a79c
PH
1988 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1989
1990else:
399a76e6
YCH
1991 try:
1992 import fcntl
c1c9a79c 1993
a3125791 1994 def _lock_file(f, exclusive, block):
b63837bc 1995 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
1996 if not block:
1997 flags |= fcntl.LOCK_NB
acea8d7c 1998 try:
b63837bc 1999 fcntl.flock(f, flags)
acea8d7c
JK
2000 except BlockingIOError:
2001 raise
2002 except OSError: # AOSP does not have flock()
b63837bc 2003 fcntl.lockf(f, flags)
c1c9a79c 2004
399a76e6 2005 def _unlock_file(f):
acea8d7c
JK
2006 try:
2007 fcntl.flock(f, fcntl.LOCK_UN)
2008 except OSError:
2009 fcntl.lockf(f, fcntl.LOCK_UN)
a3125791 2010
399a76e6 2011 except ImportError:
399a76e6 2012
a3125791 2013 def _lock_file(f, exclusive, block):
0edb3e33 2014 raise LockingUnsupportedError()
399a76e6
YCH
2015
2016 def _unlock_file(f):
0edb3e33 2017 raise LockingUnsupportedError()
c1c9a79c
PH
2018
2019
86e5f3ed 2020class locked_file:
0edb3e33 2021 locked = False
747c0bd1 2022
a3125791 2023 def __init__(self, filename, mode, block=True, encoding=None):
fcfa8853
JK
2024 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2025 raise NotImplementedError(mode)
2026 self.mode, self.block = mode, block
2027
2028 writable = any(f in mode for f in 'wax+')
2029 readable = any(f in mode for f in 'r+')
2030 flags = functools.reduce(operator.ior, (
2031 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2032 getattr(os, 'O_BINARY', 0), # Windows only
2033 getattr(os, 'O_NOINHERIT', 0), # Windows only
2034 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2035 os.O_APPEND if 'a' in mode else 0,
2036 os.O_EXCL if 'x' in mode else 0,
2037 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2038 ))
2039
98804d03 2040 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
c1c9a79c
PH
2041
2042 def __enter__(self):
a3125791 2043 exclusive = 'r' not in self.mode
c1c9a79c 2044 try:
a3125791 2045 _lock_file(self.f, exclusive, self.block)
0edb3e33 2046 self.locked = True
86e5f3ed 2047 except OSError:
c1c9a79c
PH
2048 self.f.close()
2049 raise
fcfa8853 2050 if 'w' in self.mode:
131e14dc
JK
2051 try:
2052 self.f.truncate()
2053 except OSError as e:
2054 if e.errno != 29: # Illegal seek, expected when self.f is a FIFO
2055 raise e
c1c9a79c
PH
2056 return self
2057
0edb3e33 2058 def unlock(self):
2059 if not self.locked:
2060 return
c1c9a79c 2061 try:
0edb3e33 2062 _unlock_file(self.f)
c1c9a79c 2063 finally:
0edb3e33 2064 self.locked = False
c1c9a79c 2065
0edb3e33 2066 def __exit__(self, *_):
2067 try:
2068 self.unlock()
2069 finally:
2070 self.f.close()
4eb7f1d1 2071
0edb3e33 2072 open = __enter__
2073 close = __exit__
a3125791 2074
0edb3e33 2075 def __getattr__(self, attr):
2076 return getattr(self.f, attr)
a3125791 2077
0edb3e33 2078 def __iter__(self):
2079 return iter(self.f)
a3125791 2080
4eb7f1d1 2081
4644ac55
S
2082def get_filesystem_encoding():
2083 encoding = sys.getfilesystemencoding()
2084 return encoding if encoding is not None else 'utf-8'
2085
2086
4eb7f1d1 2087def shell_quote(args):
a6a173c2 2088 quoted_args = []
4644ac55 2089 encoding = get_filesystem_encoding()
a6a173c2
JMF
2090 for a in args:
2091 if isinstance(a, bytes):
2092 # We may get a filename encoded with 'encodeFilename'
2093 a = a.decode(encoding)
aefce8e6 2094 quoted_args.append(compat_shlex_quote(a))
28e614de 2095 return ' '.join(quoted_args)
9d4660ca
PH
2096
2097
2098def smuggle_url(url, data):
2099 """ Pass additional data in a URL for internal use. """
2100
81953d1a
RA
2101 url, idata = unsmuggle_url(url, {})
2102 data.update(idata)
15707c7e 2103 sdata = compat_urllib_parse_urlencode(
28e614de
PH
2104 {'__youtubedl_smuggle': json.dumps(data)})
2105 return url + '#' + sdata
9d4660ca
PH
2106
2107
79f82953 2108def unsmuggle_url(smug_url, default=None):
83e865a3 2109 if '#__youtubedl_smuggle' not in smug_url:
79f82953 2110 return smug_url, default
28e614de
PH
2111 url, _, sdata = smug_url.rpartition('#')
2112 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
2113 data = json.loads(jsond)
2114 return url, data
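# Illustrative usage of smuggle_url/unsmuggle_url (not part of the original source):
#   url = smuggle_url('https://example.com/video', {'referer': 'https://example.com/'})
#   unsmuggle_url(url)                              -> ('https://example.com/video', {'referer': 'https://example.com/'})
#   unsmuggle_url('https://example.com/plain', {})  -> ('https://example.com/plain', {})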
02dbf93f
PH
2115
2116
e0fd9573 2117def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2118 """ Formats numbers with decimal sufixes like K, M, etc """
2119 num, factor = float_or_none(num), float(factor)
4c3f8c3f 2120 if num is None or num < 0:
e0fd9573 2121 return None
eeb2a770 2122 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2123 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2124 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
abbeeebc 2125 if factor == 1024:
2126 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
e0fd9573 2127 converted = num / (factor ** exponent)
abbeeebc 2128 return fmt % (converted, suffix)
e0fd9573 2129
2130
02dbf93f 2131def format_bytes(bytes):
f02d24d8 2132 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
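# Illustrative usage of format_decimal_suffix/format_bytes (not part of the original source):
#   format_decimal_suffix(1200000)                        -> '1M'      (default '%d%s', factor 1000)
#   format_decimal_suffix(2048, '%.1f%sB', factor=1024)   -> '2.0KiB'
#   format_bytes(1536)                                    -> '1.50KiB'
#   format_bytes(None)                                    -> 'N/A'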
f53c966a 2133
1c088fa8 2134
fb47597b
S
2135def lookup_unit_table(unit_table, s):
2136 units_re = '|'.join(re.escape(u) for u in unit_table)
2137 m = re.match(
782b1b5b 2138 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
fb47597b
S
2139 if not m:
2140 return None
2141 num_str = m.group('num').replace(',', '.')
2142 mult = unit_table[m.group('unit')]
2143 return int(float(num_str) * mult)
2144
2145
be64b5b0
PH
2146def parse_filesize(s):
2147 if s is None:
2148 return None
2149
dfb1b146 2150 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
2151 # but we support those too
2152 _UNIT_TABLE = {
2153 'B': 1,
2154 'b': 1,
70852b47 2155 'bytes': 1,
be64b5b0
PH
2156 'KiB': 1024,
2157 'KB': 1000,
2158 'kB': 1024,
2159 'Kb': 1000,
13585d76 2160 'kb': 1000,
70852b47
YCH
2161 'kilobytes': 1000,
2162 'kibibytes': 1024,
be64b5b0
PH
2163 'MiB': 1024 ** 2,
2164 'MB': 1000 ** 2,
2165 'mB': 1024 ** 2,
2166 'Mb': 1000 ** 2,
13585d76 2167 'mb': 1000 ** 2,
70852b47
YCH
2168 'megabytes': 1000 ** 2,
2169 'mebibytes': 1024 ** 2,
be64b5b0
PH
2170 'GiB': 1024 ** 3,
2171 'GB': 1000 ** 3,
2172 'gB': 1024 ** 3,
2173 'Gb': 1000 ** 3,
13585d76 2174 'gb': 1000 ** 3,
70852b47
YCH
2175 'gigabytes': 1000 ** 3,
2176 'gibibytes': 1024 ** 3,
be64b5b0
PH
2177 'TiB': 1024 ** 4,
2178 'TB': 1000 ** 4,
2179 'tB': 1024 ** 4,
2180 'Tb': 1000 ** 4,
13585d76 2181 'tb': 1000 ** 4,
70852b47
YCH
2182 'terabytes': 1000 ** 4,
2183 'tebibytes': 1024 ** 4,
be64b5b0
PH
2184 'PiB': 1024 ** 5,
2185 'PB': 1000 ** 5,
2186 'pB': 1024 ** 5,
2187 'Pb': 1000 ** 5,
13585d76 2188 'pb': 1000 ** 5,
70852b47
YCH
2189 'petabytes': 1000 ** 5,
2190 'pebibytes': 1024 ** 5,
be64b5b0
PH
2191 'EiB': 1024 ** 6,
2192 'EB': 1000 ** 6,
2193 'eB': 1024 ** 6,
2194 'Eb': 1000 ** 6,
13585d76 2195 'eb': 1000 ** 6,
70852b47
YCH
2196 'exabytes': 1000 ** 6,
2197 'exbibytes': 1024 ** 6,
be64b5b0
PH
2198 'ZiB': 1024 ** 7,
2199 'ZB': 1000 ** 7,
2200 'zB': 1024 ** 7,
2201 'Zb': 1000 ** 7,
13585d76 2202 'zb': 1000 ** 7,
70852b47
YCH
2203 'zettabytes': 1000 ** 7,
2204 'zebibytes': 1024 ** 7,
be64b5b0
PH
2205 'YiB': 1024 ** 8,
2206 'YB': 1000 ** 8,
2207 'yB': 1024 ** 8,
2208 'Yb': 1000 ** 8,
13585d76 2209 'yb': 1000 ** 8,
70852b47
YCH
2210 'yottabytes': 1000 ** 8,
2211 'yobibytes': 1024 ** 8,
be64b5b0
PH
2212 }
2213
fb47597b
S
2214 return lookup_unit_table(_UNIT_TABLE, s)
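# Illustrative usage of parse_filesize (not part of the original source):
#   parse_filesize('1.5GiB')   -> 1610612736
#   parse_filesize('1.5GB')    -> 1500000000
#   parse_filesize('500 kB')   -> 512000   # lower-case 'kB' is deliberately mapped to 1024 above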
2215
2216
2217def parse_count(s):
2218 if s is None:
be64b5b0
PH
2219 return None
2220
352d5da8 2221 s = re.sub(r'^[^\d]+\s', '', s).strip()
fb47597b
S
2222
2223 if re.match(r'^[\d,.]+$', s):
2224 return str_to_int(s)
2225
2226 _UNIT_TABLE = {
2227 'k': 1000,
2228 'K': 1000,
2229 'm': 1000 ** 2,
2230 'M': 1000 ** 2,
2231 'kk': 1000 ** 2,
2232 'KK': 1000 ** 2,
352d5da8 2233 'b': 1000 ** 3,
2234 'B': 1000 ** 3,
fb47597b 2235 }
be64b5b0 2236
352d5da8 2237 ret = lookup_unit_table(_UNIT_TABLE, s)
2238 if ret is not None:
2239 return ret
2240
2241 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2242 if mobj:
2243 return str_to_int(mobj.group(1))
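# Illustrative usage of parse_count (not part of the original source):
#   parse_count('1,000 views')  -> 1000
#   parse_count('1.2M')         -> 1200000
#   parse_count('3.5b')         -> 3500000000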
be64b5b0 2244
2f7ae819 2245
5d45484c 2246def parse_resolution(s, *, lenient=False):
b871d7e9
S
2247 if s is None:
2248 return {}
2249
5d45484c
LNO
2250 if lenient:
2251 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2252 else:
2253 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
b871d7e9
S
2254 if mobj:
2255 return {
2256 'width': int(mobj.group('w')),
2257 'height': int(mobj.group('h')),
2258 }
2259
17ec8bcf 2260 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
b871d7e9
S
2261 if mobj:
2262 return {'height': int(mobj.group(1))}
2263
2264 mobj = re.search(r'\b([48])[kK]\b', s)
2265 if mobj:
2266 return {'height': int(mobj.group(1)) * 540}
2267
2268 return {}
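# Illustrative usage of parse_resolution (not part of the original source):
#   parse_resolution('1920x1080')  -> {'width': 1920, 'height': 1080}
#   parse_resolution('720p')       -> {'height': 720}
#   parse_resolution('4k')         -> {'height': 2160}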
2269
2270
0dc41787
S
2271def parse_bitrate(s):
2272 if not isinstance(s, compat_str):
2273 return
2274 mobj = re.search(r'\b(\d+)\s*kbps', s)
2275 if mobj:
2276 return int(mobj.group(1))
2277
2278
a942d6cb 2279def month_by_name(name, lang='en'):
caefb1de
PH
2280 """ Return the number of a month by (locale-independently) English name """
2281
f6717dec 2282 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 2283
caefb1de 2284 try:
f6717dec 2285 return month_names.index(name) + 1
7105440c
YCH
2286 except ValueError:
2287 return None
2288
2289
2290def month_by_abbreviation(abbrev):
2291 """ Return the number of a month by (locale-independently) English
2292 abbreviations """
2293
2294 try:
2295 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
2296 except ValueError:
2297 return None
18258362
JMF
2298
2299
5aafe895 2300def fix_xml_ampersands(xml_str):
18258362 2301 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
2302 return re.sub(
2303 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 2304 '&amp;',
5aafe895 2305 xml_str)
e3946f98
PH
2306
2307
2308def setproctitle(title):
8bf48f23 2309 assert isinstance(title, compat_str)
c1c05c67
YCH
2310
2311 # ctypes in Jython is not complete
2312 # http://bugs.jython.org/issue2148
2313 if sys.platform.startswith('java'):
2314 return
2315
e3946f98 2316 try:
611c1dd9 2317 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
2318 except OSError:
2319 return
2f49bcd6
RC
2320 except TypeError:
2321 # LoadLibrary in Windows Python 2.7.13 only expects
2322 # a bytestring, but since unicode_literals turns
2323 # every string into a unicode string, it fails.
2324 return
0f06bcd7 2325 title_bytes = title.encode()
6eefe533
PH
2326 buf = ctypes.create_string_buffer(len(title_bytes))
2327 buf.value = title_bytes
e3946f98 2328 try:
6eefe533 2329 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
2330 except AttributeError:
2331 return # Strange libc, just skip this
d7dda168
PH
2332
2333
2334def remove_start(s, start):
46bc9b7d 2335 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
2336
2337
2b9faf55 2338def remove_end(s, end):
46bc9b7d 2339 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
2340
2341
31b2051e
S
2342def remove_quotes(s):
2343 if s is None or len(s) < 2:
2344 return s
2345 for quote in ('"', "'", ):
2346 if s[0] == quote and s[-1] == quote:
2347 return s[1:-1]
2348 return s
2349
2350
b6e0c7d2
U
2351def get_domain(url):
2352 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2353 return domain.group('domain') if domain else None
2354
2355
29eb5174 2356def url_basename(url):
9b8aaeed 2357 path = compat_urlparse.urlparse(url).path
28e614de 2358 return path.strip('/').split('/')[-1]
aa94a6d3
PH
2359
2360
02dc0a36
S
2361def base_url(url):
2362 return re.match(r'https?://[^?#&]+/', url).group()
2363
2364
e34c3361 2365def urljoin(base, path):
4b5de77b 2366 if isinstance(path, bytes):
0f06bcd7 2367 path = path.decode()
e34c3361
S
2368 if not isinstance(path, compat_str) or not path:
2369 return None
fad4ceb5 2370 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
e34c3361 2371 return path
4b5de77b 2372 if isinstance(base, bytes):
0f06bcd7 2373 base = base.decode()
4b5de77b
S
2374 if not isinstance(base, compat_str) or not re.match(
2375 r'^(?:https?:)?//', base):
e34c3361
S
2376 return None
2377 return compat_urlparse.urljoin(base, path)
2378
2379
aa94a6d3
PH
2380class HEADRequest(compat_urllib_request.Request):
2381 def get_method(self):
611c1dd9 2382 return 'HEAD'
7217e148
PH
2383
2384
95cf60e8
S
2385class PUTRequest(compat_urllib_request.Request):
2386 def get_method(self):
2387 return 'PUT'
2388
2389
9732d77e 2390def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
9e907ebd 2391 if get_attr and v is not None:
2392 v = getattr(v, get_attr, None)
1812afb7
S
2393 try:
2394 return int(v) * invscale // scale
31c49255 2395 except (ValueError, TypeError, OverflowError):
af98f8ff 2396 return default
9732d77e 2397
9572013d 2398
40a90862
JMF
2399def str_or_none(v, default=None):
2400 return default if v is None else compat_str(v)
2401
9732d77e
PH
2402
2403def str_to_int(int_str):
48d4681e 2404 """ A more relaxed version of int_or_none """
f9934b96 2405 if isinstance(int_str, int):
348c6bf1 2406 return int_str
42db58ec
S
2407 elif isinstance(int_str, compat_str):
2408 int_str = re.sub(r'[,\.\+]', '', int_str)
2409 return int_or_none(int_str)
608d11f5
PH
2410
2411
9732d77e 2412def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
2413 if v is None:
2414 return default
2415 try:
2416 return float(v) * invscale / scale
5e1271c5 2417 except (ValueError, TypeError):
caf80631 2418 return default
43f775e4
PH
2419
2420
c7e327c4
S
2421def bool_or_none(v, default=None):
2422 return v if isinstance(v, bool) else default
2423
2424
53cd37ba
S
2425def strip_or_none(v, default=None):
2426 return v.strip() if isinstance(v, compat_str) else default
b72b4431
S
2427
2428
af03000a
S
2429def url_or_none(url):
2430 if not url or not isinstance(url, compat_str):
2431 return None
2432 url = url.strip()
29f7c58a 2433 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
2434
2435
3e9b66d7
LNO
2436def request_to_url(req):
2437 if isinstance(req, compat_urllib_request.Request):
2438 return req.get_full_url()
2439 else:
2440 return req
2441
2442
e29663c6 2443def strftime_or_none(timestamp, date_format, default=None):
2444 datetime_object = None
2445 try:
f9934b96 2446 if isinstance(timestamp, (int, float)): # unix timestamp
e29663c6 2447 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2448 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2449 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2450 return datetime_object.strftime(date_format)
2451 except (ValueError, TypeError, AttributeError):
2452 return default
2453
2454
608d11f5 2455def parse_duration(s):
f9934b96 2456 if not isinstance(s, str):
608d11f5 2457 return None
ca7b3246 2458 s = s.strip()
38d79fd1 2459 if not s:
2460 return None
ca7b3246 2461
acaff495 2462 days, hours, mins, secs, ms = [None] * 5
8bd1c00b 2463 m = re.match(r'''(?x)
2464 (?P<before_secs>
2465 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2466 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2467 (?P<ms>[.:][0-9]+)?Z?$
2468 ''', s)
acaff495 2469 if m:
8bd1c00b 2470 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
acaff495 2471 else:
2472 m = re.match(
056653bb
S
2473 r'''(?ix)(?:P?
2474 (?:
1c1b2f96 2475 [0-9]+\s*y(?:ears?)?,?\s*
056653bb
S
2476 )?
2477 (?:
1c1b2f96 2478 [0-9]+\s*m(?:onths?)?,?\s*
056653bb
S
2479 )?
2480 (?:
1c1b2f96 2481 [0-9]+\s*w(?:eeks?)?,?\s*
056653bb 2482 )?
8f4b58d7 2483 (?:
1c1b2f96 2484 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
8f4b58d7 2485 )?
056653bb 2486 T)?
acaff495 2487 (?:
1c1b2f96 2488 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
acaff495 2489 )?
2490 (?:
1c1b2f96 2491 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
acaff495 2492 )?
2493 (?:
2494 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2495 )?Z?$''', s)
acaff495 2496 if m:
2497 days, hours, mins, secs, ms = m.groups()
2498 else:
15846398 2499 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2500 if m:
2501 hours, mins = m.groups()
2502 else:
2503 return None
2504
acaff495 2505 if ms:
19a03940 2506 ms = ms.replace(':', '.')
2507 return sum(float(part or 0) * mult for part, mult in (
2508 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
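# Illustrative usage of parse_duration (not part of the original source):
#   parse_duration('1:23:45')     -> 5025.0
#   parse_duration('PT1H30M')     -> 5400.0
#   parse_duration('3 min 10 s')  -> 190.0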
91d7d0b3
JMF
2509
2510
e65e4c88 2511def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 2512 name, real_ext = os.path.splitext(filename)
e65e4c88 2513 return (
86e5f3ed 2514 f'{name}.{ext}{real_ext}'
e65e4c88 2515 if not expected_real_ext or real_ext[1:] == expected_real_ext
86e5f3ed 2516 else f'{filename}.{ext}')
d70ad093
PH
2517
2518
b3ed15b7
S
2519def replace_extension(filename, ext, expected_real_ext=None):
2520 name, real_ext = os.path.splitext(filename)
86e5f3ed 2521 return '{}.{}'.format(
b3ed15b7
S
2522 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2523 ext)
2524
2525
d70ad093
PH
2526def check_executable(exe, args=[]):
2527 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2528 args can be a list of arguments for a short output (like -version) """
2529 try:
d3c93ec2 2530 Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
d70ad093
PH
2531 except OSError:
2532 return False
2533 return exe
b7ab0590
PH
2534
2535
8a7f68d0 2536def _get_exe_version_output(exe, args, *, to_screen=None):
2537 if to_screen:
2538 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
95807118 2539 try:
b64d04c1 2540 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
7a5c1cfe 2541 # SIGTTOU if yt-dlp is run in the background.
067aa17e 2542 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
d3c93ec2 2543 out, _ = Popen(
2544 [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2545 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
95807118
PH
2546 except OSError:
2547 return False
cae97f65
PH
2548 if isinstance(out, bytes): # Python 2.x
2549 out = out.decode('ascii', 'ignore')
9af98e17 2550 return out
cae97f65
PH
2551
2552
2553def detect_exe_version(output, version_re=None, unrecognized='present'):
2554 assert isinstance(output, compat_str)
2555 if version_re is None:
2556 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2557 m = re.search(version_re, output)
95807118
PH
2558 if m:
2559 return m.group(1)
2560 else:
2561 return unrecognized
2562
2563
9af98e17 2564def get_exe_version(exe, args=['--version'],
2565 version_re=None, unrecognized='present'):
2566 """ Returns the version of the specified executable,
2567 or False if the executable is not present """
2568 out = _get_exe_version_output(exe, args)
2569 return detect_exe_version(out, version_re, unrecognized) if out else False
2570
2571
cb89cfc1 2572class LazyList(collections.abc.Sequence):
0f06bcd7 2573 """Lazy immutable list from an iterable
2574 Note that slices of a LazyList are lists and not LazyList"""
483336e7 2575
8e5fecc8 2576 class IndexError(IndexError):
2577 pass
2578
282f5709 2579 def __init__(self, iterable, *, reverse=False, _cache=None):
0f06bcd7 2580 self._iterable = iter(iterable)
2581 self._cache = [] if _cache is None else _cache
2582 self._reversed = reverse
483336e7 2583
2584 def __iter__(self):
0f06bcd7 2585 if self._reversed:
28419ca2 2586 # We need to consume the entire iterable to iterate in reverse
981052c9 2587 yield from self.exhaust()
28419ca2 2588 return
0f06bcd7 2589 yield from self._cache
2590 for item in self._iterable:
2591 self._cache.append(item)
483336e7 2592 yield item
2593
0f06bcd7 2594 def _exhaust(self):
2595 self._cache.extend(self._iterable)
2596 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2597 return self._cache
28419ca2 2598
981052c9 2599 def exhaust(self):
0f06bcd7 2600 """Evaluate the entire iterable"""
2601 return self._exhaust()[::-1 if self._reversed else 1]
981052c9 2602
28419ca2 2603 @staticmethod
0f06bcd7 2604 def _reverse_index(x):
e0f2b4b4 2605 return None if x is None else -(x + 1)
483336e7 2606
2607 def __getitem__(self, idx):
2608 if isinstance(idx, slice):
0f06bcd7 2609 if self._reversed:
2610 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
e0f2b4b4 2611 start, stop, step = idx.start, idx.stop, idx.step or 1
483336e7 2612 elif isinstance(idx, int):
0f06bcd7 2613 if self._reversed:
2614 idx = self._reverse_index(idx)
e0f2b4b4 2615 start, stop, step = idx, idx, 0
483336e7 2616 else:
2617 raise TypeError('indices must be integers or slices')
e0f2b4b4 2618 if ((start or 0) < 0 or (stop or 0) < 0
2619 or (start is None and step < 0)
2620 or (stop is None and step > 0)):
483336e7 2621 # We need to consume the entire iterable to be able to slice from the end
2622 # Obviously, never use this with infinite iterables
0f06bcd7 2623 self._exhaust()
8e5fecc8 2624 try:
0f06bcd7 2625 return self._cache[idx]
8e5fecc8 2626 except IndexError as e:
2627 raise self.IndexError(e) from e
0f06bcd7 2628 n = max(start or 0, stop or 0) - len(self._cache) + 1
28419ca2 2629 if n > 0:
0f06bcd7 2630 self._cache.extend(itertools.islice(self._iterable, n))
8e5fecc8 2631 try:
0f06bcd7 2632 return self._cache[idx]
8e5fecc8 2633 except IndexError as e:
2634 raise self.IndexError(e) from e
483336e7 2635
2636 def __bool__(self):
2637 try:
0f06bcd7 2638 self[-1] if self._reversed else self[0]
8e5fecc8 2639 except self.IndexError:
483336e7 2640 return False
2641 return True
2642
2643 def __len__(self):
0f06bcd7 2644 self._exhaust()
2645 return len(self._cache)
483336e7 2646
282f5709 2647 def __reversed__(self):
0f06bcd7 2648 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
282f5709 2649
2650 def __copy__(self):
0f06bcd7 2651 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
282f5709 2652
28419ca2 2653 def __repr__(self):
2654 # repr and str should mimic a list. So we exhaust the iterable
2655 return repr(self.exhaust())
2656
2657 def __str__(self):
2658 return repr(self.exhaust())
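# Illustrative usage of LazyList (not part of the original source):
#   l = LazyList(itertools.count())          # infinite iterables are fine while indexing stays non-negative
#   l[4]                                     -> 4          # only the first 5 items are consumed and cached
#   l[2:4]                                   -> [2, 3]     # slices are plain lists
#   list(LazyList(range(3), reverse=True))   -> [2, 1, 0]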
2659
483336e7 2660
7be9ccff 2661class PagedList:
c07a39ae 2662
2663 class IndexError(IndexError):
2664 pass
2665
dd26ced1
PH
2666 def __len__(self):
2667 # This is only useful for tests
2668 return len(self.getslice())
2669
7be9ccff 2670 def __init__(self, pagefunc, pagesize, use_cache=True):
2671 self._pagefunc = pagefunc
2672 self._pagesize = pagesize
f1d13090 2673 self._pagecount = float('inf')
7be9ccff 2674 self._use_cache = use_cache
2675 self._cache = {}
2676
2677 def getpage(self, pagenum):
d8cf8d97 2678 page_results = self._cache.get(pagenum)
2679 if page_results is None:
f1d13090 2680 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
7be9ccff 2681 if self._use_cache:
2682 self._cache[pagenum] = page_results
2683 return page_results
2684
2685 def getslice(self, start=0, end=None):
2686 return list(self._getslice(start, end))
2687
2688 def _getslice(self, start, end):
55575225 2689 raise NotImplementedError('This method must be implemented by subclasses')
2690
2691 def __getitem__(self, idx):
f1d13090 2692 assert self._use_cache, 'Indexing PagedList requires cache'
55575225 2693 if not isinstance(idx, int) or idx < 0:
2694 raise TypeError('indices must be non-negative integers')
2695 entries = self.getslice(idx, idx + 1)
d8cf8d97 2696 if not entries:
c07a39ae 2697 raise self.IndexError()
d8cf8d97 2698 return entries[0]
55575225 2699
9c44d242
PH
2700
2701class OnDemandPagedList(PagedList):
a44ca5a4 2702 """Download pages until a page with less than maximum results"""
86e5f3ed 2703
7be9ccff 2704 def _getslice(self, start, end):
b7ab0590
PH
2705 for pagenum in itertools.count(start // self._pagesize):
2706 firstid = pagenum * self._pagesize
2707 nextfirstid = pagenum * self._pagesize + self._pagesize
2708 if start >= nextfirstid:
2709 continue
2710
b7ab0590
PH
2711 startv = (
2712 start % self._pagesize
2713 if firstid <= start < nextfirstid
2714 else 0)
b7ab0590
PH
2715 endv = (
2716 ((end - 1) % self._pagesize) + 1
2717 if (end is not None and firstid <= end <= nextfirstid)
2718 else None)
2719
f1d13090 2720 try:
2721 page_results = self.getpage(pagenum)
2722 except Exception:
2723 self._pagecount = pagenum - 1
2724 raise
b7ab0590
PH
2725 if startv != 0 or endv is not None:
2726 page_results = page_results[startv:endv]
7be9ccff 2727 yield from page_results
b7ab0590
PH
2728
2729 # A little optimization: if the current page is not "full", i.e. does
2730 # not contain page_size videos, then we can assume that this page
2731 # is the last one - there are no more IDs on further pages -
2732 # so there is no need to query again.
2733 if len(page_results) + startv < self._pagesize:
2734 break
2735
2736 # If we got the whole page, but the next page is not interesting,
2737 # break out early as well
2738 if end == nextfirstid:
2739 break
81c2f20b
PH
2740
2741
9c44d242 2742class InAdvancePagedList(PagedList):
a44ca5a4 2743 """PagedList with total number of pages known in advance"""
86e5f3ed 2744
9c44d242 2745 def __init__(self, pagefunc, pagecount, pagesize):
7be9ccff 2746 PagedList.__init__(self, pagefunc, pagesize, True)
f1d13090 2747 self._pagecount = pagecount
9c44d242 2748
7be9ccff 2749 def _getslice(self, start, end):
9c44d242 2750 start_page = start // self._pagesize
d37707bd 2751 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
9c44d242
PH
2752 skip_elems = start - start_page * self._pagesize
2753 only_more = None if end is None else end - start
2754 for pagenum in range(start_page, end_page):
7be9ccff 2755 page_results = self.getpage(pagenum)
9c44d242 2756 if skip_elems:
7be9ccff 2757 page_results = page_results[skip_elems:]
9c44d242
PH
2758 skip_elems = None
2759 if only_more is not None:
7be9ccff 2760 if len(page_results) < only_more:
2761 only_more -= len(page_results)
9c44d242 2762 else:
7be9ccff 2763 yield from page_results[:only_more]
9c44d242 2764 break
7be9ccff 2765 yield from page_results
9c44d242
PH
2766
2767
81c2f20b 2768def uppercase_escape(s):
676eb3f2 2769 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 2770 return re.sub(
a612753d 2771 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
2772 lambda m: unicode_escape(m.group(0))[0],
2773 s)
0fe2ff78
YCH
2774
2775
2776def lowercase_escape(s):
2777 unicode_escape = codecs.getdecoder('unicode_escape')
2778 return re.sub(
2779 r'\\u[0-9a-fA-F]{4}',
2780 lambda m: unicode_escape(m.group(0))[0],
2781 s)
b53466e1 2782
d05cfe06
S
2783
2784def escape_rfc3986(s):
2785 """Escape non-ASCII characters as suggested by RFC 3986"""
f9934b96 2786 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2787
2788
2789def escape_url(url):
2790 """Escape URL as suggested by RFC 3986"""
2791 url_parsed = compat_urllib_parse_urlparse(url)
2792 return url_parsed._replace(
efbed08d 2793 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
2794 path=escape_rfc3986(url_parsed.path),
2795 params=escape_rfc3986(url_parsed.params),
2796 query=escape_rfc3986(url_parsed.query),
2797 fragment=escape_rfc3986(url_parsed.fragment)
2798 ).geturl()
2799
62e609ab 2800
4dfbf869 2801def parse_qs(url):
2802 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2803
2804
62e609ab
PH
2805def read_batch_urls(batch_fd):
2806 def fixup(url):
2807 if not isinstance(url, compat_str):
2808 url = url.decode('utf-8', 'replace')
8c04f0be 2809 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2810 for bom in BOM_UTF8:
2811 if url.startswith(bom):
2812 url = url[len(bom):]
2813 url = url.lstrip()
2814 if not url or url.startswith(('#', ';', ']')):
62e609ab 2815 return False
8c04f0be 2816 # "#" cannot be stripped out since it is part of the URI
2817 # However, it can be safely stripped out if it follows a whitespace
2818 return re.split(r'\s#', url, 1)[0].rstrip()
62e609ab
PH
2819
2820 with contextlib.closing(batch_fd) as fd:
2821 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2822
2823
2824def urlencode_postdata(*args, **kargs):
15707c7e 2825 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
2826
2827
38f9ef31 2828def update_url_query(url, query):
cacd9966
YCH
2829 if not query:
2830 return url
38f9ef31 2831 parsed_url = compat_urlparse.urlparse(url)
2832 qs = compat_parse_qs(parsed_url.query)
2833 qs.update(query)
2834 return compat_urlparse.urlunparse(parsed_url._replace(
15707c7e 2835 query=compat_urllib_parse_urlencode(qs, True)))
16392824 2836
8e60dc75 2837
ed0291d1
S
2838def update_Request(req, url=None, data=None, headers={}, query={}):
2839 req_headers = req.headers.copy()
2840 req_headers.update(headers)
2841 req_data = data or req.data
2842 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
2843 req_get_method = req.get_method()
2844 if req_get_method == 'HEAD':
2845 req_type = HEADRequest
2846 elif req_get_method == 'PUT':
2847 req_type = PUTRequest
2848 else:
2849 req_type = compat_urllib_request.Request
ed0291d1
S
2850 new_req = req_type(
2851 req_url, data=req_data, headers=req_headers,
2852 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2853 if hasattr(req, 'timeout'):
2854 new_req.timeout = req.timeout
2855 return new_req
2856
2857
10c87c15 2858def _multipart_encode_impl(data, boundary):
0c265486
YCH
2859 content_type = 'multipart/form-data; boundary=%s' % boundary
2860
2861 out = b''
2862 for k, v in data.items():
2863 out += b'--' + boundary.encode('ascii') + b'\r\n'
2864 if isinstance(k, compat_str):
0f06bcd7 2865 k = k.encode()
0c265486 2866 if isinstance(v, compat_str):
0f06bcd7 2867 v = v.encode()
0c265486
YCH
2868 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2869 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 2870 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
2871 if boundary.encode('ascii') in content:
2872 raise ValueError('Boundary overlaps with data')
2873 out += content
2874
2875 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2876
2877 return out, content_type
2878
2879
2880def multipart_encode(data, boundary=None):
2881 '''
2882 Encode a dict to RFC 7578-compliant form-data
2883
2884 data:
2885 A dict where keys and values can be either Unicode or bytes-like
2886 objects.
2887 boundary:
2888 If specified, it must be a Unicode object that is used as the boundary.
2889 Otherwise a random boundary is generated.
2890
2891 Reference: https://tools.ietf.org/html/rfc7578
2892 '''
2893 has_specified_boundary = boundary is not None
2894
2895 while True:
2896 if boundary is None:
2897 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2898
2899 try:
10c87c15 2900 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
2901 break
2902 except ValueError:
2903 if has_specified_boundary:
2904 raise
2905 boundary = None
2906
2907 return out, content_type
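# Illustrative usage of multipart_encode (not part of the original source):
#   out, ct = multipart_encode({'field': 'value'}, boundary='X')
#   ct   -> 'multipart/form-data; boundary=X'
#   out  -> b'--X\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--X--\r\n'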
2908
2909
86296ad2 2910def dict_get(d, key_or_keys, default=None, skip_false_values=True):
a44ca5a4 2911 for val in map(d.get, variadic(key_or_keys)):
2912 if val is not None and (val or not skip_false_values):
2913 return val
2914 return default
cbecc9b9
S
2915
2916
c4f60dd7 2917def try_call(*funcs, expected_type=None, args=[], kwargs={}):
2918 for f in funcs:
a32a9a7e 2919 try:
c4f60dd7 2920 val = f(*args, **kwargs)
2921 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
a32a9a7e
S
2922 pass
2923 else:
c4f60dd7 2924 if expected_type is None or isinstance(val, expected_type):
2925 return val
2926
2927
2928def try_get(src, getter, expected_type=None):
2929 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
329ca3be
S
2930
2931
90137ca4 2932def filter_dict(dct, cndn=lambda _, v: v is not None):
2933 return {k: v for k, v in dct.items() if cndn(k, v)}
2934
2935
6cc62232
S
2936def merge_dicts(*dicts):
2937 merged = {}
2938 for a_dict in dicts:
2939 for k, v in a_dict.items():
90137ca4 2940 if (v is not None and k not in merged
2941 or isinstance(v, str) and merged[k] == ''):
6cc62232
S
2942 merged[k] = v
2943 return merged
2944
2945
8e60dc75
S
2946def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2947 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2948
16392824 2949
a1a530b0
PH
2950US_RATINGS = {
2951 'G': 0,
2952 'PG': 10,
2953 'PG-13': 13,
2954 'R': 16,
2955 'NC': 18,
2956}
fac55558
PH
2957
2958
a8795327 2959TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
2960 'TV-Y': 0,
2961 'TV-Y7': 7,
2962 'TV-G': 0,
2963 'TV-PG': 0,
2964 'TV-14': 14,
2965 'TV-MA': 17,
a8795327
S
2966}
2967
2968
146c80e2 2969def parse_age_limit(s):
19a03940 2970 # isinstance(False, int) is True. So type() must be used instead
2971 if type(s) is int:
a8795327 2972 return s if 0 <= s <= 21 else None
19a03940 2973 elif not isinstance(s, str):
d838b1bd 2974 return None
146c80e2 2975 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
2976 if m:
2977 return int(m.group('age'))
5c5fae6d 2978 s = s.upper()
a8795327
S
2979 if s in US_RATINGS:
2980 return US_RATINGS[s]
5a16c9d9 2981 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 2982 if m:
5a16c9d9 2983 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 2984 return None
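# Illustrative usage of parse_age_limit (not part of the original source):
#   parse_age_limit(18)       -> 18
#   parse_age_limit('18+')    -> 18
#   parse_age_limit('PG-13')  -> 13
#   parse_age_limit('TV-MA')  -> 17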
146c80e2
S
2985
2986
fac55558 2987def strip_jsonp(code):
609a61e3 2988 return re.sub(
5552c9eb 2989 r'''(?sx)^
e9c671d5 2990 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
2991 (?:\s*&&\s*(?P=func_name))?
2992 \s*\(\s*(?P<callback_data>.*)\);?
2993 \s*?(?://[^\n]*)*$''',
2994 r'\g<callback_data>', code)
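# Illustrative usage of strip_jsonp (not part of the original source):
#   strip_jsonp('callback({"id": 1});')              -> '{"id": 1}'
#   strip_jsonp('window.cb && cb({"a": 2}); // x')   -> '{"a": 2}'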
478c2c61
PH
2995
2996
5c610515 2997def js_to_json(code, vars={}):
2998 # vars is a dict of var, val pairs to substitute
c843e685 2999 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
86e5f3ed 3000 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
4195096e 3001 INTEGER_TABLE = (
86e5f3ed 3002 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3003 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
4195096e
S
3004 )
3005
e05f6939 3006 def fix_kv(m):
e7b6d122
PH
3007 v = m.group(0)
3008 if v in ('true', 'false', 'null'):
3009 return v
421ddcb8
C
3010 elif v in ('undefined', 'void 0'):
3011 return 'null'
8bdd16b4 3012 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
bd1e4844 3013 return ""
3014
3015 if v[0] in ("'", '"'):
3016 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 3017 '"': '\\"',
bd1e4844 3018 "\\'": "'",
3019 '\\\n': '',
3020 '\\x': '\\u00',
3021 }.get(m.group(0), m.group(0)), v[1:-1])
8bdd16b4 3022 else:
3023 for regex, base in INTEGER_TABLE:
3024 im = re.match(regex, v)
3025 if im:
3026 i = int(im.group(1), base)
3027 return '"%d":' % i if v.endswith(':') else '%d' % i
89ac4a19 3028
5c610515 3029 if v in vars:
3030 return vars[v]
3031
e7b6d122 3032 return '"%s"' % v
e05f6939 3033
febff4c1
B
3034 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3035
bd1e4844 3036 return re.sub(r'''(?sx)
3037 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3038 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4195096e 3039 {comment}|,(?={skip}[\]}}])|
421ddcb8 3040 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
4195096e 3041 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
8bdd16b4 3042 [0-9]+(?={skip}:)|
3043 !+
4195096e 3044 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
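# Illustrative usage of js_to_json (not part of the original source):
#   json.loads(js_to_json("{abc: 'def', num: 0x1a, list: [1, 2,],}"))
#       -> {'abc': 'def', 'num': 26, 'list': [1, 2]}
#   js_to_json('x', vars={'x': '"42"'})  -> '"42"'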
e05f6939
PH
3045
3046
478c2c61
PH
3047def qualities(quality_ids):
3048 """ Get a numeric quality value out of a list of possible values """
3049 def q(qid):
3050 try:
3051 return quality_ids.index(qid)
3052 except ValueError:
3053 return -1
3054 return q
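# Illustrative usage of qualities (not part of the original source):
#   q = qualities(['240p', '360p', '720p'])
#   q('720p')   -> 2
#   q('1080p')  -> -1   # unknown qualities sort lowest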
3055
acd69589 3056
62f6f1cb 3057POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist')
1e43a6f7 3058
3059
de6000d9 3060DEFAULT_OUTTMPL = {
3061 'default': '%(title)s [%(id)s].%(ext)s',
72755351 3062 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
de6000d9 3063}
3064OUTTMPL_TYPES = {
72755351 3065 'chapter': None,
de6000d9 3066 'subtitle': None,
3067 'thumbnail': None,
3068 'description': 'description',
3069 'annotation': 'annotations.xml',
3070 'infojson': 'info.json',
08438d2c 3071 'link': None,
3b603dbd 3072 'pl_video': None,
5112f26a 3073 'pl_thumbnail': None,
de6000d9 3074 'pl_description': 'description',
3075 'pl_infojson': 'info.json',
3076}
0a871f68 3077
143db31d 3078# As of [1] format syntax is:
3079# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3080# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
901130bb 3081STR_FORMAT_RE_TMPL = r'''(?x)
3082 (?<!%)(?P<prefix>(?:%%)*)
143db31d 3083 %
524e2e4f 3084 (?P<has_key>\((?P<key>{0})\))?
752cda38 3085 (?P<format>
524e2e4f 3086 (?P<conversion>[#0\-+ ]+)?
3087 (?P<min_width>\d+)?
3088 (?P<precision>\.\d+)?
3089 (?P<len_mod>[hlL])? # unused in python
901130bb 3090 {1} # conversion type
752cda38 3091 )
143db31d 3092'''
3093
7d1eb38a 3094
901130bb 3095STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3096
7d1eb38a 3097
a020a0dc
PH
3098def limit_length(s, length):
3099 """ Add ellipses to overly long strings """
3100 if s is None:
3101 return None
3102 ELLIPSES = '...'
3103 if len(s) > length:
3104 return s[:length - len(ELLIPSES)] + ELLIPSES
3105 return s
48844745
PH
3106
3107
3108def version_tuple(v):
5f9b8394 3109 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
3110
3111
3112def is_outdated_version(version, limit, assume_new=True):
3113 if not version:
3114 return not assume_new
3115 try:
3116 return version_tuple(version) < version_tuple(limit)
3117 except ValueError:
3118 return not assume_new
732ea2f0
PH
3119
3120
3121def ytdl_is_updateable():
7a5c1cfe 3122 """ Returns if yt-dlp can be updated with -U """
735d865e 3123
5d535b4a 3124 from .update import is_non_updateable
732ea2f0 3125
5d535b4a 3126 return not is_non_updateable()
7d4111ed
PH
3127
3128
3129def args_to_str(args):
3130 # Get a short string representation for a subprocess command
702ccf2d 3131 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
3132
3133
9b9c5355 3134def error_to_compat_str(err):
cfb0511d 3135 return str(err)
fdae2358
S
3136
3137
a44ca5a4 3138def error_to_str(err):
3139 return f'{type(err).__name__}: {err}'
3140
3141
c460bdd5 3142def mimetype2ext(mt):
eb9ee194
S
3143 if mt is None:
3144 return None
3145
9359f3d4
F
3146 mt, _, params = mt.partition(';')
3147 mt = mt.strip()
3148
3149 FULL_MAP = {
765ac263 3150 'audio/mp4': 'm4a',
6c33d24b
YCH
3151 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3152 # it's the most popular one
3153 'audio/mpeg': 'mp3',
ba39289d 3154 'audio/x-wav': 'wav',
9359f3d4
F
3155 'audio/wav': 'wav',
3156 'audio/wave': 'wav',
3157 }
3158
3159 ext = FULL_MAP.get(mt)
765ac263
JMF
3160 if ext is not None:
3161 return ext
3162
9359f3d4 3163 SUBTYPE_MAP = {
f6861ec9 3164 '3gpp': '3gp',
cafcf657 3165 'smptett+xml': 'tt',
cafcf657 3166 'ttaf+xml': 'dfxp',
a0d8d704 3167 'ttml+xml': 'ttml',
f6861ec9 3168 'x-flv': 'flv',
a0d8d704 3169 'x-mp4-fragmented': 'mp4',
d4f05d47 3170 'x-ms-sami': 'sami',
a0d8d704 3171 'x-ms-wmv': 'wmv',
b4173f15
RA
3172 'mpegurl': 'm3u8',
3173 'x-mpegurl': 'm3u8',
3174 'vnd.apple.mpegurl': 'm3u8',
3175 'dash+xml': 'mpd',
b4173f15 3176 'f4m+xml': 'f4m',
f164b971 3177 'hds+xml': 'f4m',
e910fe2f 3178 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 3179 'quicktime': 'mov',
98ce1a3f 3180 'mp2t': 'ts',
39e7107d 3181 'x-wav': 'wav',
9359f3d4
F
3182 'filmstrip+json': 'fs',
3183 'svg+xml': 'svg',
3184 }
3185
3186 _, _, subtype = mt.rpartition('/')
3187 ext = SUBTYPE_MAP.get(subtype.lower())
3188 if ext is not None:
3189 return ext
3190
3191 SUFFIX_MAP = {
3192 'json': 'json',
3193 'xml': 'xml',
3194 'zip': 'zip',
3195 'gzip': 'gz',
3196 }
3197
3198 _, _, suffix = subtype.partition('+')
3199 ext = SUFFIX_MAP.get(suffix)
3200 if ext is not None:
3201 return ext
3202
3203 return subtype.replace('+', '.')
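# Illustrative usage of mimetype2ext (not part of the original source):
#   mimetype2ext('audio/mp4')                        -> 'm4a'
#   mimetype2ext('application/dash+xml')             -> 'mpd'
#   mimetype2ext('video/mp4; codecs="avc1.4d401e"')  -> 'mp4'
#   mimetype2ext('application/x-unknown+json')       -> 'json'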
c460bdd5
PH
3204
3205
2814f12b
THD
3206def ext2mimetype(ext_or_url):
3207 if not ext_or_url:
3208 return None
3209 if '.' not in ext_or_url:
3210 ext_or_url = f'file.{ext_or_url}'
3211 return mimetypes.guess_type(ext_or_url)[0]
3212
3213
4f3c5e06 3214def parse_codecs(codecs_str):
3215 # http://tools.ietf.org/html/rfc6381
3216 if not codecs_str:
3217 return {}
a0566bbf 3218 split_codecs = list(filter(None, map(
dbf5416a 3219 str.strip, codecs_str.strip().strip(',').split(','))))
3fe75fdc 3220 vcodec, acodec, scodec, hdr = None, None, None, None
a0566bbf 3221 for full_codec in split_codecs:
9bd979ca 3222 parts = full_codec.split('.')
3223 codec = parts[0].replace('0', '')
3224 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3225 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
4f3c5e06 3226 if not vcodec:
b69fd25c 3227 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
176f1866 3228 if codec in ('dvh1', 'dvhe'):
3229 hdr = 'DV'
9bd979ca 3230 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3231 hdr = 'HDR10'
3232 elif full_codec.replace('0', '').startswith('vp9.2'):
176f1866 3233 hdr = 'HDR10'
b69fd25c 3234 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
4f3c5e06 3235 if not acodec:
3236 acodec = full_codec
4afa3ec4 3237 elif codec in ('stpp', 'wvtt',):
3fe75fdc 3238 if not scodec:
3239 scodec = full_codec
4f3c5e06 3240 else:
19a03940 3241 write_string(f'WARNING: Unknown codec {full_codec}\n')
3fe75fdc 3242 if vcodec or acodec or scodec:
4f3c5e06 3243 return {
3244 'vcodec': vcodec or 'none',
3245 'acodec': acodec or 'none',
176f1866 3246 'dynamic_range': hdr,
3fe75fdc 3247 **({'scodec': scodec} if scodec is not None else {}),
4f3c5e06 3248 }
b69fd25c 3249 elif len(split_codecs) == 2:
3250 return {
3251 'vcodec': split_codecs[0],
3252 'acodec': split_codecs[1],
3253 }
4f3c5e06 3254 return {}
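# Example (illustrative codecs string):
#   >>> parse_codecs('avc1.64001f, mp4a.40.2')
#   {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}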
3255
3256
2ccd1b10 3257def urlhandle_detect_ext(url_handle):
79298173 3258 getheader = url_handle.headers.get
2ccd1b10 3259
b55ee18f
PH
3260 cd = getheader('Content-Disposition')
3261 if cd:
3262 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3263 if m:
3264 e = determine_ext(m.group('filename'), default_ext=None)
3265 if e:
3266 return e
3267
c460bdd5 3268 return mimetype2ext(getheader('Content-Type'))
05900629
PH
3269
3270
1e399778
YCH
3271def encode_data_uri(data, mime_type):
3272 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3273
3274
05900629 3275def age_restricted(content_limit, age_limit):
6ec6cb4e 3276 """ Returns True iff the content should be blocked """
05900629
PH
3277
3278 if age_limit is None: # No limit set
3279 return False
3280 if content_limit is None:
3281 return False # Content available for everyone
3282 return age_limit < content_limit
61ca9a80
PH
3283
3284
3285def is_html(first_bytes):
3286 """ Detect whether a file contains HTML by examining its first bytes. """
3287
3288 BOMS = [
3289 (b'\xef\xbb\xbf', 'utf-8'),
3290 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3291 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3292 (b'\xff\xfe', 'utf-16-le'),
3293 (b'\xfe\xff', 'utf-16-be'),
3294 ]
80e8493e 3295
3296 encoding = 'utf-8'
61ca9a80 3297 for bom, enc in BOMS:
80e8493e 3298 while first_bytes.startswith(bom):
3299 encoding, first_bytes = enc, first_bytes[len(bom):]
61ca9a80 3300
80e8493e 3301 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
a055469f
PH
3302
3303
3304def determine_protocol(info_dict):
3305 protocol = info_dict.get('protocol')
3306 if protocol is not None:
3307 return protocol
3308
7de837a5 3309 url = sanitize_url(info_dict['url'])
a055469f
PH
3310 if url.startswith('rtmp'):
3311 return 'rtmp'
3312 elif url.startswith('mms'):
3313 return 'mms'
3314 elif url.startswith('rtsp'):
3315 return 'rtsp'
3316
3317 ext = determine_ext(url)
3318 if ext == 'm3u8':
3319 return 'm3u8'
3320 elif ext == 'f4m':
3321 return 'f4m'
3322
3323 return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
3324
3325
c5e3f849 3326def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3327 """ Render a list of rows, each as a list of values.
3328 Text after a \t will be right aligned """
ec11a9f4 3329 def width(string):
c5e3f849 3330 return len(remove_terminal_sequences(string).replace('\t', ''))
76d321f6 3331
3332 def get_max_lens(table):
ec11a9f4 3333 return [max(width(str(v)) for v in col) for col in zip(*table)]
76d321f6 3334
3335 def filter_using_list(row, filterArray):
d16df59d 3336 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
76d321f6 3337
d16df59d 3338 max_lens = get_max_lens(data) if hide_empty else []
3339 header_row = filter_using_list(header_row, max_lens)
3340 data = [filter_using_list(row, max_lens) for row in data]
76d321f6 3341
cfb56d1a 3342 table = [header_row] + data
76d321f6 3343 max_lens = get_max_lens(table)
c5e3f849 3344 extra_gap += 1
76d321f6 3345 if delim:
c5e3f849 3346 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
1ed7953a 3347 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
ec11a9f4 3348 for row in table:
3349 for pos, text in enumerate(map(str, row)):
c5e3f849 3350 if '\t' in text:
3351 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3352 else:
3353 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3354 ret = '\n'.join(''.join(row).rstrip() for row in table)
ec11a9f4 3355 return ret
347de493
PH
3356
3357
8f18aca8 3358def _match_one(filter_part, dct, incomplete):
77b87f05 3359 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3360 STRING_OPERATORS = {
3361 '*=': operator.contains,
3362 '^=': lambda attr, value: attr.startswith(value),
3363 '$=': lambda attr, value: attr.endswith(value),
3364 '~=': lambda attr, value: re.search(value, attr),
3365 }
347de493 3366 COMPARISON_OPERATORS = {
a047eeb6 3367 **STRING_OPERATORS,
3368 '<=': operator.le, # "<=" must be defined above "<"
347de493 3369 '<': operator.lt,
347de493 3370 '>=': operator.ge,
a047eeb6 3371 '>': operator.gt,
347de493 3372 '=': operator.eq,
347de493 3373 }
a047eeb6 3374
6db9c4d5 3375 if isinstance(incomplete, bool):
3376 is_incomplete = lambda _: incomplete
3377 else:
3378 is_incomplete = lambda k: k in incomplete
3379
347de493
PH
3380 operator_rex = re.compile(r'''(?x)\s*
3381 (?P<key>[a-z_]+)
77b87f05 3382 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3383 (?:
a047eeb6 3384 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3385 (?P<strval>.+?)
347de493
PH
3386 )
3387 \s*$
3388 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3389 m = operator_rex.search(filter_part)
3390 if m:
18f96d12 3391 m = m.groupdict()
3392 unnegated_op = COMPARISON_OPERATORS[m['op']]
3393 if m['negation']:
77b87f05
MT
3394 op = lambda attr, value: not unnegated_op(attr, value)
3395 else:
3396 op = unnegated_op
18f96d12 3397 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3398 if m['quote']:
3399 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3400 actual_value = dct.get(m['key'])
3401 numeric_comparison = None
f9934b96 3402 if isinstance(actual_value, (int, float)):
e5a088dc
S
3403 # If the original field is a string and the matching comparison value is
3404 # a number, we should respect the origin of the original field
3405 # and process the comparison value as a string (see
18f96d12 3406 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3407 try:
18f96d12 3408 numeric_comparison = int(comparison_value)
347de493 3409 except ValueError:
18f96d12 3410 numeric_comparison = parse_filesize(comparison_value)
3411 if numeric_comparison is None:
3412 numeric_comparison = parse_filesize(f'{comparison_value}B')
3413 if numeric_comparison is None:
3414 numeric_comparison = parse_duration(comparison_value)
3415 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3416 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3417 if actual_value is None:
6db9c4d5 3418 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3419 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3420
3421 UNARY_OPERATORS = {
1cc47c66
S
3422 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3423 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493
PH
3424 }
3425 operator_rex = re.compile(r'''(?x)\s*
3426 (?P<op>%s)\s*(?P<key>[a-z_]+)
3427 \s*$
3428 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3429 m = operator_rex.search(filter_part)
3430 if m:
3431 op = UNARY_OPERATORS[m.group('op')]
3432 actual_value = dct.get(m.group('key'))
6db9c4d5 3433 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3434 return True
347de493
PH
3435 return op(actual_value)
3436
3437 raise ValueError('Invalid filter part %r' % filter_part)
3438
3439
8f18aca8 3440def match_str(filter_str, dct, incomplete=False):
6db9c4d5 3441 """ Filter a dictionary with a simple string syntax.
3442 @returns Whether the filter passes
3443 @param incomplete Set of keys that are expected to be missing from dct.
3444 Can be True/False to indicate all/none of the keys may be missing.
3445 All conditions on incomplete keys pass if the key is missing
8f18aca8 3446 """
347de493 3447 return all(
8f18aca8 3448 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
a047eeb6 3449 for filter_part in re.split(r'(?<!\\)&', filter_str))
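# Examples (illustrative filters and metadata):
#   >>> match_str('like_count > 100 & duration < 600', {'like_count': 190, 'duration': 30})
#   True
#   >>> match_str('title *= dlp', {'title': 'random video'})
#   False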
347de493
PH
3450
3451
b1a7cd05 3452def match_filter_func(filters):
3453 if not filters:
d1b5f70b 3454 return None
492272fe 3455 filters = set(variadic(filters))
d1b5f70b 3456
492272fe 3457 interactive = '-' in filters
3458 if interactive:
3459 filters.remove('-')
3460
3461 def _match_func(info_dict, incomplete=False):
3462 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3463 return NO_DEFAULT if interactive and not incomplete else None
347de493 3464 else:
b1a7cd05 3465 video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3466 filter_str = ') | ('.join(map(str.strip, filters))
3467 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
347de493 3468 return _match_func
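# Example (illustrative filter): the returned function yields None when the video
# passes and a skip message otherwise
#   >>> f = match_filter_func('duration < 60')
#   >>> f({'id': 'short', 'duration': 30}) is None
#   True
#   >>> f({'id': 'long', 'duration': 300})
#   'long does not pass filter (duration < 60), skipping ..'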
91410c9b
PH
3469
3470
bf6427d2
YCH
3471def parse_dfxp_time_expr(time_expr):
3472 if not time_expr:
d631d5f9 3473 return
bf6427d2 3474
1d485a1a 3475 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
bf6427d2
YCH
3476 if mobj:
3477 return float(mobj.group('time_offset'))
3478
db2fe38b 3479 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 3480 if mobj:
db2fe38b 3481 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
3482
3483
c1c924ab 3484def srt_subtitles_timecode(seconds):
aa7785f8 3485 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3486
3487
3488def ass_subtitles_timecode(seconds):
3489 time = timetuple_from_msec(seconds * 1000)
3490 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
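# Examples (illustrative timestamps):
#   >>> parse_dfxp_time_expr('00:01:02.5')
#   62.5
#   >>> srt_subtitles_timecode(62.5)
#   '00:01:02,500'
#   >>> ass_subtitles_timecode(62.5)
#   '0:01:02.50'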
bf6427d2
YCH
3491
3492
3493def dfxp2srt(dfxp_data):
3869028f
YCH
3494 '''
3495 @param dfxp_data A bytes-like object containing DFXP data
3496 @returns A unicode object containing converted SRT data
3497 '''
5b995f71 3498 LEGACY_NAMESPACES = (
3869028f
YCH
3499 (b'http://www.w3.org/ns/ttml', [
3500 b'http://www.w3.org/2004/11/ttaf1',
3501 b'http://www.w3.org/2006/04/ttaf1',
3502 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 3503 ]),
3869028f
YCH
3504 (b'http://www.w3.org/ns/ttml#styling', [
3505 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
3506 ]),
3507 )
3508
3509 SUPPORTED_STYLING = [
3510 'color',
3511 'fontFamily',
3512 'fontSize',
3513 'fontStyle',
3514 'fontWeight',
3515 'textDecoration'
3516 ]
3517
4e335771 3518 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 3519 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 3520 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 3521 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 3522 })
bf6427d2 3523
5b995f71
RA
3524 styles = {}
3525 default_style = {}
3526
86e5f3ed 3527 class TTMLPElementParser:
5b995f71
RA
3528 _out = ''
3529 _unclosed_elements = []
3530 _applied_styles = []
bf6427d2 3531
2b14cb56 3532 def start(self, tag, attrib):
5b995f71
RA
3533 if tag in (_x('ttml:br'), 'br'):
3534 self._out += '\n'
3535 else:
3536 unclosed_elements = []
3537 style = {}
3538 element_style_id = attrib.get('style')
3539 if default_style:
3540 style.update(default_style)
3541 if element_style_id:
3542 style.update(styles.get(element_style_id, {}))
3543 for prop in SUPPORTED_STYLING:
3544 prop_val = attrib.get(_x('tts:' + prop))
3545 if prop_val:
3546 style[prop] = prop_val
3547 if style:
3548 font = ''
3549 for k, v in sorted(style.items()):
3550 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3551 continue
3552 if k == 'color':
3553 font += ' color="%s"' % v
3554 elif k == 'fontSize':
3555 font += ' size="%s"' % v
3556 elif k == 'fontFamily':
3557 font += ' face="%s"' % v
3558 elif k == 'fontWeight' and v == 'bold':
3559 self._out += '<b>'
3560 unclosed_elements.append('b')
3561 elif k == 'fontStyle' and v == 'italic':
3562 self._out += '<i>'
3563 unclosed_elements.append('i')
3564 elif k == 'textDecoration' and v == 'underline':
3565 self._out += '<u>'
3566 unclosed_elements.append('u')
3567 if font:
3568 self._out += '<font' + font + '>'
3569 unclosed_elements.append('font')
3570 applied_style = {}
3571 if self._applied_styles:
3572 applied_style.update(self._applied_styles[-1])
3573 applied_style.update(style)
3574 self._applied_styles.append(applied_style)
3575 self._unclosed_elements.append(unclosed_elements)
bf6427d2 3576
2b14cb56 3577 def end(self, tag):
5b995f71
RA
3578 if tag not in (_x('ttml:br'), 'br'):
3579 unclosed_elements = self._unclosed_elements.pop()
3580 for element in reversed(unclosed_elements):
3581 self._out += '</%s>' % element
3582 if unclosed_elements and self._applied_styles:
3583 self._applied_styles.pop()
bf6427d2 3584
2b14cb56 3585 def data(self, data):
5b995f71 3586 self._out += data
2b14cb56 3587
3588 def close(self):
5b995f71 3589 return self._out.strip()
2b14cb56 3590
3591 def parse_node(node):
3592 target = TTMLPElementParser()
3593 parser = xml.etree.ElementTree.XMLParser(target=target)
3594 parser.feed(xml.etree.ElementTree.tostring(node))
3595 return parser.close()
bf6427d2 3596
5b995f71
RA
3597 for k, v in LEGACY_NAMESPACES:
3598 for ns in v:
3599 dfxp_data = dfxp_data.replace(ns, k)
3600
3869028f 3601 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 3602 out = []
5b995f71 3603 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
3604
3605 if not paras:
3606 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 3607
5b995f71
RA
3608 repeat = False
3609 while True:
3610 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
3611 style_id = style.get('id') or style.get(_x('xml:id'))
3612 if not style_id:
3613 continue
5b995f71
RA
3614 parent_style_id = style.get('style')
3615 if parent_style_id:
3616 if parent_style_id not in styles:
3617 repeat = True
3618 continue
3619 styles[style_id] = styles[parent_style_id].copy()
3620 for prop in SUPPORTED_STYLING:
3621 prop_val = style.get(_x('tts:' + prop))
3622 if prop_val:
3623 styles.setdefault(style_id, {})[prop] = prop_val
3624 if repeat:
3625 repeat = False
3626 else:
3627 break
3628
3629 for p in ('body', 'div'):
3630 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3631 if ele is None:
3632 continue
3633 style = styles.get(ele.get('style'))
3634 if not style:
3635 continue
3636 default_style.update(style)
3637
bf6427d2 3638 for para, index in zip(paras, itertools.count(1)):
d631d5f9 3639 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 3640 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
3641 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3642 if begin_time is None:
3643 continue
7dff0363 3644 if not end_time:
d631d5f9
YCH
3645 if not dur:
3646 continue
3647 end_time = begin_time + dur
bf6427d2
YCH
3648 out.append('%d\n%s --> %s\n%s\n\n' % (
3649 index,
c1c924ab
YCH
3650 srt_subtitles_timecode(begin_time),
3651 srt_subtitles_timecode(end_time),
bf6427d2
YCH
3652 parse_node(para)))
3653
3654 return ''.join(out)
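# Example (minimal illustrative TTML document):
#   >>> ttml = b'<tt xmlns="http://www.w3.org/ns/ttml"><body><div><p begin="0s" end="1.5s">Hello</p></div></body></tt>'
#   >>> print(dfxp2srt(ttml))
#   1
#   00:00:00,000 --> 00:00:01,500
#   Hello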
3655
3656
66e289ba
S
3657def cli_option(params, command_option, param):
3658 param = params.get(param)
98e698f1
RA
3659 if param:
3660 param = compat_str(param)
66e289ba
S
3661 return [command_option, param] if param is not None else []
3662
3663
3664def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3665 param = params.get(param)
5b232f46
S
3666 if param is None:
3667 return []
66e289ba
S
3668 assert isinstance(param, bool)
3669 if separator:
3670 return [command_option + separator + (true_value if param else false_value)]
3671 return [command_option, true_value if param else false_value]
3672
3673
3674def cli_valueless_option(params, command_option, param, expected_value=True):
3675 param = params.get(param)
3676 return [command_option] if param == expected_value else []
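# Examples (illustrative params dicts and option names):
#   >>> cli_option({'proxy': 'http://127.0.0.1:3128'}, '--proxy', 'proxy')
#   ['--proxy', 'http://127.0.0.1:3128']
#   >>> cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#   ['--no-check-certificate', 'true']
#   >>> cli_valueless_option({'nopart': True}, '--no-part', 'nopart')
#   ['--no-part']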
3677
3678
e92caff5 3679def cli_configuration_args(argdict, keys, default=[], use_compat=True):
eab9b2bc 3680 if isinstance(argdict, (list, tuple)): # for backward compatibility
e92caff5 3681 if use_compat:
5b1ecbb3 3682 return argdict
3683 else:
3684 argdict = None
eab9b2bc 3685 if argdict is None:
5b1ecbb3 3686 return default
eab9b2bc 3687 assert isinstance(argdict, dict)
3688
e92caff5 3689 assert isinstance(keys, (list, tuple))
3690 for key_list in keys:
e92caff5 3691 arg_list = list(filter(
3692 lambda x: x is not None,
6606817a 3693 [argdict.get(key.lower()) for key in variadic(key_list)]))
e92caff5 3694 if arg_list:
3695 return [arg for args in arg_list for arg in args]
3696 return default
66e289ba 3697
6251555f 3698
330690a2 3699def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3700 main_key, exe = main_key.lower(), exe.lower()
3701 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3702 keys = [f'{root_key}{k}' for k in (keys or [''])]
3703 if root_key in keys:
3704 if main_key != exe:
3705 keys.append((main_key, exe))
3706 keys.append('default')
3707 else:
3708 use_compat = False
3709 return cli_configuration_args(argdict, keys, default, use_compat)
3710
66e289ba 3711
86e5f3ed 3712class ISO639Utils:
39672624
YCH
3713 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3714 _lang_map = {
3715 'aa': 'aar',
3716 'ab': 'abk',
3717 'ae': 'ave',
3718 'af': 'afr',
3719 'ak': 'aka',
3720 'am': 'amh',
3721 'an': 'arg',
3722 'ar': 'ara',
3723 'as': 'asm',
3724 'av': 'ava',
3725 'ay': 'aym',
3726 'az': 'aze',
3727 'ba': 'bak',
3728 'be': 'bel',
3729 'bg': 'bul',
3730 'bh': 'bih',
3731 'bi': 'bis',
3732 'bm': 'bam',
3733 'bn': 'ben',
3734 'bo': 'bod',
3735 'br': 'bre',
3736 'bs': 'bos',
3737 'ca': 'cat',
3738 'ce': 'che',
3739 'ch': 'cha',
3740 'co': 'cos',
3741 'cr': 'cre',
3742 'cs': 'ces',
3743 'cu': 'chu',
3744 'cv': 'chv',
3745 'cy': 'cym',
3746 'da': 'dan',
3747 'de': 'deu',
3748 'dv': 'div',
3749 'dz': 'dzo',
3750 'ee': 'ewe',
3751 'el': 'ell',
3752 'en': 'eng',
3753 'eo': 'epo',
3754 'es': 'spa',
3755 'et': 'est',
3756 'eu': 'eus',
3757 'fa': 'fas',
3758 'ff': 'ful',
3759 'fi': 'fin',
3760 'fj': 'fij',
3761 'fo': 'fao',
3762 'fr': 'fra',
3763 'fy': 'fry',
3764 'ga': 'gle',
3765 'gd': 'gla',
3766 'gl': 'glg',
3767 'gn': 'grn',
3768 'gu': 'guj',
3769 'gv': 'glv',
3770 'ha': 'hau',
3771 'he': 'heb',
b7acc835 3772 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
3773 'hi': 'hin',
3774 'ho': 'hmo',
3775 'hr': 'hrv',
3776 'ht': 'hat',
3777 'hu': 'hun',
3778 'hy': 'hye',
3779 'hz': 'her',
3780 'ia': 'ina',
3781 'id': 'ind',
b7acc835 3782 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
3783 'ie': 'ile',
3784 'ig': 'ibo',
3785 'ii': 'iii',
3786 'ik': 'ipk',
3787 'io': 'ido',
3788 'is': 'isl',
3789 'it': 'ita',
3790 'iu': 'iku',
3791 'ja': 'jpn',
3792 'jv': 'jav',
3793 'ka': 'kat',
3794 'kg': 'kon',
3795 'ki': 'kik',
3796 'kj': 'kua',
3797 'kk': 'kaz',
3798 'kl': 'kal',
3799 'km': 'khm',
3800 'kn': 'kan',
3801 'ko': 'kor',
3802 'kr': 'kau',
3803 'ks': 'kas',
3804 'ku': 'kur',
3805 'kv': 'kom',
3806 'kw': 'cor',
3807 'ky': 'kir',
3808 'la': 'lat',
3809 'lb': 'ltz',
3810 'lg': 'lug',
3811 'li': 'lim',
3812 'ln': 'lin',
3813 'lo': 'lao',
3814 'lt': 'lit',
3815 'lu': 'lub',
3816 'lv': 'lav',
3817 'mg': 'mlg',
3818 'mh': 'mah',
3819 'mi': 'mri',
3820 'mk': 'mkd',
3821 'ml': 'mal',
3822 'mn': 'mon',
3823 'mr': 'mar',
3824 'ms': 'msa',
3825 'mt': 'mlt',
3826 'my': 'mya',
3827 'na': 'nau',
3828 'nb': 'nob',
3829 'nd': 'nde',
3830 'ne': 'nep',
3831 'ng': 'ndo',
3832 'nl': 'nld',
3833 'nn': 'nno',
3834 'no': 'nor',
3835 'nr': 'nbl',
3836 'nv': 'nav',
3837 'ny': 'nya',
3838 'oc': 'oci',
3839 'oj': 'oji',
3840 'om': 'orm',
3841 'or': 'ori',
3842 'os': 'oss',
3843 'pa': 'pan',
3844 'pi': 'pli',
3845 'pl': 'pol',
3846 'ps': 'pus',
3847 'pt': 'por',
3848 'qu': 'que',
3849 'rm': 'roh',
3850 'rn': 'run',
3851 'ro': 'ron',
3852 'ru': 'rus',
3853 'rw': 'kin',
3854 'sa': 'san',
3855 'sc': 'srd',
3856 'sd': 'snd',
3857 'se': 'sme',
3858 'sg': 'sag',
3859 'si': 'sin',
3860 'sk': 'slk',
3861 'sl': 'slv',
3862 'sm': 'smo',
3863 'sn': 'sna',
3864 'so': 'som',
3865 'sq': 'sqi',
3866 'sr': 'srp',
3867 'ss': 'ssw',
3868 'st': 'sot',
3869 'su': 'sun',
3870 'sv': 'swe',
3871 'sw': 'swa',
3872 'ta': 'tam',
3873 'te': 'tel',
3874 'tg': 'tgk',
3875 'th': 'tha',
3876 'ti': 'tir',
3877 'tk': 'tuk',
3878 'tl': 'tgl',
3879 'tn': 'tsn',
3880 'to': 'ton',
3881 'tr': 'tur',
3882 'ts': 'tso',
3883 'tt': 'tat',
3884 'tw': 'twi',
3885 'ty': 'tah',
3886 'ug': 'uig',
3887 'uk': 'ukr',
3888 'ur': 'urd',
3889 'uz': 'uzb',
3890 've': 'ven',
3891 'vi': 'vie',
3892 'vo': 'vol',
3893 'wa': 'wln',
3894 'wo': 'wol',
3895 'xh': 'xho',
3896 'yi': 'yid',
e9a50fba 3897 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
3898 'yo': 'yor',
3899 'za': 'zha',
3900 'zh': 'zho',
3901 'zu': 'zul',
3902 }
3903
3904 @classmethod
3905 def short2long(cls, code):
3906 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3907 return cls._lang_map.get(code[:2])
3908
3909 @classmethod
3910 def long2short(cls, code):
3911 """Convert language code from ISO 639-2/T to ISO 639-1"""
3912 for short_name, long_name in cls._lang_map.items():
3913 if long_name == code:
3914 return short_name
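# Examples:
#   >>> ISO639Utils.short2long('en')
#   'eng'
#   >>> ISO639Utils.long2short('fra')
#   'fr'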
3915
3916
86e5f3ed 3917class ISO3166Utils:
4eb10f66
YCH
3918 # From http://data.okfn.org/data/core/country-list
3919 _country_map = {
3920 'AF': 'Afghanistan',
3921 'AX': 'Åland Islands',
3922 'AL': 'Albania',
3923 'DZ': 'Algeria',
3924 'AS': 'American Samoa',
3925 'AD': 'Andorra',
3926 'AO': 'Angola',
3927 'AI': 'Anguilla',
3928 'AQ': 'Antarctica',
3929 'AG': 'Antigua and Barbuda',
3930 'AR': 'Argentina',
3931 'AM': 'Armenia',
3932 'AW': 'Aruba',
3933 'AU': 'Australia',
3934 'AT': 'Austria',
3935 'AZ': 'Azerbaijan',
3936 'BS': 'Bahamas',
3937 'BH': 'Bahrain',
3938 'BD': 'Bangladesh',
3939 'BB': 'Barbados',
3940 'BY': 'Belarus',
3941 'BE': 'Belgium',
3942 'BZ': 'Belize',
3943 'BJ': 'Benin',
3944 'BM': 'Bermuda',
3945 'BT': 'Bhutan',
3946 'BO': 'Bolivia, Plurinational State of',
3947 'BQ': 'Bonaire, Sint Eustatius and Saba',
3948 'BA': 'Bosnia and Herzegovina',
3949 'BW': 'Botswana',
3950 'BV': 'Bouvet Island',
3951 'BR': 'Brazil',
3952 'IO': 'British Indian Ocean Territory',
3953 'BN': 'Brunei Darussalam',
3954 'BG': 'Bulgaria',
3955 'BF': 'Burkina Faso',
3956 'BI': 'Burundi',
3957 'KH': 'Cambodia',
3958 'CM': 'Cameroon',
3959 'CA': 'Canada',
3960 'CV': 'Cape Verde',
3961 'KY': 'Cayman Islands',
3962 'CF': 'Central African Republic',
3963 'TD': 'Chad',
3964 'CL': 'Chile',
3965 'CN': 'China',
3966 'CX': 'Christmas Island',
3967 'CC': 'Cocos (Keeling) Islands',
3968 'CO': 'Colombia',
3969 'KM': 'Comoros',
3970 'CG': 'Congo',
3971 'CD': 'Congo, the Democratic Republic of the',
3972 'CK': 'Cook Islands',
3973 'CR': 'Costa Rica',
3974 'CI': 'Côte d\'Ivoire',
3975 'HR': 'Croatia',
3976 'CU': 'Cuba',
3977 'CW': 'Curaçao',
3978 'CY': 'Cyprus',
3979 'CZ': 'Czech Republic',
3980 'DK': 'Denmark',
3981 'DJ': 'Djibouti',
3982 'DM': 'Dominica',
3983 'DO': 'Dominican Republic',
3984 'EC': 'Ecuador',
3985 'EG': 'Egypt',
3986 'SV': 'El Salvador',
3987 'GQ': 'Equatorial Guinea',
3988 'ER': 'Eritrea',
3989 'EE': 'Estonia',
3990 'ET': 'Ethiopia',
3991 'FK': 'Falkland Islands (Malvinas)',
3992 'FO': 'Faroe Islands',
3993 'FJ': 'Fiji',
3994 'FI': 'Finland',
3995 'FR': 'France',
3996 'GF': 'French Guiana',
3997 'PF': 'French Polynesia',
3998 'TF': 'French Southern Territories',
3999 'GA': 'Gabon',
4000 'GM': 'Gambia',
4001 'GE': 'Georgia',
4002 'DE': 'Germany',
4003 'GH': 'Ghana',
4004 'GI': 'Gibraltar',
4005 'GR': 'Greece',
4006 'GL': 'Greenland',
4007 'GD': 'Grenada',
4008 'GP': 'Guadeloupe',
4009 'GU': 'Guam',
4010 'GT': 'Guatemala',
4011 'GG': 'Guernsey',
4012 'GN': 'Guinea',
4013 'GW': 'Guinea-Bissau',
4014 'GY': 'Guyana',
4015 'HT': 'Haiti',
4016 'HM': 'Heard Island and McDonald Islands',
4017 'VA': 'Holy See (Vatican City State)',
4018 'HN': 'Honduras',
4019 'HK': 'Hong Kong',
4020 'HU': 'Hungary',
4021 'IS': 'Iceland',
4022 'IN': 'India',
4023 'ID': 'Indonesia',
4024 'IR': 'Iran, Islamic Republic of',
4025 'IQ': 'Iraq',
4026 'IE': 'Ireland',
4027 'IM': 'Isle of Man',
4028 'IL': 'Israel',
4029 'IT': 'Italy',
4030 'JM': 'Jamaica',
4031 'JP': 'Japan',
4032 'JE': 'Jersey',
4033 'JO': 'Jordan',
4034 'KZ': 'Kazakhstan',
4035 'KE': 'Kenya',
4036 'KI': 'Kiribati',
4037 'KP': 'Korea, Democratic People\'s Republic of',
4038 'KR': 'Korea, Republic of',
4039 'KW': 'Kuwait',
4040 'KG': 'Kyrgyzstan',
4041 'LA': 'Lao People\'s Democratic Republic',
4042 'LV': 'Latvia',
4043 'LB': 'Lebanon',
4044 'LS': 'Lesotho',
4045 'LR': 'Liberia',
4046 'LY': 'Libya',
4047 'LI': 'Liechtenstein',
4048 'LT': 'Lithuania',
4049 'LU': 'Luxembourg',
4050 'MO': 'Macao',
4051 'MK': 'Macedonia, the Former Yugoslav Republic of',
4052 'MG': 'Madagascar',
4053 'MW': 'Malawi',
4054 'MY': 'Malaysia',
4055 'MV': 'Maldives',
4056 'ML': 'Mali',
4057 'MT': 'Malta',
4058 'MH': 'Marshall Islands',
4059 'MQ': 'Martinique',
4060 'MR': 'Mauritania',
4061 'MU': 'Mauritius',
4062 'YT': 'Mayotte',
4063 'MX': 'Mexico',
4064 'FM': 'Micronesia, Federated States of',
4065 'MD': 'Moldova, Republic of',
4066 'MC': 'Monaco',
4067 'MN': 'Mongolia',
4068 'ME': 'Montenegro',
4069 'MS': 'Montserrat',
4070 'MA': 'Morocco',
4071 'MZ': 'Mozambique',
4072 'MM': 'Myanmar',
4073 'NA': 'Namibia',
4074 'NR': 'Nauru',
4075 'NP': 'Nepal',
4076 'NL': 'Netherlands',
4077 'NC': 'New Caledonia',
4078 'NZ': 'New Zealand',
4079 'NI': 'Nicaragua',
4080 'NE': 'Niger',
4081 'NG': 'Nigeria',
4082 'NU': 'Niue',
4083 'NF': 'Norfolk Island',
4084 'MP': 'Northern Mariana Islands',
4085 'NO': 'Norway',
4086 'OM': 'Oman',
4087 'PK': 'Pakistan',
4088 'PW': 'Palau',
4089 'PS': 'Palestine, State of',
4090 'PA': 'Panama',
4091 'PG': 'Papua New Guinea',
4092 'PY': 'Paraguay',
4093 'PE': 'Peru',
4094 'PH': 'Philippines',
4095 'PN': 'Pitcairn',
4096 'PL': 'Poland',
4097 'PT': 'Portugal',
4098 'PR': 'Puerto Rico',
4099 'QA': 'Qatar',
4100 'RE': 'Réunion',
4101 'RO': 'Romania',
4102 'RU': 'Russian Federation',
4103 'RW': 'Rwanda',
4104 'BL': 'Saint Barthélemy',
4105 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4106 'KN': 'Saint Kitts and Nevis',
4107 'LC': 'Saint Lucia',
4108 'MF': 'Saint Martin (French part)',
4109 'PM': 'Saint Pierre and Miquelon',
4110 'VC': 'Saint Vincent and the Grenadines',
4111 'WS': 'Samoa',
4112 'SM': 'San Marino',
4113 'ST': 'Sao Tome and Principe',
4114 'SA': 'Saudi Arabia',
4115 'SN': 'Senegal',
4116 'RS': 'Serbia',
4117 'SC': 'Seychelles',
4118 'SL': 'Sierra Leone',
4119 'SG': 'Singapore',
4120 'SX': 'Sint Maarten (Dutch part)',
4121 'SK': 'Slovakia',
4122 'SI': 'Slovenia',
4123 'SB': 'Solomon Islands',
4124 'SO': 'Somalia',
4125 'ZA': 'South Africa',
4126 'GS': 'South Georgia and the South Sandwich Islands',
4127 'SS': 'South Sudan',
4128 'ES': 'Spain',
4129 'LK': 'Sri Lanka',
4130 'SD': 'Sudan',
4131 'SR': 'Suriname',
4132 'SJ': 'Svalbard and Jan Mayen',
4133 'SZ': 'Swaziland',
4134 'SE': 'Sweden',
4135 'CH': 'Switzerland',
4136 'SY': 'Syrian Arab Republic',
4137 'TW': 'Taiwan, Province of China',
4138 'TJ': 'Tajikistan',
4139 'TZ': 'Tanzania, United Republic of',
4140 'TH': 'Thailand',
4141 'TL': 'Timor-Leste',
4142 'TG': 'Togo',
4143 'TK': 'Tokelau',
4144 'TO': 'Tonga',
4145 'TT': 'Trinidad and Tobago',
4146 'TN': 'Tunisia',
4147 'TR': 'Turkey',
4148 'TM': 'Turkmenistan',
4149 'TC': 'Turks and Caicos Islands',
4150 'TV': 'Tuvalu',
4151 'UG': 'Uganda',
4152 'UA': 'Ukraine',
4153 'AE': 'United Arab Emirates',
4154 'GB': 'United Kingdom',
4155 'US': 'United States',
4156 'UM': 'United States Minor Outlying Islands',
4157 'UY': 'Uruguay',
4158 'UZ': 'Uzbekistan',
4159 'VU': 'Vanuatu',
4160 'VE': 'Venezuela, Bolivarian Republic of',
4161 'VN': 'Viet Nam',
4162 'VG': 'Virgin Islands, British',
4163 'VI': 'Virgin Islands, U.S.',
4164 'WF': 'Wallis and Futuna',
4165 'EH': 'Western Sahara',
4166 'YE': 'Yemen',
4167 'ZM': 'Zambia',
4168 'ZW': 'Zimbabwe',
4169 }
4170
4171 @classmethod
4172 def short2full(cls, code):
4173 """Convert an ISO 3166-2 country code to the corresponding full name"""
4174 return cls._country_map.get(code.upper())
4175
4176
86e5f3ed 4177class GeoUtils:
773f291d
S
4178 # Major IPv4 address blocks per country
4179 _country_ip_map = {
53896ca5 4180 'AD': '46.172.224.0/19',
773f291d
S
4181 'AE': '94.200.0.0/13',
4182 'AF': '149.54.0.0/17',
4183 'AG': '209.59.64.0/18',
4184 'AI': '204.14.248.0/21',
4185 'AL': '46.99.0.0/16',
4186 'AM': '46.70.0.0/15',
4187 'AO': '105.168.0.0/13',
53896ca5
S
4188 'AP': '182.50.184.0/21',
4189 'AQ': '23.154.160.0/24',
773f291d
S
4190 'AR': '181.0.0.0/12',
4191 'AS': '202.70.112.0/20',
53896ca5 4192 'AT': '77.116.0.0/14',
773f291d
S
4193 'AU': '1.128.0.0/11',
4194 'AW': '181.41.0.0/18',
53896ca5
S
4195 'AX': '185.217.4.0/22',
4196 'AZ': '5.197.0.0/16',
773f291d
S
4197 'BA': '31.176.128.0/17',
4198 'BB': '65.48.128.0/17',
4199 'BD': '114.130.0.0/16',
4200 'BE': '57.0.0.0/8',
53896ca5 4201 'BF': '102.178.0.0/15',
773f291d
S
4202 'BG': '95.42.0.0/15',
4203 'BH': '37.131.0.0/17',
4204 'BI': '154.117.192.0/18',
4205 'BJ': '137.255.0.0/16',
53896ca5 4206 'BL': '185.212.72.0/23',
773f291d
S
4207 'BM': '196.12.64.0/18',
4208 'BN': '156.31.0.0/16',
4209 'BO': '161.56.0.0/16',
4210 'BQ': '161.0.80.0/20',
53896ca5 4211 'BR': '191.128.0.0/12',
773f291d
S
4212 'BS': '24.51.64.0/18',
4213 'BT': '119.2.96.0/19',
4214 'BW': '168.167.0.0/16',
4215 'BY': '178.120.0.0/13',
4216 'BZ': '179.42.192.0/18',
4217 'CA': '99.224.0.0/11',
4218 'CD': '41.243.0.0/16',
53896ca5
S
4219 'CF': '197.242.176.0/21',
4220 'CG': '160.113.0.0/16',
773f291d 4221 'CH': '85.0.0.0/13',
53896ca5 4222 'CI': '102.136.0.0/14',
773f291d
S
4223 'CK': '202.65.32.0/19',
4224 'CL': '152.172.0.0/14',
53896ca5 4225 'CM': '102.244.0.0/14',
773f291d
S
4226 'CN': '36.128.0.0/10',
4227 'CO': '181.240.0.0/12',
4228 'CR': '201.192.0.0/12',
4229 'CU': '152.206.0.0/15',
4230 'CV': '165.90.96.0/19',
4231 'CW': '190.88.128.0/17',
53896ca5 4232 'CY': '31.153.0.0/16',
773f291d
S
4233 'CZ': '88.100.0.0/14',
4234 'DE': '53.0.0.0/8',
4235 'DJ': '197.241.0.0/17',
4236 'DK': '87.48.0.0/12',
4237 'DM': '192.243.48.0/20',
4238 'DO': '152.166.0.0/15',
4239 'DZ': '41.96.0.0/12',
4240 'EC': '186.68.0.0/15',
4241 'EE': '90.190.0.0/15',
4242 'EG': '156.160.0.0/11',
4243 'ER': '196.200.96.0/20',
4244 'ES': '88.0.0.0/11',
4245 'ET': '196.188.0.0/14',
4246 'EU': '2.16.0.0/13',
4247 'FI': '91.152.0.0/13',
4248 'FJ': '144.120.0.0/16',
53896ca5 4249 'FK': '80.73.208.0/21',
773f291d
S
4250 'FM': '119.252.112.0/20',
4251 'FO': '88.85.32.0/19',
4252 'FR': '90.0.0.0/9',
4253 'GA': '41.158.0.0/15',
4254 'GB': '25.0.0.0/8',
4255 'GD': '74.122.88.0/21',
4256 'GE': '31.146.0.0/16',
4257 'GF': '161.22.64.0/18',
4258 'GG': '62.68.160.0/19',
53896ca5
S
4259 'GH': '154.160.0.0/12',
4260 'GI': '95.164.0.0/16',
773f291d
S
4261 'GL': '88.83.0.0/19',
4262 'GM': '160.182.0.0/15',
4263 'GN': '197.149.192.0/18',
4264 'GP': '104.250.0.0/19',
4265 'GQ': '105.235.224.0/20',
4266 'GR': '94.64.0.0/13',
4267 'GT': '168.234.0.0/16',
4268 'GU': '168.123.0.0/16',
4269 'GW': '197.214.80.0/20',
4270 'GY': '181.41.64.0/18',
4271 'HK': '113.252.0.0/14',
4272 'HN': '181.210.0.0/16',
4273 'HR': '93.136.0.0/13',
4274 'HT': '148.102.128.0/17',
4275 'HU': '84.0.0.0/14',
4276 'ID': '39.192.0.0/10',
4277 'IE': '87.32.0.0/12',
4278 'IL': '79.176.0.0/13',
4279 'IM': '5.62.80.0/20',
4280 'IN': '117.192.0.0/10',
4281 'IO': '203.83.48.0/21',
4282 'IQ': '37.236.0.0/14',
4283 'IR': '2.176.0.0/12',
4284 'IS': '82.221.0.0/16',
4285 'IT': '79.0.0.0/10',
4286 'JE': '87.244.64.0/18',
4287 'JM': '72.27.0.0/17',
4288 'JO': '176.29.0.0/16',
53896ca5 4289 'JP': '133.0.0.0/8',
773f291d
S
4290 'KE': '105.48.0.0/12',
4291 'KG': '158.181.128.0/17',
4292 'KH': '36.37.128.0/17',
4293 'KI': '103.25.140.0/22',
4294 'KM': '197.255.224.0/20',
53896ca5 4295 'KN': '198.167.192.0/19',
773f291d
S
4296 'KP': '175.45.176.0/22',
4297 'KR': '175.192.0.0/10',
4298 'KW': '37.36.0.0/14',
4299 'KY': '64.96.0.0/15',
4300 'KZ': '2.72.0.0/13',
4301 'LA': '115.84.64.0/18',
4302 'LB': '178.135.0.0/16',
53896ca5 4303 'LC': '24.92.144.0/20',
773f291d
S
4304 'LI': '82.117.0.0/19',
4305 'LK': '112.134.0.0/15',
53896ca5 4306 'LR': '102.183.0.0/16',
773f291d
S
4307 'LS': '129.232.0.0/17',
4308 'LT': '78.56.0.0/13',
4309 'LU': '188.42.0.0/16',
4310 'LV': '46.109.0.0/16',
4311 'LY': '41.252.0.0/14',
4312 'MA': '105.128.0.0/11',
4313 'MC': '88.209.64.0/18',
4314 'MD': '37.246.0.0/16',
4315 'ME': '178.175.0.0/17',
4316 'MF': '74.112.232.0/21',
4317 'MG': '154.126.0.0/17',
4318 'MH': '117.103.88.0/21',
4319 'MK': '77.28.0.0/15',
4320 'ML': '154.118.128.0/18',
4321 'MM': '37.111.0.0/17',
4322 'MN': '49.0.128.0/17',
4323 'MO': '60.246.0.0/16',
4324 'MP': '202.88.64.0/20',
4325 'MQ': '109.203.224.0/19',
4326 'MR': '41.188.64.0/18',
4327 'MS': '208.90.112.0/22',
4328 'MT': '46.11.0.0/16',
4329 'MU': '105.16.0.0/12',
4330 'MV': '27.114.128.0/18',
53896ca5 4331 'MW': '102.70.0.0/15',
773f291d
S
4332 'MX': '187.192.0.0/11',
4333 'MY': '175.136.0.0/13',
4334 'MZ': '197.218.0.0/15',
4335 'NA': '41.182.0.0/16',
4336 'NC': '101.101.0.0/18',
4337 'NE': '197.214.0.0/18',
4338 'NF': '203.17.240.0/22',
4339 'NG': '105.112.0.0/12',
4340 'NI': '186.76.0.0/15',
4341 'NL': '145.96.0.0/11',
4342 'NO': '84.208.0.0/13',
4343 'NP': '36.252.0.0/15',
4344 'NR': '203.98.224.0/19',
4345 'NU': '49.156.48.0/22',
4346 'NZ': '49.224.0.0/14',
4347 'OM': '5.36.0.0/15',
4348 'PA': '186.72.0.0/15',
4349 'PE': '186.160.0.0/14',
4350 'PF': '123.50.64.0/18',
4351 'PG': '124.240.192.0/19',
4352 'PH': '49.144.0.0/13',
4353 'PK': '39.32.0.0/11',
4354 'PL': '83.0.0.0/11',
4355 'PM': '70.36.0.0/20',
4356 'PR': '66.50.0.0/16',
4357 'PS': '188.161.0.0/16',
4358 'PT': '85.240.0.0/13',
4359 'PW': '202.124.224.0/20',
4360 'PY': '181.120.0.0/14',
4361 'QA': '37.210.0.0/15',
53896ca5 4362 'RE': '102.35.0.0/16',
773f291d 4363 'RO': '79.112.0.0/13',
53896ca5 4364 'RS': '93.86.0.0/15',
773f291d 4365 'RU': '5.136.0.0/13',
53896ca5 4366 'RW': '41.186.0.0/16',
773f291d
S
4367 'SA': '188.48.0.0/13',
4368 'SB': '202.1.160.0/19',
4369 'SC': '154.192.0.0/11',
53896ca5 4370 'SD': '102.120.0.0/13',
773f291d 4371 'SE': '78.64.0.0/12',
53896ca5 4372 'SG': '8.128.0.0/10',
773f291d
S
4373 'SI': '188.196.0.0/14',
4374 'SK': '78.98.0.0/15',
53896ca5 4375 'SL': '102.143.0.0/17',
773f291d
S
4376 'SM': '89.186.32.0/19',
4377 'SN': '41.82.0.0/15',
53896ca5 4378 'SO': '154.115.192.0/18',
773f291d
S
4379 'SR': '186.179.128.0/17',
4380 'SS': '105.235.208.0/21',
4381 'ST': '197.159.160.0/19',
4382 'SV': '168.243.0.0/16',
4383 'SX': '190.102.0.0/20',
4384 'SY': '5.0.0.0/16',
4385 'SZ': '41.84.224.0/19',
4386 'TC': '65.255.48.0/20',
4387 'TD': '154.68.128.0/19',
4388 'TG': '196.168.0.0/14',
4389 'TH': '171.96.0.0/13',
4390 'TJ': '85.9.128.0/18',
4391 'TK': '27.96.24.0/21',
4392 'TL': '180.189.160.0/20',
4393 'TM': '95.85.96.0/19',
4394 'TN': '197.0.0.0/11',
4395 'TO': '175.176.144.0/21',
4396 'TR': '78.160.0.0/11',
4397 'TT': '186.44.0.0/15',
4398 'TV': '202.2.96.0/19',
4399 'TW': '120.96.0.0/11',
4400 'TZ': '156.156.0.0/14',
53896ca5
S
4401 'UA': '37.52.0.0/14',
4402 'UG': '102.80.0.0/13',
4403 'US': '6.0.0.0/8',
773f291d 4404 'UY': '167.56.0.0/13',
53896ca5 4405 'UZ': '84.54.64.0/18',
773f291d 4406 'VA': '212.77.0.0/19',
53896ca5 4407 'VC': '207.191.240.0/21',
773f291d 4408 'VE': '186.88.0.0/13',
53896ca5 4409 'VG': '66.81.192.0/20',
773f291d
S
4410 'VI': '146.226.0.0/16',
4411 'VN': '14.160.0.0/11',
4412 'VU': '202.80.32.0/20',
4413 'WF': '117.20.32.0/21',
4414 'WS': '202.4.32.0/19',
4415 'YE': '134.35.0.0/16',
4416 'YT': '41.242.116.0/22',
4417 'ZA': '41.0.0.0/11',
53896ca5
S
4418 'ZM': '102.144.0.0/13',
4419 'ZW': '102.177.192.0/18',
773f291d
S
4420 }
4421
4422 @classmethod
5f95927a
S
4423 def random_ipv4(cls, code_or_block):
4424 if len(code_or_block) == 2:
4425 block = cls._country_ip_map.get(code_or_block.upper())
4426 if not block:
4427 return None
4428 else:
4429 block = code_or_block
773f291d
S
4430 addr, preflen = block.split('/')
4431 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4432 addr_max = addr_min | (0xffffffff >> int(preflen))
18a0defa 4433 return compat_str(socket.inet_ntoa(
4248dad9 4434 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
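# Example: both a two-letter country code and an explicit CIDR block are accepted
# (the result is random, so no fixed output is shown):
#   >>> GeoUtils.random_ipv4('DE')          # some address within 53.0.0.0/8
#   >>> GeoUtils.random_ipv4('10.0.0.0/8')  # some address within the given block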
773f291d
S
4435
4436
91410c9b 4437class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2461f79d
PH
4438 def __init__(self, proxies=None):
4439 # Set default handlers
4440 for type in ('http', 'https'):
4441 setattr(self, '%s_open' % type,
4442 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4443 meth(r, proxy, type))
38e87f6c 4444 compat_urllib_request.ProxyHandler.__init__(self, proxies)
2461f79d 4445
91410c9b 4446 def proxy_open(self, req, proxy, type):
2461f79d 4447 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
4448 if req_proxy is not None:
4449 proxy = req_proxy
2461f79d
PH
4450 del req.headers['Ytdl-request-proxy']
4451
4452 if proxy == '__noproxy__':
4453 return None # No Proxy
51fb4995 4454 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188 4455 req.add_header('Ytdl-socks-proxy', proxy)
7a5c1cfe 4456 # yt-dlp's http/https handlers take care of wrapping the socket with SOCKS
71aff188 4457 return None
91410c9b
PH
4458 return compat_urllib_request.ProxyHandler.proxy_open(
4459 self, req, proxy, type)
5bc880b9
YCH
4460
4461
0a5445dd
YCH
4462# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4463# released into Public Domain
4464# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4465
4466def long_to_bytes(n, blocksize=0):
4467 """long_to_bytes(n:long, blocksize:int) : string
4468 Convert a long integer to a byte string.
4469
4470 If optional blocksize is given and greater than zero, pad the front of the
4471 byte string with binary zeros so that the length is a multiple of
4472 blocksize.
4473 """
4474 # after much testing, this algorithm was deemed to be the fastest
4475 s = b''
4476 n = int(n)
4477 while n > 0:
4478 s = compat_struct_pack('>I', n & 0xffffffff) + s
4479 n = n >> 32
4480 # strip off leading zeros
4481 for i in range(len(s)):
4482 if s[i] != b'\000'[0]:
4483 break
4484 else:
4485 # only happens when n == 0
4486 s = b'\000'
4487 i = 0
4488 s = s[i:]
4489 # add back some pad bytes. this could be done more efficiently w.r.t. the
4490 # de-padding being done above, but sigh...
4491 if blocksize > 0 and len(s) % blocksize:
4492 s = (blocksize - len(s) % blocksize) * b'\000' + s
4493 return s
4494
4495
4496def bytes_to_long(s):
4497 """bytes_to_long(string) : long
4498 Convert a byte string to a long integer.
4499
4500 This is (essentially) the inverse of long_to_bytes().
4501 """
4502 acc = 0
4503 length = len(s)
4504 if length % 4:
4505 extra = (4 - length % 4)
4506 s = b'\000' * extra + s
4507 length = length + extra
4508 for i in range(0, length, 4):
4509 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4510 return acc
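# Examples:
#   >>> long_to_bytes(65537)
#   b'\x01\x00\x01'
#   >>> bytes_to_long(b'\x01\x00\x01')
#   65537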
4511
4512
5bc880b9
YCH
4513def ohdave_rsa_encrypt(data, exponent, modulus):
4514 '''
4515 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4516
4517 Input:
4518 data: data to encrypt, bytes-like object
4519 exponent, modulus: parameter e and N of RSA algorithm, both integer
4520 Output: hex string of encrypted data
4521
4522 Limitation: supports one block encryption only
4523 '''
4524
4525 payload = int(binascii.hexlify(data[::-1]), 16)
4526 encrypted = pow(payload, exponent, modulus)
4527 return '%x' % encrypted
81bdc8fd
YCH
4528
4529
f48409c7
YCH
4530def pkcs1pad(data, length):
4531 """
4532 Padding input data with PKCS#1 scheme
4533
4534 @param {int[]} data input data
4535 @param {int} length target length
4536 @returns {int[]} padded data
4537 """
4538 if len(data) > length - 11:
4539 raise ValueError('Input data too long for PKCS#1 padding')
4540
4541 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4542 return [0, 2] + pseudo_random + [0] + data
4543
4544
5eb6bdce 4545def encode_base_n(num, n, table=None):
59f898b7 4546 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
59f898b7
YCH
4547 if not table:
4548 table = FULL_TABLE[:n]
4549
5eb6bdce
YCH
4550 if n > len(table):
4551 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4552
4553 if num == 0:
4554 return table[0]
4555
81bdc8fd
YCH
4556 ret = ''
4557 while num:
4558 ret = table[num % n] + ret
4559 num = num // n
4560 return ret
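# Examples:
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> encode_base_n(62, 62)
#   '10'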
f52354a8
YCH
4561
4562
4563def decode_packed_codes(code):
06b3fe29 4564 mobj = re.search(PACKED_CODES_RE, code)
a0566bbf 4565 obfuscated_code, base, count, symbols = mobj.groups()
f52354a8
YCH
4566 base = int(base)
4567 count = int(count)
4568 symbols = symbols.split('|')
4569 symbol_table = {}
4570
4571 while count:
4572 count -= 1
5eb6bdce 4573 base_n_count = encode_base_n(count, base)
f52354a8
YCH
4574 symbol_table[base_n_count] = symbols[count] or base_n_count
4575
4576 return re.sub(
4577 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
a0566bbf 4578 obfuscated_code)
e154c651 4579
4580
1ced2221
S
4581def caesar(s, alphabet, shift):
4582 if shift == 0:
4583 return s
4584 l = len(alphabet)
4585 return ''.join(
4586 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4587 for c in s)
4588
4589
4590def rot47(s):
4591 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
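# Examples (illustrative inputs):
#   >>> caesar('hello', 'abcdefghijklmnopqrstuvwxyz', 1)
#   'ifmmp'
#   >>> rot47(rot47('some printable text')) == 'some printable text'  # rot47 is its own inverse
#   True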
4592
4593
e154c651 4594def parse_m3u8_attributes(attrib):
4595 info = {}
4596 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4597 if val.startswith('"'):
4598 val = val[1:-1]
4599 info[key] = val
4600 return info
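# Example (illustrative attribute list):
#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.64001f"')
#   {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.64001f'}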
1143535d
YCH
4601
4602
4603def urshift(val, n):
4604 return val >> n if val >= 0 else (val + 0x100000000) >> n
d3f8e038
YCH
4605
4606
4607# Based on png2str() written by @gdkchan and improved by @yokrysty
067aa17e 4608# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
d3f8e038
YCH
4609def decode_png(png_data):
4610 # Reference: https://www.w3.org/TR/PNG/
4611 header = png_data[8:]
4612
4613 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
86e5f3ed 4614 raise OSError('Not a valid PNG file.')
d3f8e038
YCH
4615
4616 int_map = {1: '>B', 2: '>H', 4: '>I'}
4617 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4618
4619 chunks = []
4620
4621 while header:
4622 length = unpack_integer(header[:4])
4623 header = header[4:]
4624
4625 chunk_type = header[:4]
4626 header = header[4:]
4627
4628 chunk_data = header[:length]
4629 header = header[length:]
4630
4631 header = header[4:] # Skip CRC
4632
4633 chunks.append({
4634 'type': chunk_type,
4635 'length': length,
4636 'data': chunk_data
4637 })
4638
4639 ihdr = chunks[0]['data']
4640
4641 width = unpack_integer(ihdr[:4])
4642 height = unpack_integer(ihdr[4:8])
4643
4644 idat = b''
4645
4646 for chunk in chunks:
4647 if chunk['type'] == b'IDAT':
4648 idat += chunk['data']
4649
4650 if not idat:
86e5f3ed 4651 raise OSError('Unable to read PNG data.')
d3f8e038
YCH
4652
4653 decompressed_data = bytearray(zlib.decompress(idat))
4654
4655 stride = width * 3
4656 pixels = []
4657
4658 def _get_pixel(idx):
4659 x = idx % stride
4660 y = idx // stride
4661 return pixels[y][x]
4662
4663 for y in range(height):
4664 basePos = y * (1 + stride)
4665 filter_type = decompressed_data[basePos]
4666
4667 current_row = []
4668
4669 pixels.append(current_row)
4670
4671 for x in range(stride):
4672 color = decompressed_data[1 + basePos + x]
4673 basex = y * stride + x
4674 left = 0
4675 up = 0
4676
4677 if x > 2:
4678 left = _get_pixel(basex - 3)
4679 if y > 0:
4680 up = _get_pixel(basex - stride)
4681
4682 if filter_type == 1: # Sub
4683 color = (color + left) & 0xff
4684 elif filter_type == 2: # Up
4685 color = (color + up) & 0xff
4686 elif filter_type == 3: # Average
4687 color = (color + ((left + up) >> 1)) & 0xff
4688 elif filter_type == 4: # Paeth
4689 a = left
4690 b = up
4691 c = 0
4692
4693 if x > 2 and y > 0:
4694 c = _get_pixel(basex - stride - 3)
4695
4696 p = a + b - c
4697
4698 pa = abs(p - a)
4699 pb = abs(p - b)
4700 pc = abs(p - c)
4701
4702 if pa <= pb and pa <= pc:
4703 color = (color + a) & 0xff
4704 elif pb <= pc:
4705 color = (color + b) & 0xff
4706 else:
4707 color = (color + c) & 0xff
4708
4709 current_row.append(color)
4710
4711 return width, height, pixels
efa97bdc
YCH
4712
4713
4714def write_xattr(path, key, value):
6f7563be 4715 # Windows: Write xattrs to NTFS Alternate Data Streams:
4716 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4717 if compat_os_name == 'nt':
4718 assert ':' not in key
4719 assert os.path.exists(path)
efa97bdc
YCH
4720
4721 try:
6f7563be 4722 with open(f'{path}:{key}', 'wb') as f:
4723 f.write(value)
86e5f3ed 4724 except OSError as e:
efa97bdc 4725 raise XAttrMetadataError(e.errno, e.strerror)
6f7563be 4726 return
efa97bdc 4727
6f7563be 4728 # UNIX Method 1. Use xattrs/pyxattrs modules
4729 from .dependencies import xattr
efa97bdc 4730
6f7563be 4731 setxattr = None
4732 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
4733 # Unicode arguments are not supported in pyxattr until version 0.5.0
4734 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4735 if version_tuple(xattr.__version__) >= (0, 5, 0):
4736 setxattr = xattr.set
4737 elif xattr:
4738 setxattr = xattr.setxattr
efa97bdc 4739
6f7563be 4740 if setxattr:
4741 try:
4742 setxattr(path, key, value)
4743 except OSError as e:
4744 raise XAttrMetadataError(e.errno, e.strerror)
4745 return
efa97bdc 4746
6f7563be 4747 # UNIX Method 2. Use setfattr/xattr executables
4748 exe = ('setfattr' if check_executable('setfattr', ['--version'])
4749 else 'xattr' if check_executable('xattr', ['-h']) else None)
4750 if not exe:
4751 raise XAttrUnavailableError(
4752 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
4753 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
efa97bdc 4754
0f06bcd7 4755 value = value.decode()
6f7563be 4756 try:
4757 p = Popen(
4758 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
4759 stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4760 except OSError as e:
4761 raise XAttrMetadataError(e.errno, e.strerror)
4762 stderr = p.communicate_or_kill()[1].decode('utf-8', 'replace')
4763 if p.returncode:
4764 raise XAttrMetadataError(p.returncode, stderr)
0c265486
YCH
4765
4766
4767def random_birthday(year_field, month_field, day_field):
aa374bc7
AS
4768 start_date = datetime.date(1950, 1, 1)
4769 end_date = datetime.date(1995, 12, 31)
4770 offset = random.randint(0, (end_date - start_date).days)
4771 random_date = start_date + datetime.timedelta(offset)
0c265486 4772 return {
aa374bc7
AS
4773 year_field: str(random_date.year),
4774 month_field: str(random_date.month),
4775 day_field: str(random_date.day),
0c265486 4776 }
732044af 4777
c76eb41b 4778
732044af 4779# Templates for internet shortcut files, which are plain text files.
e5a998f3 4780DOT_URL_LINK_TEMPLATE = '''\
732044af 4781[InternetShortcut]
4782URL=%(url)s
e5a998f3 4783'''
732044af 4784
e5a998f3 4785DOT_WEBLOC_LINK_TEMPLATE = '''\
732044af 4786<?xml version="1.0" encoding="UTF-8"?>
4787<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4788<plist version="1.0">
4789<dict>
4790\t<key>URL</key>
4791\t<string>%(url)s</string>
4792</dict>
4793</plist>
e5a998f3 4794'''
732044af 4795
e5a998f3 4796DOT_DESKTOP_LINK_TEMPLATE = '''\
732044af 4797[Desktop Entry]
4798Encoding=UTF-8
4799Name=%(filename)s
4800Type=Link
4801URL=%(url)s
4802Icon=text-html
e5a998f3 4803'''
732044af 4804
08438d2c 4805LINK_TEMPLATES = {
4806 'url': DOT_URL_LINK_TEMPLATE,
4807 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4808 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4809}
4810
732044af 4811
4812def iri_to_uri(iri):
4813 """
4814 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4815
4816 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4817 """
4818
4819 iri_parts = compat_urllib_parse_urlparse(iri)
4820
4821 if '[' in iri_parts.netloc:
4822 raise ValueError('IPv6 URIs are not yet supported.')
4823 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4824
4825 # The `safe` argument values that the following code uses contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4826
4827 net_location = ''
4828 if iri_parts.username:
f9934b96 4829 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
732044af 4830 if iri_parts.password is not None:
f9934b96 4831 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
732044af 4832 net_location += '@'
4833
0f06bcd7 4834 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
732044af 4835 # The 'idna' encoding produces ASCII text.
4836 if iri_parts.port is not None and iri_parts.port != 80:
4837 net_location += ':' + str(iri_parts.port)
4838
f9934b96 4839 return urllib.parse.urlunparse(
732044af 4840 (iri_parts.scheme,
4841 net_location,
4842
f9934b96 4843 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
732044af 4844
4845 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
f9934b96 4846 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
732044af 4847
4848 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
f9934b96 4849 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
732044af 4850
f9934b96 4851 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
732044af 4852
4853 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
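# Example (illustrative IRI):
#   >>> iri_to_uri('https://example.com/föö?bär=1')
#   'https://example.com/f%C3%B6%C3%B6?b%C3%A4r=1'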
4854
4855
4856def to_high_limit_path(path):
4857 if sys.platform in ['win32', 'cygwin']:
4858 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
e5a998f3 4859 return '\\\\?\\' + os.path.abspath(path)
732044af 4860
4861 return path
76d321f6 4862
c76eb41b 4863
b868936c 4864def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
e0ddbd02 4865 val = traverse_obj(obj, *variadic(field))
4866 if val in ignore:
4867 return default
4868 return template % (func(val) if func else val)
00dd0cd5 4869
4870
4871def clean_podcast_url(url):
4872 return re.sub(r'''(?x)
4873 (?:
4874 (?:
4875 chtbl\.com/track|
4876 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
4877 play\.podtrac\.com
4878 )/[^/]+|
4879 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4880 flex\.acast\.com|
4881 pd(?:
4882 cn\.co| # https://podcorn.com/analytics-prefix/
4883 st\.fm # https://podsights.com/docs/
4884 )/e
4885 )/''', '', url)
ffcb8191
THD
4886
4887
4888_HEX_TABLE = '0123456789abcdef'
4889
4890
4891def random_uuidv4():
4892 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 4893
4894
4895def make_dir(path, to_screen=None):
4896 try:
4897 dn = os.path.dirname(path)
4898 if dn and not os.path.exists(dn):
4899 os.makedirs(dn)
4900 return True
86e5f3ed 4901 except OSError as err:
0202b52a 4902 if callable(to_screen):
4903 to_screen('unable to create directory ' + error_to_compat_str(err))
4904 return False
f74980cb 4905
4906
4907def get_executable_path():
c552ae88 4908 from zipimport import zipimporter
4909 if hasattr(sys, 'frozen'): # Running from PyInstaller
4910 path = os.path.dirname(sys.executable)
cfb0511d 4911 elif isinstance(__loader__, zipimporter): # Running from ZIP
c552ae88 4912 path = os.path.join(os.path.dirname(__file__), '../..')
4913 else:
4914 path = os.path.join(os.path.dirname(__file__), '..')
f74980cb 4915 return os.path.abspath(path)
4916
4917
2f567473 4918def load_plugins(name, suffix, namespace):
3ae5e797 4919 classes = {}
19a03940 4920 with contextlib.suppress(FileNotFoundError):
019a94f7
ÁS
4921 plugins_spec = importlib.util.spec_from_file_location(
4922 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
4923 plugins = importlib.util.module_from_spec(plugins_spec)
4924 sys.modules[plugins_spec.name] = plugins
4925 plugins_spec.loader.exec_module(plugins)
f74980cb 4926 for name in dir(plugins):
2f567473 4927 if name in namespace:
4928 continue
4929 if not name.endswith(suffix):
f74980cb 4930 continue
4931 klass = getattr(plugins, name)
3ae5e797 4932 classes[name] = namespace[name] = klass
f74980cb 4933 return classes
06167fbb 4934
4935
325ebc17 4936def traverse_obj(
352d63fd 4937 obj, *path_list, default=None, expected_type=None, get_all=True,
325ebc17 4938 casesense=True, is_user_input=False, traverse_string=False):
324ad820 4939 ''' Traverse nested list/dict/tuple
8f334380 4940 @param path_list A list of paths which are checked one by one.
19a03940 4941 Each path is a list of keys where each key is a:
4942 - None: Do nothing
4943 - string: A dictionary key
4944 - int: An index into a list
4945 - tuple: A list of keys all of which will be traversed
4946 - Ellipsis: Fetch all values in the object
4947 - Function: Takes the key and value as arguments
4948 and returns whether the key matches or not
325ebc17 4949 @param default Default value to return
352d63fd 4950 @param expected_type Only accept final value of this type (Can also be any callable)
4951 @param get_all Return all the values obtained from a path or only the first one
324ad820 4952 @param casesense Whether to consider dictionary keys as case sensitive
4953 @param is_user_input Whether the keys are generated from user input. If True,
4954 strings are converted to int/slice if necessary
4955 @param traverse_string Whether to traverse inside strings. If True, any
4956 non-compatible object will also be converted into a string
8f334380 4957 # TODO: Write tests
324ad820 4958 '''
325ebc17 4959 if not casesense:
dbf5416a 4960 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
8f334380 4961 path_list = (map(_lower, variadic(path)) for path in path_list)
4962
4963 def _traverse_obj(obj, path, _current_depth=0):
4964 nonlocal depth
4965 path = tuple(variadic(path))
4966 for i, key in enumerate(path):
1797b073 4967 if None in (key, obj):
4968 return obj
8f334380 4969 if isinstance(key, (list, tuple)):
4970 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
4971 key = ...
4972 if key is ...:
4973 obj = (obj.values() if isinstance(obj, dict)
4974 else obj if isinstance(obj, (list, tuple, LazyList))
4975 else str(obj) if traverse_string else [])
4976 _current_depth += 1
4977 depth = max(depth, _current_depth)
4978 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
2614f646 4979 elif callable(key):
4980 if isinstance(obj, (list, tuple, LazyList)):
4981 obj = enumerate(obj)
4982 elif isinstance(obj, dict):
4983 obj = obj.items()
4984 else:
4985 if not traverse_string:
4986 return None
4987 obj = str(obj)
4988 _current_depth += 1
4989 depth = max(depth, _current_depth)
e6f868a6 4990 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
575e17a1 4991 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
325ebc17 4992 obj = (obj.get(key) if casesense or (key in obj)
4993 else next((v for k, v in obj.items() if _lower(k) == key), None))
4994 else:
4995 if is_user_input:
4996 key = (int_or_none(key) if ':' not in key
4997 else slice(*map(int_or_none, key.split(':'))))
8f334380 4998 if key == slice(None):
575e17a1 4999 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
325ebc17 5000 if not isinstance(key, (int, slice)):
9fea350f 5001 return None
8f334380 5002 if not isinstance(obj, (list, tuple, LazyList)):
325ebc17 5003 if not traverse_string:
5004 return None
5005 obj = str(obj)
5006 try:
5007 obj = obj[key]
5008 except IndexError:
324ad820 5009 return None
325ebc17 5010 return obj
5011
352d63fd 5012 if isinstance(expected_type, type):
5013 type_test = lambda val: val if isinstance(val, expected_type) else None
5014 elif expected_type is not None:
5015 type_test = expected_type
5016 else:
5017 type_test = lambda val: val
5018
8f334380 5019 for path in path_list:
5020 depth = 0
5021 val = _traverse_obj(obj, path)
325ebc17 5022 if val is not None:
8f334380 5023 if depth:
5024 for _ in range(depth - 1):
6586bca9 5025 val = itertools.chain.from_iterable(v for v in val if v is not None)
352d63fd 5026 val = [v for v in map(type_test, val) if v is not None]
8f334380 5027 if val:
352d63fd 5028 return val if get_all else val[0]
5029 else:
5030 val = type_test(val)
5031 if val is not None:
8f334380 5032 return val
325ebc17 5033 return default
324ad820 5034
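# Usage sketch (made-up data, illustration only) of the path syntax described
# in the docstring above:
#   >>> info = {'playlist': {'entries': [{'id': 'a', 'title': 'A'}, {'id': 'b'}]}}
#   >>> traverse_obj(info, ('playlist', 'entries', 0, 'title'))
#   'A'
#   >>> traverse_obj(info, ('playlist', 'entries', ..., 'title'))   # Ellipsis: all values
#   ['A']
#   >>> traverse_obj(info, ('playlist', 'entries', 1, 'title'), default='untitled')
#   'untitled'
#   >>> traverse_obj(info, ('playlist', 'entries', lambda _, v: 'title' in v))  # callable filter
#   [{'id': 'a', 'title': 'A'}]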
5035
5036def traverse_dict(dictn, keys, casesense=True):
ee8dd27a 5037 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5038 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5039 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
6606817a 5040
5041
ff91cf74 5042def get_first(obj, keys, **kwargs):
5043 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5044
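# Convenience sketch: get_first searches every item of a list-like object and
# returns the first non-None hit, e.g. (made-up data)
#   >>> get_first([{'title': None}, {'title': 'works'}], 'title')
#   'works'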
5045
4b4b7f74 5046def variadic(x, allowed_types=(str, bytes, dict)):
cb89cfc1 5047 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
bd50a52b 5048
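# Behaviour sketch for variadic() above:
#   >>> variadic('abc')        # str is in allowed_types, so it gets wrapped
#   ('abc',)
#   >>> variadic(['a', 'b'])   # other iterables are passed through unchanged
#   ['a', 'b']
#   >>> variadic(None)
#   (None,)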
5049
3e9b66d7 5050def decode_base(value, digits):
5051 # This will convert the given base-x string to a scalar (int)
5052 table = {char: index for index, char in enumerate(digits)}
5053 result = 0
5054 base = len(digits)
5055 for chr in value:
5056 result *= base
5057 result += table[chr]
5058 return result
5059
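# Worked examples for decode_base() (illustration only):
#   >>> decode_base('1a', '0123456789abcdef')                      # hexadecimal
#   26
#   >>> decode_base('zz', '0123456789abcdefghijklmnopqrstuvwxyz')  # base 36
#   1295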
5060
5061def time_seconds(**kwargs):
5062 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5063 return t.timestamp()
5064
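# Behaviour sketch: the keyword arguments only select the fixed UTC offset used
# for the intermediate datetime (e.g. time_seconds(hours=9) for UTC+09:00);
# because that datetime is timezone-aware, .timestamp() is still the current
# Unix time.
#   >>> abs(time_seconds(hours=9) - time_seconds()) < 1
#   True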
5065
49fa4d9a 5066# create a JSON Web Signature (jws) with HS256 algorithm
5067# the resulting format is in JWS Compact Serialization
5068# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5069# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5070def jwt_encode_hs256(payload_data, key, headers={}):
5071 header_data = {
5072 'alg': 'HS256',
5073 'typ': 'JWT',
5074 }
5075 if headers:
5076 header_data.update(headers)
0f06bcd7 5077 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5078 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5079 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
49fa4d9a 5080 signature_b64 = base64.b64encode(h.digest())
5081 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5082 return token
819e0531 5083
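# Usage sketch (made-up key and payload): the result is bytes in JWS Compact
# Serialization, i.e. base64(header) + b'.' + base64(payload) + b'.' + base64(signature).
#   >>> token = jwt_encode_hs256({'sub': 'user', 'exp': 1700000000}, 'secret-key')
#   >>> token.count(b'.')
#   2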
5084
16b0d7e6 5085# can be extended in the future to verify the signature, parse the header and return the algorithm used if it's not HS256
5086def jwt_decode_hs256(jwt):
5087 header_b64, payload_b64, signature_b64 = jwt.split('.')
5088 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5089 return payload_data
5090
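# Round-trip sketch (made-up data): note that the signature is *not* verified.
#   >>> jwt_decode_hs256(jwt_encode_hs256({'sub': 'user'}, 'secret-key').decode())
#   {'sub': 'user'}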
5091
819e0531 5092def supports_terminal_sequences(stream):
5093 if compat_os_name == 'nt':
e3c7d495 5094 from .compat import WINDOWS_VT_MODE # Must be imported locally
5095 if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
819e0531 5096 return False
5097 elif not os.getenv('TERM'):
5098 return False
5099 try:
5100 return stream.isatty()
5101 except BaseException:
5102 return False
5103
5104
ec11a9f4 5105_terminal_sequences_re = re.compile('\033\\[[^m]+m')
5106
5107
5108def remove_terminal_sequences(string):
5109 return _terminal_sequences_re.sub('', string)
5110
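# e.g. (illustration only):
#   >>> remove_terminal_sequences('\033[0;31mERROR\033[0m done')
#   'ERROR done'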
5111
5112def number_of_digits(number):
5113 return len('%d' % number)
34921b43 5114
5115
5116def join_nonempty(*values, delim='-', from_dict=None):
5117 if from_dict is not None:
c586f9e8 5118 values = map(from_dict.get, values)
34921b43 5119 return delim.join(map(str, filter(None, values)))
06e57990 5120
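# Behaviour sketch: falsy values are dropped before joining.
#   >>> join_nonempty('mp4', None, 1080, '', delim='-')
#   'mp4-1080'
#   >>> join_nonempty('width', 'height', from_dict={'width': 1920, 'height': 0})
#   '1920'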
5121
27231526 5122def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5123 """
5124 Find the largest format dimensions in terms of video width and, for each thumbnail:
5125 * Modify the URL: Match the width with the provided regex and replace it with the largest format width
5126 * Update dimensions
5127
5128 This function is useful with video services that scale the provided thumbnails on demand
5129 """
5130 _keys = ('width', 'height')
5131 max_dimensions = max(
86e5f3ed 5132 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
27231526 5133 default=(0, 0))
5134 if not max_dimensions[0]:
5135 return thumbnails
5136 return [
5137 merge_dicts(
5138 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5139 dict(zip(_keys, max_dimensions)), thumbnail)
5140 for thumbnail in thumbnails
5141 ]
5142
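# Usage sketch (hypothetical data and regex): with a largest format of
# 1920x1080 and thumbnail URLs that embed their width after "/w",
#   scale_thumbnails_to_max_format_width(
#       formats, [{'url': 'https://example.com/w320/thumb.jpg'}], r'(?<=/w)\d+')
# would return thumbnails whose URL width component is rewritten to 1920 and
# whose 'width'/'height' are set to 1920/1080.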
5143
93c8410d 5144def parse_http_range(range):
5145 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5146 if not range:
5147 return None, None, None
5148 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5149 if not crg:
5150 return None, None, None
5151 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5152
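# Worked examples (illustration only):
#   >>> parse_http_range('bytes 0-499/1234')   # Content-Range style
#   (0, 499, 1234)
#   >>> parse_http_range('bytes=500-')         # open-ended Range request
#   (500, None, None)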
5153
06e57990 5154class Config:
5155 own_args = None
5156 filename = None
5157 __initialized = False
5158
5159 def __init__(self, parser, label=None):
5160 self._parser, self.label = parser, label
5161 self._loaded_paths, self.configs = set(), []
5162
5163 def init(self, args=None, filename=None):
5164 assert not self.__initialized
65662dff 5165 directory = ''
06e57990 5166 if filename:
5167 location = os.path.realpath(filename)
65662dff 5168 directory = os.path.dirname(location)
06e57990 5169 if location in self._loaded_paths:
5170 return False
5171 self._loaded_paths.add(location)
5172
5173 self.__initialized = True
5174 self.own_args, self.filename = args, filename
5175 for location in self._parser.parse_args(args)[0].config_locations or []:
65662dff 5176 location = os.path.join(directory, expand_path(location))
06e57990 5177 if os.path.isdir(location):
5178 location = os.path.join(location, 'yt-dlp.conf')
5179 if not os.path.exists(location):
5180 self._parser.error(f'config location {location} does not exist')
5181 self.append_config(self.read_file(location), location)
5182 return True
5183
5184 def __str__(self):
5185 label = join_nonempty(
5186 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5187 delim=' ')
5188 return join_nonempty(
5189 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5190 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5191 delim='\n')
5192
5193 @staticmethod
5194 def read_file(filename, default=[]):
5195 try:
5196 optionf = open(filename)
86e5f3ed 5197 except OSError:
06e57990 5198 return default # silently skip if file is not present
5199 try:
5200 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5201 contents = optionf.read()
f9934b96 5202 res = shlex.split(contents, comments=True)
06e57990 5203 finally:
5204 optionf.close()
5205 return res
5206
5207 @staticmethod
5208 def hide_login_info(opts):
86e5f3ed 5209 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
06e57990 5210 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5211
5212 def _scrub_eq(o):
5213 m = eqre.match(o)
5214 if m:
5215 return m.group('key') + '=PRIVATE'
5216 else:
5217 return o
5218
5219 opts = list(map(_scrub_eq, opts))
5220 for idx, opt in enumerate(opts):
5221 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5222 opts[idx + 1] = 'PRIVATE'
5223 return opts
5224
5225 def append_config(self, *args, label=None):
5226 config = type(self)(self._parser, label)
5227 config._loaded_paths = self._loaded_paths
5228 if config.init(*args):
5229 self.configs.append(config)
5230
5231 @property
5232 def all_args(self):
5233 for config in reversed(self.configs):
5234 yield from config.all_args
5235 yield from self.own_args or []
5236
5237 def parse_args(self):
19a03940 5238 return self._parser.parse_args(self.all_args)
da42679b 5239
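# Illustration of the credential scrubbing performed by Config.hide_login_info
# (values are made up):
#   >>> Config.hide_login_info(['-u', 'me@example.com', '--password=hunter2', '-v'])
#   ['-u', 'PRIVATE', '--password=PRIVATE', '-v']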
5240
5241class WebSocketsWrapper():
5242 """Wraps websockets module to use in non-async scopes"""
abfecb7b 5243 pool = None
da42679b 5244
3cea3edd 5245 def __init__(self, url, headers=None, connect=True):
059bc4db 5246 self.loop = asyncio.new_event_loop()
9cd08050 5247 # XXX: "loop" is deprecated
5248 self.conn = websockets.connect(
5249 url, extra_headers=headers, ping_interval=None,
5250 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
3cea3edd 5251 if connect:
5252 self.__enter__()
15dfb392 5253 atexit.register(self.__exit__, None, None, None)
da42679b 5254
5255 def __enter__(self):
3cea3edd 5256 if not self.pool:
9cd08050 5257 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
da42679b 5258 return self
5259
5260 def send(self, *args):
5261 self.run_with_loop(self.pool.send(*args), self.loop)
5262
5263 def recv(self, *args):
5264 return self.run_with_loop(self.pool.recv(*args), self.loop)
5265
5266 def __exit__(self, type, value, traceback):
5267 try:
5268 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5269 finally:
5270 self.loop.close()
15dfb392 5271 self._cancel_all_tasks(self.loop)
da42679b 5272
5273 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5274# for contributors: If any new library that uses asyncio needs to be run in a non-async scope, move these functions out of this class
5275 @staticmethod
5276 def run_with_loop(main, loop):
059bc4db 5277 if not asyncio.iscoroutine(main):
da42679b 5278 raise ValueError(f'a coroutine was expected, got {main!r}')
5279
5280 try:
5281 return loop.run_until_complete(main)
5282 finally:
5283 loop.run_until_complete(loop.shutdown_asyncgens())
5284 if hasattr(loop, 'shutdown_default_executor'):
5285 loop.run_until_complete(loop.shutdown_default_executor())
5286
5287 @staticmethod
5288 def _cancel_all_tasks(loop):
059bc4db 5289 to_cancel = asyncio.all_tasks(loop)
da42679b 5290
5291 if not to_cancel:
5292 return
5293
5294 for task in to_cancel:
5295 task.cancel()
5296
9cd08050 5297 # XXX: "loop" is removed in python 3.10+
da42679b 5298 loop.run_until_complete(
059bc4db 5299 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
da42679b 5300
5301 for task in to_cancel:
5302 if task.cancelled():
5303 continue
5304 if task.exception() is not None:
5305 loop.call_exception_handler({
5306 'message': 'unhandled exception during asyncio.run() shutdown',
5307 'exception': task.exception(),
5308 'task': task,
5309 })
5310
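# Usage sketch (hypothetical URL; requires a reachable websocket endpoint):
#   ws = WebSocketsWrapper('wss://example.com/socket',
#                          headers={'Origin': 'https://example.com'})
#   ws.send('{"op": "subscribe"}')
#   reply = ws.recv()
#   ws.__exit__(None, None, None)  # or rely on the atexit hook set in __init__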
5311
8b7539d2 5312def merge_headers(*dicts):
08d30158 5313 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
76aa9913 5314 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
28787f16 5315
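# Behaviour sketch: keys are title-cased, so differently-cased duplicates
# collapse and later dicts win.
#   >>> merge_headers({'user-agent': 'A', 'Accept': '*/*'}, {'User-Agent': 'B'})
#   {'User-Agent': 'B', 'Accept': '*/*'}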
5316
5317class classproperty:
5318 def __init__(self, f):
82d02080 5319 functools.update_wrapper(self, f)
28787f16 5320 self.f = f
5321
5322 def __get__(self, _, cls):
5323 return self.f(cls)
19a03940 5324
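# Usage sketch (hypothetical class): the wrapped function receives the class,
# so the value is readable from the class itself without instantiation.
#   class _Example:
#       @classproperty
#       def name(cls):
#           return cls.__name__
#   _Example.name  # -> '_Example'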
5325
591bb9d3 5326class Namespace:
5327 """Immutable namespace"""
591bb9d3 5328
5329 def __init__(self, **kwargs):
5330 self._dict = kwargs
5331
5332 def __getattr__(self, attr):
5333 return self._dict[attr]
5334
7896214c 5335 def __contains__(self, item):
5336 return item in self._dict.values()
5337
5338 def __iter__(self):
5339 return iter(self._dict.items())
5340
591bb9d3 5341 def __repr__(self):
7896214c 5342 return f'{type(self).__name__}({", ".join(f"{k}={v}" for k, v in self)})'
9b8ee23b 5343
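# Usage sketch (made-up values): attribute access reads the stored dict,
# membership tests the *values*, and iteration yields (key, value) pairs.
#   >>> colors = Namespace(RED='\033[31m', RESET='\033[0m')
#   >>> colors.RED
#   '\x1b[31m'
#   >>> '\033[0m' in colors
#   True
#   >>> dict(colors)
#   {'RED': '\x1b[31m', 'RESET': '\x1b[0m'}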
5344
5345# Deprecated
5346has_certifi = bool(certifi)
5347has_websockets = bool(websockets)