]> jfr.im git - yt-dlp.git/blame - yt_dlp/utils.py
Fix `--date today`
[yt-dlp.git] / yt_dlp / utils.py
CommitLineData
cc52de43 1#!/usr/bin/env python3
15dfb392 2import atexit
1e399778 3import base64
5bc880b9 4import binascii
912b38b4 5import calendar
676eb3f2 6import codecs
c380cc28 7import collections
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96 10import datetime
0c265486 11import email.header
f8271158 12import email.utils
f45c185f 13import errno
be4a824d 14import functools
d77c3dfd 15import gzip
49fa4d9a
N
16import hashlib
17import hmac
019a94f7 18import importlib.util
03f9daab 19import io
79a2e94e 20import itertools
f4bfd65f 21import json
d77c3dfd 22import locale
02dbf93f 23import math
f8271158 24import mimetypes
347de493 25import operator
d77c3dfd 26import os
c496ca96 27import platform
773f291d 28import random
d77c3dfd 29import re
f8271158 30import shlex
c496ca96 31import socket
79a2e94e 32import ssl
1c088fa8 33import subprocess
d77c3dfd 34import sys
181c8655 35import tempfile
c380cc28 36import time
01951dda 37import traceback
f8271158 38import urllib.parse
bcf89ce6 39import xml.etree.ElementTree
d77c3dfd 40import zlib
d77c3dfd 41
8c25f81b 42from .compat import (
1e9969f4 43 asyncio,
8c25f81b 44 compat_chr,
1bab3437 45 compat_cookiejar,
36e6f62c 46 compat_etree_fromstring,
51098426 47 compat_expanduser,
8c25f81b 48 compat_html_entities,
55b2f099 49 compat_html_entities_html5,
f8271158 50 compat_HTMLParseError,
51 compat_HTMLParser,
be4a824d 52 compat_http_client,
f8271158 53 compat_HTTPError,
efa97bdc 54 compat_os_name,
8c25f81b 55 compat_parse_qs,
702ccf2d 56 compat_shlex_quote,
8c25f81b 57 compat_str,
edaa23f8 58 compat_struct_pack,
d3f8e038 59 compat_struct_unpack,
8c25f81b 60 compat_urllib_error,
f8271158 61 compat_urllib_parse_unquote_plus,
15707c7e 62 compat_urllib_parse_urlencode,
8c25f81b
PH
63 compat_urllib_parse_urlparse,
64 compat_urllib_request,
65 compat_urlparse,
66)
9b8ee23b 67from .dependencies import brotli, certifi, websockets
f8271158 68from .socks import ProxyType, sockssocket
71aff188 69
4644ac55 70
51fb4995
YCH
def register_socks_protocols():
    """Teach urllib's URL splitter about the SOCKS proxy schemes.

    urlsplit() only parses the netloc of schemes listed in
    urlparse.uses_netloc (https://bugs.python.org/issue7904), so the SOCKS
    schemes must be appended there for proxy URLs to be handled correctly.
    """
    known_schemes = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in known_schemes:
            known_schemes.append(scheme)
78
79
468e2e92
FV
# This is not clearly defined otherwise
# (the type object of compiled regular expression patterns)
compiled_regex_type = type(re.compile(''))
82
f7a147e3
S
83
def random_user_agent():
    """Return a realistic Chrome-on-Windows User-Agent string.

    The Chrome version component is picked at random from a list of real
    released versions, so repeated calls yield varying but plausible UAs.
    """
    template = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    chrome_versions = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return template % random.choice(chrome_versions)
127
128
4390d5ec 129SUPPORTED_ENCODINGS = [
130 'gzip', 'deflate'
131]
9b8ee23b 132if brotli:
4390d5ec 133 SUPPORTED_ENCODINGS.append('br')
134
3e669f36 135std_headers = {
f7a147e3 136 'User-Agent': random_user_agent(),
59ae15a5 137 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
59ae15a5 138 'Accept-Language': 'en-us,en;q=0.5',
b1156c1e 139 'Sec-Fetch-Mode': 'navigate',
3e669f36 140}
f427df17 141
5f6a1245 142
fb37eb25
S
# Canned User-Agent strings for impersonating specific browsers
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Sentinel meaning "no default was given" where None is itself a valid value
NO_DEFAULT = object()
149
7105440c
YCH
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names keyed by language code, used when parsing non-English dates
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}
a942d6cb 160
a7aaa398
S
# File extensions recognised as media files (video, audio and manifest formats)
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')
175
c587cbb7 176# needed for sanitizing filenames in restricted mode
c8827027 177ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
fd35d8cd
JW
178 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
179 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
c587cbb7 180
46f59e89
S
# strptime() formats tried in order when parsing free-form date strings
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# additional formats for locales that write the day before the month
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# additional formats for locales that write the month before the day
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
244
06b3fe29 245PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
22f5f5c6 246JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
06b3fe29 247
1d485a1a 248NUMBER_RE = r'\d+(?:\.\d+)?'
249
7105440c 250
d77c3dfd 251def preferredencoding():
59ae15a5 252 """Get preferred encoding.
d77c3dfd 253
59ae15a5
PH
254 Returns the best encoding scheme for the system, based on
255 locale.getpreferredencoding() and some further tweaks.
256 """
257 try:
258 pref = locale.getpreferredencoding()
28e614de 259 'TEST'.encode(pref)
70a1165b 260 except Exception:
59ae15a5 261 pref = 'UTF-8'
bae611f2 262
59ae15a5 263 return pref
d77c3dfd 264
f4bfd65f 265
181c8655 266def write_json_file(obj, fn):
1394646a 267 """ Encode obj as JSON and write it to fn, atomically if possible """
181c8655 268
cfb0511d 269 tf = tempfile.NamedTemporaryFile(
270 prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
271 suffix='.tmp', delete=False, mode='w', encoding='utf-8')
181c8655
PH
272
273 try:
274 with tf:
45d86abe 275 json.dump(obj, tf, ensure_ascii=False)
1394646a
IK
276 if sys.platform == 'win32':
277 # Need to remove existing file on Windows, else os.rename raises
278 # WindowsError or FileExistsError.
19a03940 279 with contextlib.suppress(OSError):
1394646a 280 os.unlink(fn)
19a03940 281 with contextlib.suppress(OSError):
9cd5f54e
R
282 mask = os.umask(0)
283 os.umask(mask)
284 os.chmod(tf.name, 0o666 & ~mask)
181c8655 285 os.rename(tf.name, fn)
70a1165b 286 except Exception:
19a03940 287 with contextlib.suppress(OSError):
181c8655 288 os.remove(tf.name)
181c8655
PH
289 raise
290
291
cfb0511d 292def find_xpath_attr(node, xpath, key, val=None):
293 """ Find the xpath xpath[@key=val] """
294 assert re.match(r'^[a-zA-Z_-]+$', key)
86e5f3ed 295 expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
cfb0511d 296 return node.find(expr)
59ae56fa 297
d7e66d39
JMF
# Historical helper: Python 2.6's xml.etree.ElementTree methods did not
# support the namespace parameter, so prefixes are expanded manually
5f6a1245
JW
300
301
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand namespace prefixes in an XPath expression.

    Each path component of the form 'prefix:tag' is rewritten to
    '{uri}tag' (ElementTree "Clark notation") using ns_map; components
    without a prefix are kept unchanged.

    @param path     XPath expression with '/'-separated components
    @param ns_map   dict mapping prefixes to namespace URIs
    Raises KeyError if a prefix is missing from ns_map.
    """
    replaced = []
    for c in path.split('/'):
        # maxsplit=1: only the first colon separates the prefix (a robust
        # generalization — the original unbounded split raised ValueError
        # on components containing more than one colon)
        parts = c.split(':', 1)
        if len(parts) == 1:
            replaced.append(parts[0])
        else:
            ns, tag = parts
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
312
d77c3dfd 313
a41fb80c 314def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
578c0745 315 def _find_xpath(xpath):
f9934b96 316 return node.find(xpath)
578c0745
S
317
318 if isinstance(xpath, (str, compat_str)):
319 n = _find_xpath(xpath)
320 else:
321 for xp in xpath:
322 n = _find_xpath(xp)
323 if n is not None:
324 break
d74bebd5 325
8e636da4 326 if n is None:
bf42a990
S
327 if default is not NO_DEFAULT:
328 return default
329 elif fatal:
bf0ff932
PH
330 name = xpath if name is None else name
331 raise ExtractorError('Could not find XML element %s' % name)
332 else:
333 return None
a41fb80c
S
334 return n
335
336
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's text content."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is not None:
        return n.text
    # element exists but has no text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
a41fb80c
S
350
351
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Find xpath[@key] and return that attribute's value."""
    n = find_xpath_attr(node, xpath, key)
    if n is not None:
        return n.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = f'{xpath}[@{key}]' if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
bf0ff932
PH
363
364
9e6dd238 365def get_element_by_id(id, html):
43e8fafd 366 """Return the content of the tag with the specified ID in the passed HTML document"""
611c1dd9 367 return get_element_by_attribute('id', id, html)
43e8fafd 368
12ea2f30 369
6f32a0b5
ZM
def get_element_html_by_id(id, html):
    """Return the full HTML of the element whose id attribute equals *id*."""
    return get_element_html_by_attribute('id', id, html)
373
374
84c237fb 375def get_element_by_class(class_name, html):
2af12ad9
TC
376 """Return the content of the first tag with the specified class in the passed HTML document"""
377 retval = get_elements_by_class(class_name, html)
378 return retval[0] if retval else None
379
380
6f32a0b5
ZM
def get_element_html_by_class(class_name, html):
    """Return the full HTML of the first element carrying *class_name*."""
    matches = get_elements_html_by_class(class_name, html)
    return matches[0] if matches else None
385
386
2af12ad9
TC
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the inner content of the first element with attribute == value."""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    return matches[0] if matches else None
390
391
6f32a0b5
ZM
def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    """Return the full HTML of the first element with attribute == value."""
    matches = get_elements_html_by_attribute(attribute, value, html, escape_value)
    return matches[0] if matches else None
395
396
2af12ad9
TC
def get_elements_by_class(class_name, html):
    """Return the inner content of every element carrying *class_name*."""
    # class attributes hold whitespace-separated lists, hence the \b anchors
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
402
403
6f32a0b5
ZM
def get_elements_html_by_class(class_name, html):
    """Return the full HTML of every element carrying *class_name*."""
    # class attributes hold whitespace-separated lists, hence the \b anchors
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
409
410
def get_elements_by_attribute(*args, **kwargs):
    """Return the inner content of every element matching the given attribute."""
    return [text for text, _html in get_elements_text_and_html_by_attribute(*args, **kwargs)]
414
415
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the full HTML of every element matching the given attribute."""
    return [html for _text, html in get_elements_text_and_html_by_attribute(*args, **kwargs)]
419
420
def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    # require surrounding quotes in the document unless the value contains a
    # character that would have forced quoting anyway
    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # matches the opening tag up to and including the target attribute; the
    # complete element is then extracted by get_element_text_and_html_by_tag
    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        # strip one level of surrounding quotes, then decode HTML entities
        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
a921f407 444
c5229f39 445
6f32a0b5
ZM
class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        # stack of currently-open tags; inspected by callers as well
        self.tagstack = collections.deque()
        super().__init__()

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return after raising
        # HTMLBreakOnClosingTagException, so data remains buffered; we have no
        # further interest in it, thus override close() to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # pop until the matching opening tag is found
        while self.tagstack:
            if self.tagstack.pop() == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()
486
487
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its' content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        # str.index that raises a parse error instead of ValueError
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # feed only the opening tag so the parser's stack holds exactly it
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            # advance closing-tag by closing-tag; nested same-named tags keep
            # the parser's stack non-empty until the real closing tag is fed
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                # the matching closing tag was reached
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
521
522
8bb56eee
BF
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        # attributes of the most recently seen start tag
        self.attrs = {}
        super().__init__()

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)
532
c5229f39 533
73673ccf
FF
class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        super().__init__()
        self.items = []
        self._level = 0  # current nesting depth

    def handle_starttag(self, tag, attrs):
        # record only top-level <li> elements
        if self._level == 0 and tag == 'li':
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
549
550
8bb56eee
BF
def extract_attributes(html_element):
    """Decode the attributes of a single HTML element into a dict.

    Given a string for an HTML element such as
        <el a="foo" B="bar" c="&98;az" d=boz empty= noval entity="&amp;" sq='"' dq="'">
    this returns
        {'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
         'empty': '', 'noval': None, 'entity': '&',
         'sq': '"', 'dq': '\''}

    Parse errors are silently ignored; whatever was gathered up to that
    point is returned.
    """
    attr_parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        attr_parser.feed(html_element)
        attr_parser.close()
    return attr_parser.attrs
9e6dd238 570
c5229f39 571
73673ccf
FF
def parse_list(webpage):
    """Given a string containing a series of HTML <li> elements, return a
    list with one attribute dict per top-level <li>."""
    list_parser = HTMLListAttrsParser()
    list_parser.feed(webpage)
    list_parser.close()
    return list_parser.items
579
580
9e6dd238 581def clean_html(html):
59ae15a5 582 """Clean an HTML snippet into a readable string"""
dd622d7c
PH
583
584 if html is None: # Convenience for sanitizing descriptions etc.
585 return html
586
49185227 587 html = re.sub(r'\s+', ' ', html)
588 html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
589 html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
59ae15a5
PH
590 # Strip html tags
591 html = re.sub('<.*?>', '', html)
592 # Replace html entities
593 html = unescapeHTML(html)
7decf895 594 return html.strip()
9e6dd238
FV
595
596
d77c3dfd 597def sanitize_open(filename, open_mode):
59ae15a5
PH
598 """Try to open the given filename, and slightly tweak it if this fails.
599
600 Attempts to open the given filename. If this fails, it tries to change
601 the filename slightly, step by step, until it's either able to open it
602 or it fails and raises a final exception, like the standard open()
603 function.
604
605 It returns the tuple (stream, definitive_file_name).
606 """
0edb3e33 607 if filename == '-':
608 if sys.platform == 'win32':
609 import msvcrt
610 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
611 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
59ae15a5 612
0edb3e33 613 for attempt in range(2):
614 try:
615 try:
89737671 616 if sys.platform == 'win32':
b506289f 617 # FIXME: An exclusive lock also locks the file from being read.
618 # Since windows locks are mandatory, don't lock the file on windows (for now).
619 # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
89737671 620 raise LockingUnsupportedError()
0edb3e33 621 stream = locked_file(filename, open_mode, block=False).__enter__()
622 except LockingUnsupportedError:
623 stream = open(filename, open_mode)
624 return (stream, filename)
86e5f3ed 625 except OSError as err:
0edb3e33 626 if attempt or err.errno in (errno.EACCES,):
627 raise
628 old_filename, filename = filename, sanitize_path(filename)
629 if old_filename == filename:
630 raise
d77c3dfd
FV
631
632
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is None:
        # unparsable date string
        return None
    return email.utils.mktime_tz(timetuple)
1c469a94 640
5f6a1245 641
5c3895ff 642def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
59ae15a5 643 """Sanitizes a string so it could be used as part of a filename.
5c3895ff 644 @param restricted Use a stricter subset of allowed characters
645 @param is_id Whether this is an ID that should be kept unchanged if possible.
646 If unset, yt-dlp's new sanitization rules are in effect
59ae15a5 647 """
5c3895ff 648 if s == '':
649 return ''
650
59ae15a5 651 def replace_insane(char):
c587cbb7
AT
652 if restricted and char in ACCENT_CHARS:
653 return ACCENT_CHARS[char]
91dd88b9 654 elif not restricted and char == '\n':
5c3895ff 655 return '\0 '
91dd88b9 656 elif char == '?' or ord(char) < 32 or ord(char) == 127:
59ae15a5
PH
657 return ''
658 elif char == '"':
659 return '' if restricted else '\''
660 elif char == ':':
5c3895ff 661 return '\0_\0-' if restricted else '\0 \0-'
59ae15a5 662 elif char in '\\/|*<>':
5c3895ff 663 return '\0_'
664 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
665 return '\0_'
59ae15a5
PH
666 return char
667
5c3895ff 668 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps
28e614de 669 result = ''.join(map(replace_insane, s))
5c3895ff 670 if is_id is NO_DEFAULT:
671 result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result) # Remove repeated substitute chars
672 STRIP_RE = '(?:\0.|[ _-])*'
673 result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end
674 result = result.replace('\0', '') or '_'
675
796173d0
PH
676 if not is_id:
677 while '__' in result:
678 result = result.replace('__', '_')
679 result = result.strip('_')
680 # Common case of "Foreign band name - English song title"
681 if restricted and result.startswith('-_'):
682 result = result[2:]
5a42414b
PH
683 if result.startswith('-'):
684 result = '_' + result[len('-'):]
a7440261 685 result = result.lstrip('.')
796173d0
PH
686 if not result:
687 result = '_'
59ae15a5 688 return result
d77c3dfd 689
5f6a1245 690
c2934512 691def sanitize_path(s, force=False):
a2aaf4db 692 """Sanitizes and normalizes path on Windows"""
c2934512 693 if sys.platform == 'win32':
c4218ac3 694 force = False
c2934512 695 drive_or_unc, _ = os.path.splitdrive(s)
c2934512 696 elif force:
697 drive_or_unc = ''
698 else:
a2aaf4db 699 return s
c2934512 700
be531ef1
S
701 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
702 if drive_or_unc:
a2aaf4db
S
703 norm_path.pop(0)
704 sanitized_path = [
ec85ded8 705 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
a2aaf4db 706 for path_part in norm_path]
be531ef1
S
707 if drive_or_unc:
708 sanitized_path.insert(0, drive_or_unc + os.path.sep)
4abea8ca 709 elif force and s and s[0] == os.path.sep:
c4218ac3 710 sanitized_path.insert(0, os.path.sep)
a2aaf4db
S
711 return os.path.join(*sanitized_path)
712
713
17bcc626 714def sanitize_url(url):
befa4708
S
715 # Prepend protocol-less URLs with `http:` scheme in order to mitigate
716 # the number of unwanted failures due to missing protocol
717 if url.startswith('//'):
718 return 'http:%s' % url
719 # Fix some common typos seen so far
720 COMMON_TYPOS = (
067aa17e 721 # https://github.com/ytdl-org/youtube-dl/issues/15649
befa4708
S
722 (r'^httpss://', r'https://'),
723 # https://bx1.be/lives/direct-tv/
724 (r'^rmtp([es]?)://', r'rtmp\1://'),
725 )
726 for mistake, fixup in COMMON_TYPOS:
727 if re.match(mistake, url):
728 return re.sub(mistake, fixup, url)
bc6b9bcd 729 return url
17bcc626
S
730
731
5435dcf9
HH
def extract_basic_auth(url):
    """Split inline credentials out of *url*.

    Returns (url_without_userinfo, basic_auth_header_value_or_None).
    """
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    # rebuild the netloc without the user:password@ prefix
    netloc = (parts.hostname if parts.port is None
              else '%s:%d' % (parts.hostname, parts.port))
    url = compat_urlparse.urlunsplit(parts._replace(netloc=netloc))
    credentials = ('%s:%s' % (parts.username, parts.password or '')).encode()
    auth_payload = base64.b64encode(credentials)
    return url, f'Basic {auth_payload.decode()}'
5435dcf9
HH
742
743
67dda517 744def sanitized_Request(url, *args, **kwargs):
bc6b9bcd 745 url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
5435dcf9
HH
746 if auth_header is not None:
747 headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
748 headers['Authorization'] = auth_header
749 return compat_urllib_request.Request(url, *args, **kwargs)
67dda517
S
750
751
51098426
S
def expand_path(s):
    """Expand environment variables and '~' in *s*."""
    expanded = compat_expanduser(s)
    return os.path.expandvars(expanded)
755
756
d77c3dfd 757def orderedSet(iterable):
59ae15a5
PH
758 """ Remove all duplicates from the input iterable """
759 res = []
760 for el in iterable:
761 if el not in res:
762 res.append(el)
763 return res
d77c3dfd 764
912b38b4 765
55b2f099 766def _htmlentity_transform(entity_with_semicolon):
4e408e47 767 """Transforms an HTML entity to a character."""
55b2f099
YCH
768 entity = entity_with_semicolon[:-1]
769
4e408e47
PH
770 # Known non-numeric HTML entity
771 if entity in compat_html_entities.name2codepoint:
772 return compat_chr(compat_html_entities.name2codepoint[entity])
773
55b2f099
YCH
774 # TODO: HTML5 allows entities without a semicolon. For example,
775 # '&Eacuteric' should be decoded as 'Éric'.
776 if entity_with_semicolon in compat_html_entities_html5:
777 return compat_html_entities_html5[entity_with_semicolon]
778
91757b0f 779 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
4e408e47
PH
780 if mobj is not None:
781 numstr = mobj.group(1)
28e614de 782 if numstr.startswith('x'):
4e408e47 783 base = 16
28e614de 784 numstr = '0%s' % numstr
4e408e47
PH
785 else:
786 base = 10
067aa17e 787 # See https://github.com/ytdl-org/youtube-dl/issues/7518
19a03940 788 with contextlib.suppress(ValueError):
7aefc49c 789 return compat_chr(int(numstr, base))
4e408e47
PH
790
791 # Unknown entity in name, return its literal representation
7a3f0c00 792 return '&%s;' % entity
4e408e47
PH
793
794
d77c3dfd 795def unescapeHTML(s):
912b38b4
PH
796 if s is None:
797 return None
19a03940 798 assert isinstance(s, str)
d77c3dfd 799
4e408e47 800 return re.sub(
95f3f7c2 801 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 802
8bf48f23 803
cdb19aa4 804def escapeHTML(text):
805 return (
806 text
807 .replace('&', '&amp;')
808 .replace('<', '&lt;')
809 .replace('>', '&gt;')
810 .replace('"', '&quot;')
811 .replace("'", '&#39;')
812 )
813
814
f5b1bca9 815def process_communicate_or_kill(p, *args, **kwargs):
816 try:
817 return p.communicate(*args, **kwargs)
818 except BaseException: # Including KeyboardInterrupt
819 p.kill()
820 p.wait()
821 raise
822
823
d3c93ec2 824class Popen(subprocess.Popen):
825 if sys.platform == 'win32':
826 _startupinfo = subprocess.STARTUPINFO()
827 _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
828 else:
829 _startupinfo = None
830
831 def __init__(self, *args, **kwargs):
86e5f3ed 832 super().__init__(*args, **kwargs, startupinfo=self._startupinfo)
d3c93ec2 833
834 def communicate_or_kill(self, *args, **kwargs):
835 return process_communicate_or_kill(self, *args, **kwargs)
836
837
aa49acd1
S
def get_subprocess_encoding():
    """Return the text encoding to use when talking to subprocesses."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    # elsewhere the filesystem encoding is the best available guess
    return sys.getfilesystemencoding() or 'utf-8'
848
849
8bf48f23 850def encodeFilename(s, for_subprocess=False):
19a03940 851 assert isinstance(s, str)
cfb0511d 852 return s
aa49acd1
S
853
854
def decodeFilename(b, for_subprocess=False):
    """No-op on Python 3; kept for backward compatibility."""
    return b
8bf48f23 857
f07b74fc
PH
858
def encodeArgument(s):
    """Return *s* as str, decoding legacy ASCII byte strings when necessary."""
    # Some legacy code still passes bytes; decode instead of asserting.
    # TODO: assert isinstance(s, str) once all post processors are fixed
    if isinstance(s, str):
        return s
    return s.decode('ascii')
f07b74fc
PH
864
865
aa49acd1 866def decodeArgument(b):
cfb0511d 867 return b
aa49acd1
S
868
869
8271226a
PH
def decodeOption(optval):
    """Decode a command-line option value to str using the locale encoding;
    None passes through unchanged."""
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())
    assert isinstance(optval, compat_str)
    return optval
1c256f70 878
5f6a1245 879
aa7785f8 880_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))
881
882
883def timetuple_from_msec(msec):
884 secs, msec = divmod(msec, 1000)
885 mins, secs = divmod(secs, 60)
886 hrs, mins = divmod(mins, 60)
887 return _timetuple(hrs, mins, secs, msec)
888
889
cdb19aa4 890def formatSeconds(secs, delim=':', msec=False):
aa7785f8 891 time = timetuple_from_msec(secs * 1000)
892 if time.hours:
893 ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
894 elif time.minutes:
895 ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
4539dd30 896 else:
aa7785f8 897 ret = '%d' % time.seconds
898 return '%s.%03d' % (ret, time.milliseconds) if msec else ret
4539dd30 899
a0ddb8a2 900
def _ssl_load_windows_store_certs(ssl_context, storename):
    """Load server-auth x509 certificates from a Windows certificate store
    into ssl_context, skipping certificates that fail to load."""
    # Code adapted from _load_windows_store_certs in
    # https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        enumerated = ssl.enum_certificates(storename)
    except PermissionError:
        return
    for cert, encoding, trust in enumerated:
        if encoding != 'x509_asn':
            continue
        if trust is not True and ssl.Purpose.SERVER_AUTH.oid not in trust:
            continue
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)
a2366922 912
77562778 913
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler whose SSLContext honours the relevant
    options in *params*: nocheckcertificate, legacyserverconnect,
    compat_opts and the client_certificate* family."""
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
    # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
    context.set_ciphers('DEFAULT')
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        # prefer certifi's CA bundle when available and not opted out
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()
    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 947
732ea2f0 948
def bug_reports_message(before=';'):
    """Return the standard "please report this" footer.

    *before* is the text that will immediately precede the message; its
    trailing whitespace is stripped and, when it is empty or ends a
    sentence, the message is capitalized to start a new one.
    """
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , '
           'filling out the appropriate issue template. '
           'Confirm you are on the latest version using yt-dlp -U')

    prefix = before.rstrip()
    if not prefix or prefix[-1] in '.!?':
        msg = msg[0].title() + msg[1:]

    if prefix:
        return prefix + ' ' + msg
    return msg
08f2a92c
JMF
959
960
bf5b9d85
PM
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    # Subclasses may pre-set a default message via this class attribute.
    msg = None

    def __init__(self, msg=None):
        # Precedence: explicit argument > subclass default > class name.
        if msg is None and self.msg is None:
            msg = type(self).__name__
        if msg is not None:
            self.msg = msg
        super().__init__(self.msg)
bf5b9d85
PM
971
972
# Exception types treated as (possibly transient) network errors throughout
# the codebase. `ssl.CertificateError` exists on every Python 3 version this
# file supports, so the historical `hasattr(ssl, 'CertificateError')` guard
# was always true and has been removed as dead code.
network_exceptions = (
    compat_urllib_error.URLError, compat_http_client.HTTPException,
    socket.error, ssl.CertificateError)
977
978
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        # Errors raised while a known network exception is being handled are
        # always treated as expected (not a yt-dlp bug)
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)  # message without [ie]/video_id/cause decoration
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie  # name of the extractor that raised, if any
        self.exc_info = sys.exc_info()  # preserve original exception

        # Decorated message: "[ie] video_id: msg (caused by ...)" plus the
        # bug-report footer for unexpected errors
        super().__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        # Combine this error's traceback with the cause's (minus its header
        # line), or None when neither is available
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None
01951dda 1009
1c256f70 1010
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Raised for URLs that no extractor supports; always `expected`."""

    def __init__(self, url):
        self.url = url
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
1016
1017
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    # Empty subclass: behaves exactly like ExtractorError, but lets callers
    # catch this specific failure separately.
    pass
1021
1022
773f291d
S
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        self.countries = countries
        # Geo-blocks are never a yt-dlp bug, so force `expected`
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
1034
1035
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super().__init__(msg)
d77c3dfd
FV
1048
1049
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    # Default message; YoutubeDLError.__init__ falls back to this when the
    # exception is raised without arguments.
    msg = 'Entry not found in info'
498f5606 1057
1058
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """@param filename  the clashing output filename, appended to the message when given"""
        if filename is not None:
            # Interpolate the actual filename (the f-string previously
            # contained the literal text '(unknown)' and never used it)
            self.msg += f': {filename}'
        super().__init__(self.msg)
d77c3dfd
FV
1071
1072
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    # No default `msg`: YoutubeDLError falls back to the class name when
    # raised without arguments.
5f6a1245 1079
5f6a1245 1080
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Base class for the --break-on-* / --max-downloads stop conditions below
    msg = 'The download was cancelled'
8b0d7497 1084
8b0d7497 1085
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    # Only the default message differs from DownloadCancelled
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1089
48f79687 1090
class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    # Only the default message differs from DownloadCancelled
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
51d9739f 1094
1095
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    # Only the default message differs from DownloadCancelled
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1099
1100
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        self.expected = expected
        super().__init__(msg)
1107
1108
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Always constructed without arguments; expected=False mirrors
        # ReExtractInfo's default explicitly
        super().__init__(self.msg, expected=False)
f2ebc5c7 1115
d77c3dfd 1116
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            # Append the underlying error to the default message
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
d77c3dfd
FV
1129
1130
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both counts are in bytes
        self.downloaded = downloaded
        self.expected = expected
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
d77c3dfd 1144
5f6a1245 1145
class XAttrMetadataError(YoutubeDLError):
    """Raised when writing extended file attributes fails.

    `self.reason` classifies the failure ('NO_SPACE', 'VALUE_TOO_LONG' or
    'NOT_SUPPORTED') so callers need not re-parse the errno/message.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code  # OS errno, when available
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1160
1161
class XAttrUnavailableError(YoutubeDLError):
    # NOTE(review): raise sites are outside this chunk; by its name this marks
    # extended-attribute support being unavailable entirely, as opposed to
    # XAttrMetadataError for failures while writing them — confirm at call sites.
    pass
1164
1165
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate *http_class*, honouring the handler's `source_address` option.

    When a source address is configured, the connection object's address
    resolution is patched so only addresses of the matching family
    (IPv4 vs IPv6) are attempted. `is_https` is accepted for signature
    compatibility but not used here.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # Pick the address family from the *source* address's syntax
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            # All candidate addresses failed: re-raise the last error
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc
1211
1212
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' pseudo-header.

    When that marker is present, return a copy of *headers* without it and
    without any Accept-Encoding header; otherwise return *headers* unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    cleaned = {k: v for k, v in headers.items() if k.lower() != 'accept-encoding'}
    del cleaned['Youtubedl-no-compression']
    return cleaned
87f0e62d
YCH
1221
1222
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        # Route through a SOCKS proxy when the internal marker header is set
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress a deflate body; handles both raw and zlib-wrapped streams."""
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        # `brotli` is an optional dependency imported at the top of the file
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry decompression, trimming up to 1023 trailing bytes
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = compat_urllib_request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
bf50b038 1351
5de90176 1352
71aff188
YCH
def make_socks_conn_class(base_class, socks_proxy):
    """Derive a connection class from *base_class* that tunnels through the
    SOCKS proxy described by the *socks_proxy* URL (socks4/socks4a/socks5).
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    # NOTE(review): any other scheme leaves socks_type unbound, producing an
    # UnboundLocalError below — presumably schemes are validated upstream; confirm.

    def unquote_if_non_empty(s):
        # Percent-decode credentials, leaving None/'' untouched
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, wrap the proxied socket in TLS after connecting
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1394
1395
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that honours source_address, SOCKS proxying and a
    caller-supplied SSLContext (built by make_HTTPS_handler)."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        # Route through a SOCKS proxy when the internal marker header is set
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            # Surface a hint for servers that need --legacy-server-connect
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise
be4a824d
PH
1424
1425
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7  # tab-separated fields per cookies.txt entry
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def __init__(self, filename=None, *args, **kwargs):
        super().__init__(None, *args, **kwargs)
        # Accept path-like objects as well as plain strings
        if self.is_path(filename):
            filename = os.fspath(filename)
        self.filename = filename

    @staticmethod
    def is_path(file):
        """True when *file* is path-like (as opposed to an open file object)."""
        return isinstance(file, (str, bytes, os.PathLike))

    @contextlib.contextmanager
    def open(self, file, *, write=False):
        """Yield a text file object for *file*: paths are opened as UTF-8,
        already-open file objects are used as-is (truncated when writing)."""
        if self.is_path(file):
            with open(file, 'w' if write else 'r', encoding='utf-8') as f:
                yield f
        else:
            if write:
                file.truncate(0)
            yield file

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with self.open(filename, write=True) as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure:
                    secure = 'TRUE'
                else:
                    secure = 'FALSE'
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                else:
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                else:
                    expires = ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            # Validate one cookies.txt line, stripping the #HttpOnly_ prefix
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with self.open(filename) as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    # A leading [, { or " strongly suggests a JSON export
                    if f'{line.strip()} '[0] in '[{"':
                        raise compat_cookiejar.LoadError(
                            'Cookies file must be Netscape formatted, not JSON. See '
                            'https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl')
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1564
1565
a6420bf5
S
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that applies the jar to HTTPS as well as HTTP traffic."""

    def __init__(self, cookiejar=None):
        super().__init__(cookiejar)

    def http_response(self, request, response):
        return super().http_response(request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1575
1576
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    # 308 reuses the stock 302 implementation (not handled upstream)
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}

        # A 303 must either use GET or HEAD for subsequent request
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
        if code == 303 and m != 'HEAD':
            m = 'GET'
        # 301 and 302 redirects are commonly turned into a GET from a POST
        # for subsequent requests by browsers, so we'll do the same.
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
        # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
        if code in (301, 302) and m == 'POST':
            m = 'GET'

        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True, method=m)
fca6dba8
S
1637
1638
46f59e89
S
def extract_timezone(date_str):
    """Split a trailing timezone designator off *date_str*.

    Returns a tuple (utc_offset: datetime.timedelta, date_str_without_tz).
    A bare 'Z' or an unmatched string yields a zero offset.
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        return datetime.timedelta(), date_str

    # Trim the designator off the date string in all matched cases
    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):  # bare 'Z' → UTC
        return datetime.timedelta(), date_str

    direction = 1 if m.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=direction * int(m.group('hours')),
        minutes=direction * int(m.group('minutes')))
    return offset, date_str
1663
1664
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """Return a UNIX timestamp for an ISO 8601-style *date_str*, or None.

    *delimiter* separates the date and time parts; *timezone* (a timedelta)
    overrides detection of a trailing timezone designator.
    """
    if date_str is None:
        return None

    # strptime's %S cannot consume fractional seconds; drop them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        dt = datetime.datetime.strptime(date_str, f'%Y-%m-%d{delimiter}%H:%M:%S') - timezone
    except ValueError:
        return None
    return calendar.timegm(dt.timetuple())
912b38b4
PH
1680
1681
46f59e89
S
def date_formats(day_first=True):
    """Return the strptime format list matching the expected day/month order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1684
1685
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)  # offset itself is discarded

    # No break on success: the LAST matching format wins
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        # Fall back to RFC 2822 parsing (e.g. 'Tue, 01 Jan 2019 00:00:00')
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
bf50b038 1708
5f6a1245 1709
46f59e89
S
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string, or None."""
    if date_str is None:
        return None

    # Commas and pipes are never meaningful separators here
    date_str = re.sub(r'[,|]', '', date_str)

    # Note the 12h shift before AM/PM is stripped below
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    # First matching format wins (unlike unified_strdate)
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
    # Fall back to RFC 2822 parsing
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
46f59e89
S
1739
1740
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from the final path component of *url*,
    returning *default_ext* when nothing plausible is found."""
    if url is None or '.' not in url:
        return default_ext
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # URLs like http://example.com/foo/bar.mp4/?download keep the extension
    # just before a trailing slash
    stripped = candidate.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
73e79f2a 1752
5f6a1245 1753
824fa511
S
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Derive a subtitle file name by swapping the media extension for '<lang>.<format>'."""
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
d4051a8e 1756
5f6a1245 1757
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    R"""
    Return a datetime object from a string.
    Supported format:
        (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?

    @param format       strftime format of DATE
    @param precision    Round the datetime object: auto|microsecond|second|minute|hour|day
                        auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        # Defer rounding until the offset unit (if any) is known
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
        date_str)
    if match is not None:
        # Resolve the base date recursively, then apply the signed offset
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            # timedelta cannot express calendar months/years
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            # 'auto': round to the unit that was written in the string
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1798
1799
def date_from_str(date_str, format='%Y%m%d', strict=False):
    R"""
    Return a date object from a string using datetime_from_str

    @param strict  Restrict allowed patterns to "YYYYMMDD" and
                   (now|today|yesterday)(-\d+(day|week|month|year)s?)?
    """
    if strict:
        allowed = r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?'
        if not re.fullmatch(allowed, date_str):
            raise ValueError(f'Invalid date format "{date_str}"')
    return datetime_from_str(date_str, precision='microsecond', format=format).date()
1810
1811
def datetime_add_months(dt, months):
    """Shift *dt* by the given (possibly negative) number of calendar months,
    clamping the day to the target month's length."""
    total_months = dt.year * 12 + (dt.month - 1) + months
    new_year, month_index = divmod(total_months, 12)
    new_month = month_index + 1
    # e.g. Jan 31 + 1 month must land on the last day of February
    new_day = min(dt.day, calendar.monthrange(new_year, new_month)[1])
    return dt.replace(new_year, new_month, new_day)
1819
1820
def datetime_round(dt, precision='day'):
    """Round a datetime to the nearest unit named by *precision*
    ('microsecond' returns the input unchanged)."""
    if precision == 'microsecond':
        return dt

    seconds_per_unit = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }[precision]
    timestamp = calendar.timegm(dt.timetuple())
    # Round-half-up on the POSIX timestamp, then rebuild the datetime
    rounded = ((timestamp + seconds_per_unit / 2) // seconds_per_unit) * seconds_per_unit
    return datetime.datetime.utcfromtimestamp(rounded)
5f6a1245
JW
1837
1838
def hyphenate_date(date_str):
    """Reformat a 'YYYYMMDD' date string as 'YYYY-MM-DD'; anything else is
    returned unchanged."""
    parts = re.match(r'^(\d{4})(\d{2})(\d{2})$', date_str)
    return '-'.join(parts.groups()) if parts else date_str
1847
5f6a1245 1848
class DateRange:
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start, strict=True)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end, strict=True)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if isinstance(date, datetime.date):
            candidate = date
        else:
            candidate = date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return f'{self.start.isoformat()} - {self.end.isoformat()}'
c496ca96
PH
1878
1879
def platform_name():
    """Return the platform description (platform.platform()) as a str."""
    res = platform.platform()
    # Defensive: normalize a bytes result to text (str on all modern Pythons)
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, str)
    return res
c257baff
PH
1888
1889
49fa4d9a
N
def get_windows_version():
    ''' Get Windows version. None if it's not running on Windows '''
    if compat_os_name != 'nt':
        return None
    return version_tuple(platform.win32_ver()[1])
1896
1897
def write_string(s, out=None, encoding=None):
    """Write the text *s* to *out* (default: sys.stderr) and flush.

    Encodes to bytes when the target stream is binary or exposes a .buffer,
    using *encoding*, the stream's own encoding, or preferredencoding().
    """
    assert isinstance(s, str)
    out = out or sys.stderr

    # NOTE(review): on Windows terminals with sequence support a space is
    # prepended to line breaks — presumably a console rendering workaround;
    # confirm before relying on the exact output
    if compat_os_name == 'nt' and supports_terminal_sequences(out):
        s = re.sub(r'([\r\n]+)', r' \1', s)

    if 'b' in getattr(out, 'mode', ''):
        # Binary stream: must receive bytes
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        # Text stream backed by a binary buffer (e.g. sys.stdout)
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
1915
1916
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a bytes-like value (or, legacy, a str) into a list of byte values."""
    if not bs:
        return []
    # bytes/bytearray already index to ints; a str needs ord() per character
    if isinstance(bs[0], int):
        return list(bs)
    return [ord(ch) for ch in bs]
1924
c257baff 1925
def intlist_to_bytes(xs):
    """Pack a list of byte values (0-255) into a bytes object.

    bytes() performs the same packing as the old compat_struct_pack('%dB', ...)
    call without the compat shim (out-of-range values raise ValueError rather
    than struct.error).
    """
    if not xs:
        return b''
    return bytes(xs)
c38b1e77
PH
1930
1931
class LockingUnsupportedError(IOError):
    """Raised when no file-locking primitive is available on this platform."""
    msg = 'File locking is not supported on this platform'

    def __init__(self):
        # Always carry the fixed message so callers see a uniform error text
        super().__init__(self.msg)
1937
1938
c1c9a79c
PH
# Cross-platform file locking: define _lock_file/_unlock_file for the
# current platform (Win32 LockFileEx, POSIX fcntl, or a stub that raises).
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Matches the Win32 OVERLAPPED structure required by LockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the maximum possible byte range of the file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED alive on the file object; _unlock_file reuses it
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # dwFlags: 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
            if not block:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(f, flags)
            except BlockingIOError:
                # Non-blocking lock contention: propagate as-is
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f, flags)

        def _unlock_file(f):
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:

        def _lock_file(f, exclusive, block):
            raise LockingUnsupportedError()

        def _unlock_file(f):
            raise LockingUnsupportedError()
c1c9a79c
PH
2022
2023
class locked_file:
    """A file wrapper that holds a cross-platform advisory lock
    (shared for reads, exclusive for writes) while open.

    Usable as a context manager; truncation for 'w' modes is deferred
    until after the lock has been acquired.
    """
    # Whether we currently hold the lock
    locked = False

    def __init__(self, filename, mode, block=True, encoding=None):
        if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
            raise NotImplementedError(mode)
        self.mode, self.block = mode, block

        writable = any(f in mode for f in 'wax+')
        readable = any(f in mode for f in 'r+')
        flags = functools.reduce(operator.ior, (
            getattr(os, 'O_CLOEXEC', 0),  # UNIX only
            getattr(os, 'O_BINARY', 0),  # Windows only
            getattr(os, 'O_NOINHERIT', 0),  # Windows only
            os.O_CREAT if writable else 0,  # O_TRUNC only after locking
            os.O_APPEND if 'a' in mode else 0,
            os.O_EXCL if 'x' in mode else 0,
            os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
        ))

        self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)

    def __enter__(self):
        # Read-only modes take a shared lock, everything else an exclusive one
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
            self.locked = True
        except OSError:
            self.f.close()
            raise
        if 'w' in self.mode:
            # Truncate only now that the lock is held (O_TRUNC was omitted above)
            try:
                self.f.truncate()
            except OSError as e:
                if e.errno != 29:  # Illegal seek, expected when self.f is a FIFO
                    raise e
        return self

    def unlock(self):
        # Release the lock if held; safe to call multiple times
        if not self.locked:
            return
        try:
            _unlock_file(self.f)
        finally:
            self.locked = False

    def __exit__(self, *_):
        try:
            self.unlock()
        finally:
            self.f.close()

    # Aliases so the object can be used without a 'with' statement
    open = __enter__
    close = __exit__

    def __getattr__(self, attr):
        # Delegate everything else (read, write, ...) to the wrapped file
        return getattr(self.f, attr)

    def __iter__(self):
        return iter(self.f)
a3125791 2084
4eb7f1d1 2085
4644ac55
S
def get_filesystem_encoding():
    """Name of the filesystem encoding, falling back to 'utf-8' when unknown."""
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
2089
2090
def shell_quote(args):
    """Join *args* into a single shell-safe command-line string.

    bytes arguments (e.g. filenames from 'encodeFilename') are decoded with
    the filesystem encoding before quoting.
    """
    def _as_text(a):
        # We may get a filename encoded with 'encodeFilename'
        return a.decode(get_filesystem_encoding()) if isinstance(a, bytes) else a

    # shlex.quote is the stdlib function behind the compat_shlex_quote alias
    return ' '.join(shlex.quote(_as_text(a)) for a in args)
9d4660ca
PH
2100
2101
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge any data already smuggled into the URL with the new payload
    base, existing = unsmuggle_url(url, {})
    data.update(existing)
    payload = compat_urllib_parse_urlencode({'__youtubedl_smuggle': json.dumps(data)})
    return base + '#' + payload
9d4660ca
PH
2110
2111
def unsmuggle_url(smug_url, default=None):
    """Extract smuggled data from a URL; returns (url, data) or (url, default)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
02dbf93f
PH
2119
2120
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal sufixes like K, M, etc """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = 'kMGTPEZY'
    # Clamp into the suffix table: values in (0, 1) used to yield a negative
    # exponent, which indexed the list from the end and picked 'Y'
    exponent = 0 if num == 0 else max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))
    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
    if factor == 1024:
        # Binary prefixes: k -> Ki, M -> Mi, ... ('' stays '')
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
e0fd9573 2133
2134
def format_bytes(bytes):
    """Human-readable binary-prefixed byte count (e.g. '1.00MiB'), or 'N/A'."""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
f53c966a 2137
1c088fa8 2138
fb47597b
S
def lookup_unit_table(unit_table, s):
    """Parse a leading '<number><unit>' in *s* using *unit_table* as a
    unit -> multiplier map; None when nothing matches."""
    unit_alternatives = '|'.join(re.escape(unit) for unit in unit_table)
    match = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % unit_alternatives, s)
    if match is None:
        return None
    # Accept ',' as a decimal separator too
    number = float(match.group('num').replace(',', '.'))
    return int(number * unit_table[match.group('unit')])
2148
2149
be64b5b0
PH
def parse_filesize(s):
    """Parse a human-readable size like '5.5 MiB' or '250KB' into a byte
    count (int), or None when *s* is None or unparseable."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
2219
2220
def parse_count(s):
    """Parse a view/like count such as '1.2M views' into an int; None when
    nothing numeric can be extracted."""
    if s is None:
        return None

    # Strip a leading non-numeric prefix (e.g. 'Views:') and whitespace
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    with_unit = lookup_unit_table(_UNIT_TABLE, s)
    if with_unit is not None:
        return with_unit

    # Fall back to a leading number followed by unrecognized text
    numeric_part = re.match(r'([\d,.]+)(?:$|\s)', s)
    return str_to_int(numeric_part.group(1)) if numeric_part else None
be64b5b0 2248
2f7ae819 2249
def parse_resolution(s, *, lenient=False):
    """Extract {'width', 'height'} (or only 'height') from a resolution-like
    string; {} when nothing matches or *s* is None."""
    if s is None:
        return {}

    # 'WxH' style; the strict variant requires non-alphanumeric boundaries
    if lenient:
        pattern = r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)'
    else:
        pattern = r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])'
    match = re.search(pattern, s)
    if match:
        return {
            'width': int(match.group('w')),
            'height': int(match.group('h')),
        }

    # '720p' / '1080i' style
    match = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if match:
        return {'height': int(match.group(1))}

    # '4k' / '8k' shorthand (k == 540 lines here)
    match = re.search(r'\b([48])[kK]\b', s)
    if match:
        return {'height': int(match.group(1)) * 540}

    return {}
2273
2274
0dc41787
S
def parse_bitrate(s):
    """Extract an integer bitrate (in KBit/s) from strings like '128 kbps';
    None for non-strings or when no 'kbps' figure is present."""
    if not isinstance(s, str):
        return None
    mobj = re.search(r'\b(\d+)\s*kbps', s)
    if mobj:
        return int(mobj.group(1))
    return None
2281
2282
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    try:
        return names.index(name) + 1
    except ValueError:
        # Unknown month name
        return None
2292
2293
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [name[:3] for name in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
18258362
JMF
2302
2303
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Leave already-valid entities (&amp; &lt; ... and numeric references) alone
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
e3946f98
PH
2310
2311
def setproctitle(title):
    """Best-effort: set the process name shown by ps/top via glibc prctl.

    Silently does nothing on platforms without libc.so.6 or prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        # No glibc available (e.g. non-Linux) - skip
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode()
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 is PR_SET_NAME (see prctl(2))
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
2336
2337
def remove_start(s, start):
    """Strip *start* from the beginning of *s* when present (None passes through)."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
29eb5174
PH
2340
2341
def remove_end(s, end):
    """Strip *end* from the end of *s* when present (None passes through)."""
    if s is not None and s.endswith(end):
        return s[:-len(end)]
    return s
2b9faf55
PH
2344
2345
31b2051e
S
def remove_quotes(s):
    """Drop one pair of matching single or double quotes wrapping *s*, if any."""
    if s is None or len(s) < 2:
        return s
    for quote_char in ('"', "'", ):
        if s[0] == quote_char and s[-1] == quote_char:
            return s[1:-1]
    return s
2353
2354
b6e0c7d2
U
def get_domain(url):
    """Extract the host part (minus scheme and leading 'www.') from a URL;
    None when the string has no dotted host."""
    match = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    return match.group('domain') if match else None
2358
2359
def url_basename(url):
    """Return the last path component of *url* (e.g. 'v.mp4' for
    'http://x/a/v.mp4?q'); '' when the path is empty."""
    # urllib.parse is the stdlib module behind the compat_urlparse alias
    path = urllib.parse.urlparse(url).path
    return path.strip('/').split('/')[-1]
aa94a6d3
PH
2363
2364
02dc0a36
S
def base_url(url):
    """Everything up to and including the last '/' before any query/fragment."""
    match = re.match(r'https?://[^?#&]+/', url)
    # A URL without a path slash yields no match and raises AttributeError here
    return match.group()
2367
2368
def urljoin(base, path):
    """Join *base* and *path* like urllib.parse.urljoin, but defensively:
    returns None unless both are usable strings, passes an absolute or
    scheme-relative *path* through unchanged, and requires *base* to be an
    http(s) or protocol-relative URL.
    """
    if isinstance(path, bytes):
        path = path.decode()
    if not isinstance(path, str) or not path:
        return None
    # Absolute (or scheme-relative) paths need no joining
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode()
    if not isinstance(base, str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return urllib.parse.urljoin(base, path)
2382
2383
aa94a6d3
PH
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request whose HTTP method is HEAD instead of the default."""
    def get_method(self):
        return 'HEAD'
7217e148
PH
2387
2388
95cf60e8
S
class PUTRequest(compat_urllib_request.Request):
    """A urllib Request whose HTTP method is PUT instead of the default."""
    def get_method(self):
        return 'PUT'
2392
2393
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce *v* (optionally via attribute *get_attr*) to an int scaled by
    invscale/scale; *default* when conversion fails."""
    value = getattr(v, get_attr, None) if get_attr and v is not None else v
    try:
        return int(value) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
9732d77e 2401
9572013d 2402
40a90862
JMF
def str_or_none(v, default=None):
    """Coerce *v* to str, returning *default* when *v* is None."""
    return default if v is None else str(v)
2405
9732d77e
PH
2406
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, int):
        return int_str
    elif isinstance(int_str, str):
        # Drop thousands separators and '+' before conversion
        int_str = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(int_str)
608d11f5
PH
2414
2415
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce *v* to a float scaled by invscale/scale; *default* when *v* is
    None or conversion fails."""
    try:
        return default if v is None else float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
43f775e4
PH
2423
2424
c7e327c4
S
def bool_or_none(v, default=None):
    """Pass through genuine bools; anything else (even 0/1) becomes *default*."""
    if isinstance(v, bool):
        return v
    return default
2427
2428
53cd37ba
S
def strip_or_none(v, default=None):
    """Strip surrounding whitespace from a str; non-strings give *default*."""
    return v.strip() if isinstance(v, str) else default
b72b4431
S
2431
2432
af03000a
S
def url_or_none(url):
    """Return the stripped URL when it looks like a supported absolute or
    protocol-relative URL (http(s), rtmp family, mms, ftp(s)); else None."""
    if not url or not isinstance(url, str):
        return None
    url = url.strip()
    return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
2438
2439
3e9b66d7
LNO
def request_to_url(req):
    """Accept either a Request object or a plain URL string; return the URL."""
    if isinstance(req, compat_urllib_request.Request):
        return req.get_full_url()
    return req
2445
2446
def strftime_or_none(timestamp, date_format, default=None):
    """Format a unix timestamp (int/float) or a 'YYYYMMDD' string using
    *date_format*; *default* when *timestamp* is unusable."""
    datetime_object = None
    try:
        if isinstance(timestamp, (int, float)):  # unix timestamp
            datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        # AttributeError on None covers all other input types
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2457
2458
def parse_duration(s):
    """Parse a duration string ('1:23:45', 'PT1H2M3S', '3 min', ...) into
    seconds as a float, or None when it cannot be parsed."""
    if not isinstance(s, str):
        return None
    s = s.strip()
    if not s:
        return None

    days, hours, mins, secs, ms = [None] * 5
    # Colon-separated form: [[[DD:]HH:]MM:]SS[.ms]
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # ISO-8601-like / worded form: '[P][1y][2months]...T[3h][4min][5s]'
        # (years/months/weeks are matched but deliberately not captured)
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?,?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?,?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?,?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # Last resort: bare '2.5 hours' or '3 min' style values
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    if ms:
        # The first regex may capture ':frac'; normalize to a decimal fraction
        ms = ms.replace(':', '.')
    return sum(float(part or 0) * mult for part, mult in (
        (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
91d7d0b3
JMF
2513
2514
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert *ext* before the real extension: 'f.mp4' + 'temp' -> 'f.temp.mp4'.

    When *expected_real_ext* is given and does not match the actual extension,
    *ext* is appended to the whole filename instead (mirrors replace_extension),
    so no part of the original name is lost.
    """
    name, real_ext = os.path.splitext(filename)
    return (
        f'{name}.{ext}{real_ext}'
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        # Bug fix: previously returned the literal '(unknown).{ext}', which
        # discarded the filename entirely on an extension mismatch
        else f'{filename}.{ext}')
d70ad093
PH
2521
2522
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the extension of *filename* for *ext*; when *expected_real_ext*
    is given and does not match, append instead of replacing."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        # Unexpected real extension: keep the whole original name
        return f'{filename}.{ext}'
    return f'{name}.{ext}'
2528
2529
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        proc = Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate_or_kill()
    except OSError:
        # Binary missing from PATH (or not executable)
        return False
    return exe
b7ab0590
PH
2538
2539
def _get_exe_version_output(exe, args, *, to_screen=None):
    """Run *exe* with *args* and return its combined stdout/stderr as text,
    or False when the executable cannot be started."""
    if to_screen:
        to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        out, _ = Popen(
            [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return out
cae97f65
PH
2555
2556
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's --version output.

    @param version_re    regex whose group 1 is the version
                         (defaults to matching 'version <ver>')
    @param unrecognized  value returned when no version is found
    """
    assert isinstance(output, str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
2566
2567
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    output = _get_exe_version_output(exe, args)
    if not output:
        return False
    return detect_exe_version(output, version_re, unrecognized)
2574
2575
class LazyList(collections.abc.Sequence):
    """Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList"""

    class IndexError(IndexError):
        # Distinct subclass so callers can tell LazyList misses apart from
        # ordinary IndexErrors raised elsewhere
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # _iterable: the not-yet-consumed part of the source
        # _cache:    items consumed so far (shared with reversed()/copies)
        # _reversed: whether indices/iteration are interpreted back-to-front
        self._iterable = iter(iterable)
        self._cache = [] if _cache is None else _cache
        self._reversed = reverse

    def __iter__(self):
        if self._reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        yield from self._cache
        for item in self._iterable:
            # Cache as we go so later accesses do not re-consume the source
            self._cache.append(item)
            yield item

    def _exhaust(self):
        self._cache.extend(self._iterable)
        self._iterable = []  # Discard the emptied iterable to make it pickle-able
        return self._cache

    def exhaust(self):
        """Evaluate the entire iterable"""
        return self._exhaust()[::-1 if self._reversed else 1]

    @staticmethod
    def _reverse_index(x):
        # Map a front index to the equivalent negative back index (None stays None)
        return None if x is None else -(x + 1)

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self._reversed:
                idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self._reversed:
                idx = self._reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self._exhaust()
            try:
                return self._cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Consume only as many items as the requested indices require
        n = max(start or 0, stop or 0) - len(self._cache) + 1
        if n > 0:
            self._cache.extend(itertools.islice(self._iterable, n))
        try:
            return self._cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        try:
            # Probing the first logical element consumes at most one item
            self[-1] if self._reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self._exhaust()
        return len(self._cache)

    def __reversed__(self):
        # Shares the cache and remaining iterable; only flips orientation
        return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)

    def __copy__(self):
        return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2663
483336e7 2664
class PagedList:
    """Base class for lazily-fetched paginated lists.

    Subclasses implement _getslice(); pages fetched through getpage()
    are cached (when use_cache is set) keyed by page number.
    """

    class IndexError(IndexError):
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        # pagefunc(pagenum) yields the entries of one page
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        # Highest known valid page number; lowered when fetching fails
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        """Return (and possibly cache) the list of entries of one page."""
        page_results = self._cache.get(pagenum)
        if page_results is None:
            # Pages beyond the known count are empty by definition
            page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        # Indexing repeatedly re-slices, so it is only sane with caching on
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]
55575225 2703
9c44d242
PH
2704
class OnDemandPagedList(PagedList):
    """Download pages until a page with less than maximum results"""

    def _getslice(self, start, end):
        # Walk pages starting with the one containing `start`
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offsets of the requested slice within the current page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Record that this page is past the end so later
                # getpage() calls for it return empty results
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
81c2f20b
PH
2744
2745
class InAdvancePagedList(PagedList):
    """PagedList with total number of pages known in advance"""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        skip_elems = start - start_page * self._pagesize  # offset into the first page
        only_more = None if end is None else end - start  # entries still wanted
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
            skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    # Enough entries collected; trim and stop
                    yield from page_results[:only_more]
                    break
            yield from page_results
9c44d242
PH
2770
2771
def uppercase_escape(s):
    """Expand literal `\\UXXXXXXXX` escape sequences into the characters they denote."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda match: decode(match.group(0))[0],
        s)
0fe2ff78
YCH
2778
2779
def lowercase_escape(s):
    """Expand literal `\\uXXXX` escape sequences into the characters they denote."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda match: decode(match.group(0))[0],
        s)
b53466e1 2786
d05cfe06
S
2787
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # All gen-delims/sub-delims plus '%' stay untouched
    return urllib.parse.quote(s, safe=b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2791
2792
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    # Consistency: use urllib.parse directly like escape_rfc3986 above,
    # instead of the compat_* alias
    url_parsed = urllib.parse.urlparse(url)
    return url_parsed._replace(
        # IDNA-encode the host so non-ASCII domains become valid ASCII
        # NOTE(review): ''.encode('idna') raises for URLs without a netloc —
        # pre-existing behavior, unchanged here
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
2803
62e609ab 2804
def parse_qs(url):
    """Return the query string of `url` parsed into a dict of value lists."""
    # Consistency: use urllib.parse directly instead of the compat_* aliases
    return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
2807
2808
62e609ab
PH
def read_batch_urls(batch_fd):
    """Read URLs from a file-like object, skipping comments ('#', ';', ']'),
    stripping BOMs and trailing '<space>#...' comments. Closes the fd."""
    def fixup(url):
        if not isinstance(url, str):  # modernized from compat_str
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        return re.split(r'\s#', url, maxsplit=1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2826
2827
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    # Consistency: use urllib.parse directly instead of the compat_* alias
    return urllib.parse.urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
2830
2831
def update_url_query(url, query):
    """Return `url` with the key/value pairs of `query` merged into its query string."""
    if not query:
        return url
    # Consistency: use urllib.parse directly instead of the compat_* aliases
    parsed_url = urllib.parse.urlparse(url)
    qs = urllib.parse.parse_qs(parsed_url.query)
    qs.update(query)
    return urllib.parse.urlunparse(parsed_url._replace(
        query=urllib.parse.urlencode(qs, True)))
16392824 2840
8e60dc75 2841
ed0291d1
S
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Clone a urllib Request, optionally overriding its URL, body, headers
    and query string, while preserving the HTTP verb (via the project's
    HEADRequest/PUTRequest subclasses), origin host, unverifiable flag and timeout."""
    req_headers = req.headers.copy()
    req_headers.update(headers)
    # Falls back to the original data/URL when no override is given
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2860
2861
10c87c15 2862def _multipart_encode_impl(data, boundary):
0c265486
YCH
2863 content_type = 'multipart/form-data; boundary=%s' % boundary
2864
2865 out = b''
2866 for k, v in data.items():
2867 out += b'--' + boundary.encode('ascii') + b'\r\n'
2868 if isinstance(k, compat_str):
0f06bcd7 2869 k = k.encode()
0c265486 2870 if isinstance(v, compat_str):
0f06bcd7 2871 v = v.encode()
0c265486
YCH
2872 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2873 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 2874 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
2875 if boundary.encode('ascii') in content:
2876 raise ValueError('Boundary overlaps with data')
2877 out += content
2878
2879 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2880
2881 return out, content_type
2882
2883
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    has_specified_boundary = boundary is not None

    # Retry with fresh random boundaries until none collides with the data
    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            if has_specified_boundary:
                raise
            boundary = None
2912
2913
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Return the first non-None (and, unless skip_false_values is False,
    truthy) value among the given key(s), else `default`."""
    for key in variadic(key_or_keys):
        value = d.get(key)
        if value is None:
            continue
        if value or not skip_false_values:
            return value
    return default
cbecc9b9
S
2919
2920
def try_call(*funcs, expected_type=None, args=[], kwargs={}):
    """Call each of `funcs` in turn, returning the first result that neither
    raises a common lookup/type error nor fails the expected_type check."""
    for func in funcs:
        try:
            result = func(*args, **kwargs)
        except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
            continue
        if expected_type is None or isinstance(result, expected_type):
            return result
2930
2931
def try_get(src, getter, expected_type=None):
    # Apply each getter to src, returning the first result that does not raise
    # (and matches expected_type, if given)
    return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
329ca3be
S
2934
2935
def filter_dict(dct, cndn=lambda _, v: v is not None):
    """Return a copy of `dct` keeping only items for which cndn(key, value) is true
    (by default: values that are not None)."""
    return {key: value for key, value in dct.items() if cndn(key, value)}
2938
2939
6cc62232
S
def merge_dicts(*dicts):
    """Merge dicts left to right: earlier values win, except that None is
    never stored and an empty-string placeholder may be replaced by a later
    string value."""
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if v is None:
                continue
            if k not in merged or (isinstance(v, str) and merged[k] == ''):
                merged[k] = v
    return merged
2948
2949
8e60dc75
S
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    # Coerce bytes (or anything non-str) to str; note the default encoding is
    # evaluated once, at module import time
    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2952
16392824 2953
a1a530b0
PH
# US (MPAA-style) movie ratings mapped to a minimum viewer age
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
2961
2962
# US TV Parental Guidelines ratings mapped to a minimum viewer age
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2971
2972
def parse_age_limit(s):
    """Parse an age limit from an int or a rating string; returns 0-21 or None."""
    # isinstance(False, int) is True. So type() must be used instead
    if type(s) is int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, str):
        return None
    # Plain '18' or '18+' style
    mobj = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if mobj:
        return int(mobj.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    # 'TV-14', 'TV_14' or 'TV14' style
    mobj = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if mobj:
        return TV_PARENTAL_GUIDELINES['TV-' + mobj.group(1)]
    return None
146c80e2
S
2989
2990
def strip_jsonp(code):
    # Unwrap a JSONP response (optionally 'window.'-prefixed, optionally
    # 'fn && fn(...)' guarded), leaving only the callback payload
    return re.sub(
        r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$''',
        r'\g<callback_data>', code)
478c2c61
PH
2999
3000
def js_to_json(code, vars={}):
    """Convert JavaScript object notation into (mostly) valid JSON.

    Handles comments, single quotes, hex/octal integers, unquoted keys,
    trailing commas, `undefined`/`void 0` and `!`-prefixed expressions.
    """
    # vars is a dict of var, val pairs to substitute
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
    # (pattern, numeric base) pairs for integer literals, optionally used as keys
    INTEGER_TABLE = (
        (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
        (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
    )

    def fix_kv(m):
        # Rewrite a single matched token into its JSON equivalent
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            return ""

        if v[0] in ("'", '"'):
            # Normalize string escapes to their JSON forms
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    # Integers used as keys must be quoted in JSON
                    return '"%d":' % i if v.endswith(':') else '%d' % i

        if v in vars:
            return vars[v]

        return '"%s"' % v

    # 'new Date("...")' is replaced by just the date string
    code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
e05f6939
PH
3049
3050
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality rank; unknown ids rank lowest
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
3059
acd69589 3060
# Recognized stages at which a postprocessor may be run — presumably in
# pipeline order; confirm against the consumers of this tuple
POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist')
1e43a6f7 3062
3063
# Default output filename templates, keyed by output type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Recognized output template types; values look like default filename
# suffixes (None where no fixed default applies) — confirm against the
# consumers of this map
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
0a871f68 3081
# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# Template regex for %-style format specifiers; '{0}' is filled with the
# allowed key pattern and '{1}' with the allowed conversion-type pattern
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])? # unused in python
        {1} # conversion type
    )
'''


# Conversion types accepted when STR_FORMAT_RE_TMPL is instantiated
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3100
7d1eb38a 3101
a020a0dc
PH
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
3110
3111
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
48844745
PH
3114
3115
def is_outdated_version(version, limit, assume_new=True):
    """Compare two version strings; unparseable input yields `not assume_new`."""
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
732ea2f0
PH
3123
3124
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # imported here, presumably to avoid an import cycle at module load — confirm
    from .update import is_non_updateable

    return not is_non_updateable()
7d4111ed
PH
3131
3132
def args_to_str(args):
    # Get a short string representation for a subprocess command
    # (intended for display/logging, not for passing to a shell)
    return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
3136
3137
def error_to_compat_str(err):
    """Stringify an exception (kept for backward compatibility)."""
    return str(err)
fdae2358
S
3140
3141
def error_to_str(err):
    """Format an exception as 'TypeName: message'."""
    return f'{err.__class__.__name__}: {err}'
3144
3145
def mimetype2ext(mt):
    """Guess a file extension from a MIME type (None-safe).

    Tries an exact full-type map, then the (lowercased) subtype, then any
    '+suffix'; falls back to the subtype with '+' replaced by '.'.
    """
    if mt is None:
        return None

    # Drop any parameters, e.g. 'text/html; charset=utf-8' -> 'text/html'
    mt = mt.partition(';')[0].strip()

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }

    subtype = mt.rpartition('/')[2]
    suffix = subtype.partition('+')[2]
    for mapping, key in ((FULL_MAP, mt), (SUBTYPE_MAP, subtype.lower()), (SUFFIX_MAP, suffix)):
        ext = mapping.get(key)
        if ext is not None:
            return ext

    return subtype.replace('+', '.')
c460bdd5
PH
3208
3209
2814f12b
THD
3210def ext2mimetype(ext_or_url):
3211 if not ext_or_url:
3212 return None
3213 if '.' not in ext_or_url:
3214 ext_or_url = f'file.{ext_or_url}'
3215 return mimetypes.guess_type(ext_or_url)[0]
3216
3217
def parse_codecs(codecs_str):
    """Split an RFC 6381 codecs string into vcodec/acodec/scodec/dynamic_range.

    @returns A dict with 'vcodec', 'acodec', 'dynamic_range' (and 'scodec'
             when a subtitle codec was found), or {} if nothing was recognized.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec, scodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        parts = full_codec.split('.')
        # Leading zeros are stripped so e.g. 'avc01' matches 'avc1'
        codec = parts[0].replace('0', '')
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                     'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            if not vcodec:
                # vp9/av1/hvc1 carry profile info in up to 4 dot-parts
                vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
                if codec in ('dvh1', 'dvhe'):
                    hdr = 'DV'
                elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
                    hdr = 'HDR10'
                elif full_codec.replace('0', '').startswith('vp9.2'):
                    hdr = 'HDR10'
        elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        elif codec in ('stpp', 'wvtt',):
            if not scodec:
                scodec = full_codec
        else:
            write_string(f'WARNING: Unknown codec {full_codec}\n')
    if vcodec or acodec or scodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'scodec': scodec} if scodec is not None else {}),
        }
    elif len(split_codecs) == 2:
        # Exactly two unrecognized codecs: assume video + audio order
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3259
3260
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for a response: the Content-Disposition
    filename takes precedence, then the Content-Type MIME mapping."""
    cd = url_handle.headers.get('Content-Disposition')
    if cd:
        mobj = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if mobj:
            ext = determine_ext(mobj.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(url_handle.headers.get('Content-Type'))
05900629
PH
3273
3274
1e399778
YCH
def encode_data_uri(data, mime_type):
    """Build a base64 'data:' URI from raw bytes and a MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
3277
3278
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # No viewer limit set, or content available for everyone
        return False
    return age_limit < content_limit
61ca9a80
PH
3287
3288
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # BOM prefixes are checked in this order so utf-32 wins over utf-16
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    decoded = next(
        (first_bytes[len(bom):].decode(enc, 'replace')
         for bom, enc in BOMS if first_bytes.startswith(bom)),
        None)
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', decoded)
a055469f
PH
3307
3308
def determine_protocol(info_dict):
    """Return the protocol of info_dict, inferring it from the URL if absent."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    # Prefix-based protocols (covers e.g. rtmpe/rtmpt as 'rtmp')
    for proto in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(proto):
            return proto

    # Manifest formats double as protocol names
    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
3329
3330
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned """
    def width(string):
        # Visible width: ignore terminal escape sequences and tab markers
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Keep only the columns whose filter entry is truthy (missing = keep)
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # With hide_empty, columns that are empty in all data rows are dropped
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        # Insert a delimiter row between header and data
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # Pad at the tab position so the trailing text right-aligns
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
347de493
PH
3361
3362
def _match_one(filter_part, dct, incomplete):
    """Evaluate a single 'key op value' or unary '[!]key' condition against dct."""
    # TODO: Generalize code with YoutubeDL._build_format_filter
    STRING_OPERATORS = {
        '*=': operator.contains,
        '^=': lambda attr, value: attr.startswith(value),
        '$=': lambda attr, value: attr.endswith(value),
        '~=': lambda attr, value: re.search(value, attr),
    }
    COMPARISON_OPERATORS = {
        **STRING_OPERATORS,
        '<=': operator.le,  # "<=" must be defined above "<"
        '<': operator.lt,
        '>=': operator.ge,
        '>': operator.gt,
        '=': operator.eq,
    }

    # `incomplete` is either a bool (all/none keys may be missing)
    # or a collection of the keys expected to be missing
    if isinstance(incomplete, bool):
        is_incomplete = lambda _: incomplete
    else:
        is_incomplete = lambda k: k in incomplete

    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
            (?P<strval>.+?)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        m = m.groupdict()
        unnegated_op = COMPARISON_OPERATORS[m['op']]
        if m['negation']:
            op = lambda attr, value: not unnegated_op(attr, value)
        else:
            op = unnegated_op
        # NOTE(review): 'intval' is not a group of operator_rex; the alternation
        # guarantees one of the string groups matches, so it is never evaluated
        comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
        if m['quote']:
            # Unescape the quote character inside quoted values
            comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
        actual_value = dct.get(m['key'])
        numeric_comparison = None
        if isinstance(actual_value, (int, float)):
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/ytdl-org/youtube-dl/issues/11082)
            try:
                numeric_comparison = int(comparison_value)
            except ValueError:
                numeric_comparison = parse_filesize(comparison_value)
                if numeric_comparison is None:
                    numeric_comparison = parse_filesize(f'{comparison_value}B')
                if numeric_comparison is None:
                    numeric_comparison = parse_duration(comparison_value)
        if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
            raise ValueError('Operator %s only supports string values!' % m['op'])
        if actual_value is None:
            # Missing keys pass when marked incomplete or when '?' was used
            return is_incomplete(m['key']) or m['none_inclusive']
        return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)

    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if is_incomplete(m.group('key')) and actual_value is None:
            return True
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
3443
3444
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax.
    @returns Whether the filter passes
    @param incomplete Set of keys that is expected to be missing from dct.
    Can be True/False to indicate all/none of the keys may be missing.
    All conditions on incomplete keys pass if the key is missing
    """
    # Conditions are '&'-separated; a literal '&' can be escaped as '\&'
    return all(
        _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
        for filter_part in re.split(r'(?<!\\)&', filter_str))
347de493
PH
3455
3456
def match_filter_func(filters):
    """Build a match-filter callable from one or more filter expressions.

    Returns None when no filters are given. A lone '-' among the filters
    makes passing entries return NO_DEFAULT instead of None — presumably
    used by the caller for interactive confirmation; confirm there.
    The callable returns None/NO_DEFAULT on pass, or a skip message string.
    """
    if not filters:
        return None
    filters = set(variadic(filters))

    interactive = '-' in filters
    if interactive:
        filters.remove('-')

    def _match_func(info_dict, incomplete=False):
        # An entry passes if any single filter expression matches
        if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
            return NO_DEFAULT if interactive and not incomplete else None
        else:
            video_title = info_dict.get('title') or info_dict.get('id') or 'video'
            filter_str = ') | ('.join(map(str.strip, filters))
            return f'{video_title} does not pass filter ({filter_str}), skipping ..'
    return _match_func
91410c9b
PH
3474
3475
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float), or None."""
    if not time_expr:
        return

    # Plain offset, e.g. '12.5' or '12.5s'
    mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    # Clock format H:MM:SS[.ff] (a ':' before the fraction also counts as '.')
    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        hours, minutes, seconds = mobj.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
bf6427d2
YCH
3487
3488
def srt_subtitles_timecode(seconds):
    # SRT timestamps use a comma before the milliseconds: HH:MM:SS,mmm
    return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3491
3492
def ass_subtitles_timecode(seconds):
    # ASS timestamps have centisecond precision: H:MM:SS.cc
    time = timetuple_from_msec(seconds * 1000)
    return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
bf6427d2
YCH
3496
3497
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    # Older TTML namespaces that are rewritten to the current ones below
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}
    default_style = {}

    # Streaming parser target that converts one <p> subtree into SRT-style
    # markup (<b>/<i>/<u>/<font>)
    class TTMLPElementParser:
        # NOTE(review): these are class-level attributes mutated via append/pop;
        # a style push without a rendered tag is never popped and could leak
        # across parser instances — confirm
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip styling already applied by an enclosing element
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close the tags opened by the matching start() in reverse order
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Re-serialize the node and feed it through the TTML-to-SRT converter
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    # Normalize legacy namespaces before parsing
    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve <style> elements, repeating until parent styles are available
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # A style on <body> or <div> becomes the document default
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Fall back to begin + dur when no explicit end is given
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3660
3661
66e289ba
S
def cli_option(params, command_option, param):
    """Return ``[command_option, value]`` for a string-valued option.

    Looks up `param` in `params`; returns [] when it is unset (None).
    The value is always coerced to str so the resulting entry is a valid
    subprocess argument.
    """
    param = params.get(param)
    # Convert any non-None value. The previous code only converted truthy
    # values, so falsy non-None ones (e.g. 0) leaked into the command list
    # unconverted, which subprocess would reject.
    return [command_option, str(param)] if param is not None else []
3667
3668
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Map a boolean option onto CLI arguments.

    Returns [] when the option is unset; otherwise either
    ``[command_option, value]`` or ``[command_option<separator>value]``
    when a separator is given. The stored value must be a bool.
    """
    param = params.get(param)
    if param is None:
        return []
    assert isinstance(param, bool)
    value = true_value if param else false_value
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
3677
3678
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return ``[command_option]`` when the parameter equals expected_value, else []."""
    return [command_option] if params.get(param) == expected_value else []
3682
3683
def cli_configuration_args(argdict, keys, default=[], use_compat=True):
    """Resolve extra-argument lists from a per-key argument dict.

    `argdict` maps lowercase keys to argument lists; `keys` is searched in
    order and the first key (or key group) with any match wins. A plain
    list/tuple `argdict` is the legacy format and is returned as-is when
    `use_compat` is set. Note: `default` is never mutated, so the mutable
    default value is safe here.
    """
    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_group in keys:
        matches = [
            args for args in (argdict.get(key.lower()) for key in variadic(key_group))
            if args is not None]
        if matches:
            # Flatten all matching argument lists into one.
            return [arg for args in matches for arg in args]
    return default
66e289ba 3702
6251555f 3703
def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
    """Build the ordered key list for cli_configuration_args.

    The root key is the executable name, or ``main_key+exe`` when they
    differ; `keys` supplies optional suffixes. When the bare root key is
    among the candidates, broader fallbacks (``(main_key, exe)`` and
    'default') are appended; otherwise legacy list handling is disabled.
    """
    main_key = main_key.lower()
    exe = exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    search_keys = [f'{root_key}{suffix}' for suffix in (keys or [''])]
    if root_key not in search_keys:
        use_compat = False
    else:
        if main_key != exe:
            search_keys.append((main_key, exe))
        search_keys.append('default')
    return cli_configuration_args(argdict, search_keys, default, use_compat)
3715
66e289ba 3716
class ISO639Utils:
    """Conversions between two-letter (ISO 639-1) and three-letter
    (ISO 639-2/T) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T.

        Only the first two characters are significant, so locale-style
        codes such as 'en-US' resolve via their language part.
        """
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1.

        Returns None when no two-letter equivalent is known.
        """
        return next(
            (short for short, long_code in cls._lang_map.items() if long_code == code),
            None)
3920
3921
class ISO3166Utils:
    """Lookup of full country names from ISO 3166-1 alpha-2 codes."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name.

        The lookup is case-insensitive; returns None for unknown codes.
        """
        key = code.upper()
        return cls._country_map.get(key)
4180
4181
class GeoUtils:
    """Helpers for geo-restriction bypass: pick a random IPv4 address
    inside a country's major address block (used for X-Forwarded-For)."""
    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random IPv4 address (dotted string) from the given CIDR
        block, or from the country's block when a 2-letter code is given.
        Returns None for an unknown country code.
        """
        # A 2-character argument is treated as a country code; anything
        # longer is assumed to already be a CIDR block string.
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        else:
            block = code_or_block
        addr, preflen = block.split('/')
        # addr_min is the network address as a 32-bit int; addr_max fills
        # all host bits with 1s (the broadcast address of the block).
        addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
        addr_max = addr_min | (0xffffffff >> int(preflen))
        return compat_str(socket.inet_ntoa(
            compat_struct_pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
4440
4441
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets individual requests override the proxy via
    the internal 'Ytdl-request-proxy' header."""

    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            # The defaults in the lambda (proxy, type, meth) bind the current
            # loop values at definition time -- required to avoid the
            # late-binding closure pitfall.
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy (set via the internal header) takes precedence
        # over the handler-level default; the header is stripped so it is
        # never sent over the wire.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            # SOCKS proxying is handled later by the http/https handlers;
            # just tag the request and let the default chain continue.
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5bc880b9
YCH
4465
4466
0a5445dd
YCH
4467# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4468# released into Public Domain
4469# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4470
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        # int.to_bytes with the exact byte length replaces the original
        # hand-rolled 32-bit chunk loop plus leading-zero stripping.
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        # Matches the original behavior: n == 0 (and non-positive n)
        # yields a single zero byte before padding.
        s = b'\000'
    # Pad the front with zero bytes up to a multiple of blocksize.
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4499
4500
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # int.from_bytes handles arbitrary lengths directly; no need for the
    # original zero-padding to 4-byte alignment and 32-bit unpack loop.
    return int.from_bytes(s, 'big')
4516
4517
5bc880b9
YCH
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The plaintext is interpreted little-endian: reverse the bytes, then
    # read the hex representation as a big-endian integer.
    plaintext_int = int(binascii.hexlify(data[::-1]), 16)
    ciphertext_int = pow(plaintext_int, exponent, modulus)
    return '%x' % ciphertext_int
81bdc8fd
YCH
4533
4534
f48409c7
YCH
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    """
    # PKCS#1 needs at least 11 overhead bytes: 0x00, 0x02, >= 8 random
    # non-zero-ish pad bytes, and a 0x00 separator.
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    pad_size = length - len(data) - 3
    padding = [random.randint(0, 254) for _ in range(pad_size)]
    return [0, 2] + padding + [0] + data
4548
4549
def encode_base_n(num, n, table=None):
    """Encode a non-negative integer `num` in base `n` using `table` as the
    digit alphabet (defaults to 0-9a-zA-Z truncated to `n` digits).

    Raises ValueError when the base exceeds the table length or when `num`
    is negative.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    # Guard against negative input: with floor division, num // n converges
    # to -1 and stays there, so the original loop never terminated.
    if num < 0:
        raise ValueError('num must be non-negative')

    if num == 0:
        return table[0]

    ret = ''
    while num:
        ret = table[num % n] + ret
        num = num // n
    return ret
f52354a8
YCH
4566
4567
def decode_packed_codes(code):
    """Decode JavaScript obfuscated with Dean Edwards' "packer":
    rebuild the symbol table and substitute each placeholder word."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    # Placeholder -> original symbol; an empty entry means the placeholder
    # stands for itself.
    symbol_table = {}
    for idx in range(count - 1, -1, -1):
        placeholder = encode_base_n(idx, base)
        symbol_table[placeholder] = symbols[idx] or placeholder

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
e154c651 4584
4585
1ced2221
S
def caesar(s, alphabet, shift):
    """Apply a Caesar shift of `shift` positions over `alphabet` to each
    character of `s`; characters outside the alphabet pass through."""
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(ch):
        if ch not in alphabet:
            return ch
        return alphabet[(alphabet.index(ch) + shift) % size]

    return ''.join(map(rotate, s))
4593
4594
def rot47(s):
    """Apply the ROT47 substitution cipher (Caesar shift of 47 over the
    94 printable ASCII characters)."""
    printable = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, printable, 47)
4597
4598
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=value,KEY="quoted,value",...')
    into a dict, stripping surrounding quotes from quoted values."""
    # Duplicate keys keep the last occurrence, same as the original loop.
    return {
        key: val[1:-1] if val.startswith('"') else val
        for key, val in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    }
1143535d
YCH
4606
4607
def urshift(val, n):
    """Unsigned 32-bit right shift (JavaScript's ``>>>`` operator):
    negative values are reinterpreted as their 32-bit two's complement."""
    if val >= 0:
        return val >> n
    return (val + 0x100000000) >> n
d3f8e038
YCH
4610
4611
4612# Based on png2str() written by @gdkchan and improved by @yokrysty
067aa17e 4613# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
d3f8e038
YCH
def decode_png(png_data):
    """Decode a (truecolor, 8-bit, non-interlaced) PNG byte string into
    (width, height, pixels) where pixels is a list of rows of channel
    bytes. Raises OSError on invalid input.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise OSError('Not a valid PNG file.')

    # Big-endian integer decoder selected by field width (1/2/4 bytes).
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Walk the chunk stream: 4-byte length, 4-byte type, payload, 4-byte CRC.
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # The first chunk is IHDR (guaranteed by the signature check above).
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    idat = b''

    # Image data may be split across multiple IDAT chunks; concatenate
    # before decompressing.
    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise OSError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # 3 bytes per pixel (RGB); each scanline is prefixed by one filter byte.
    stride = width * 3
    pixels = []

    def _get_pixel(idx):
        # Map a flat byte index back to (row, column) in the decoded rows.
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # Neighbors used by the filters: the corresponding byte of the
            # previous pixel (3 bytes back) and of the row above.
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                # c is the upper-left neighbor byte, when it exists.
                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Pick the predictor closest to p (ties favor a, then b).
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
efa97bdc
YCH
4717
4718
def write_xattr(path, key, value):
    """Set the extended attribute `key` (bytes `value`) on file `path`.

    Tries, in order: NTFS Alternate Data Streams on Windows, the
    xattr/pyxattr Python modules, then the setfattr/xattr executables.
    Raises XAttrMetadataError on failure, XAttrUnavailableError when no
    mechanism is available.
    """
    # Windows: Write xattrs to NTFS Alternate Data Streams:
    # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
    if compat_os_name == 'nt':
        assert ':' not in key
        assert os.path.exists(path)

        try:
            with open(f'{path}:{key}', 'wb') as f:
                f.write(value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 1. Use xattrs/pyxattrs modules
    from .dependencies import xattr

    setxattr = None
    if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
        # Unicode arguments are not supported in pyxattr until version 0.5.0
        # See https://github.com/ytdl-org/youtube-dl/issues/5498
        if version_tuple(xattr.__version__) >= (0, 5, 0):
            setxattr = xattr.set
    elif xattr:
        setxattr = xattr.setxattr

    if setxattr:
        try:
            setxattr(path, key, value)
        except OSError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        return

    # UNIX Method 2. Use setfattr/xattr executables
    exe = ('setfattr' if check_executable('setfattr', ['--version'])
           else 'xattr' if check_executable('xattr', ['-h']) else None)
    if not exe:
        raise XAttrUnavailableError(
            'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
            + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))

    # The executables take the value as a command-line string argument.
    value = value.decode()
    try:
        p = Popen(
            [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    except OSError as e:
        raise XAttrMetadataError(e.errno, e.strerror)
    stderr = p.communicate_or_kill()[1].decode('utf-8', 'replace')
    if p.returncode:
        raise XAttrMetadataError(p.returncode, stderr)
0c265486
YCH
4770
4771
def random_birthday(year_field, month_field, day_field):
    """Return a dict mapping the given field names to the parts of a random date.

    The date is drawn uniformly between 1950-01-01 and 1995-12-31; all values
    are strings (no zero padding).
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    chosen = earliest + datetime.timedelta(random.randint(0, (latest - earliest).days))
    return {
        field: str(part)
        for field, part in (
            (year_field, chosen.year),
            (month_field, chosen.month),
            (day_field, chosen.day),
        )
    }
732044af 4782
c76eb41b 4783
# Templates for internet shortcut files, which are plain text files.

# Windows .url shortcut (INI format); %(url)s is filled in by the caller.
DOT_URL_LINK_TEMPLATE = '''\
[InternetShortcut]
URL=%(url)s
'''

# macOS .webloc shortcut (XML property list).
DOT_WEBLOC_LINK_TEMPLATE = '''\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''

# freedesktop.org .desktop link; takes both %(filename)s and %(url)s.
DOT_DESKTOP_LINK_TEMPLATE = '''\
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''

# Maps the --write-link format names to their templates.
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4815
732044af 4816
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    Raises ValueError for IPv6 (bracketed) netlocs, which are not supported.
    """

    # Consistency: use urllib.parse directly, like the quote/urlunparse calls below
    # (was the legacy compat_urllib_parse_urlparse alias)
    iri_parts = urllib.parse.urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.

    net_location = ''
    if iri_parts.username:
        net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    net_location += iri_parts.hostname.encode('idna').decode()  # Punycode for Unicode hostnames.
    # The 'idna' encoding produces ASCII text.
    if iri_parts.port is not None and iri_parts.port != 80:
        net_location += ':' + str(iri_parts.port)

    return urllib.parse.urlunparse(
        (iri_parts.scheme,
         net_location,

         urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

         # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
         urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

         # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
         urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

         urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))

    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4859
4860
def to_high_limit_path(path):
    """On Windows, prefix the absolute path with '\\\\?\\' to bypass MAX_PATH.

    The maximum allowed length for the individual path segments may still be
    quite limited. Non-Windows platforms get the path back unchanged.
    """
    if sys.platform not in ('win32', 'cygwin'):
        return path
    return '\\\\?\\' + os.path.abspath(path)
76d321f6 4867
c76eb41b 4868
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    """Look up `field` in `obj` and render it with `template`.

    Returns `default` when the value is one of `ignore`; otherwise the value
    (optionally transformed by `func`) is interpolated into `template`.
    """
    val = traverse_obj(obj, *variadic(field))
    if val in ignore:
        return default
    if func:
        val = func(val)
    return template % val
00dd0cd5 4874
4875
def clean_podcast_url(url):
    """Strip known podcast tracking/analytics redirect prefixes from `url`."""
    tracker_prefix_re = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(tracker_prefix_re, '', url)
ffcb8191
THD
4891
4892
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Generate a random string in UUID version-4 format."""
    # Each 'x'/'y' placeholder becomes a random hex digit; '4' marks version 4
    return re.sub(r'[xy]', lambda _: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 4898
4899
def make_dir(path, to_screen=None):
    """Create the parent directory of `path` if it does not already exist.

    @param path       File path whose parent directory should exist
    @param to_screen  Optional callable used to report an error message
    Returns True on success (or when no directory is needed), False on failure.
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except OSError as err:
        # BUGFIX: was `if callable(to_screen) is not None:`, which is always
        # truthy (callable() returns a bool) and crashed when to_screen is None
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
f74980cb 4910
4911
def get_executable_path():
    """Return the absolute base directory yt-dlp is being run from.

    Handles the PyInstaller-frozen and zip-bundled cases as well as running
    from a plain source checkout.
    """
    from zipimport import zipimporter
    if hasattr(sys, 'frozen'):  # Running from PyInstaller
        base = os.path.dirname(sys.executable)
    elif isinstance(__loader__, zipimporter):  # Running from ZIP
        base = os.path.join(os.path.dirname(__file__), '../..')
    else:
        base = os.path.join(os.path.dirname(__file__), '..')
    return os.path.abspath(base)
4921
4922
def load_plugins(name, suffix, namespace):
    """Load classes whose names end in `suffix` from `ytdlp_plugins/<name>`.

    Found classes are injected into `namespace` (skipping names already
    present) and also returned as a dict. Missing plugin dirs are ignored.
    """
    classes = {}
    with contextlib.suppress(FileNotFoundError):
        spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        module = importlib.util.module_from_spec(spec)
        sys.modules[spec.name] = module
        spec.loader.exec_module(module)
        # NOTE: the original used `name` for the loop variable, shadowing the parameter
        for attr in dir(module):
            if attr in namespace or not attr.endswith(suffix):
                continue
            classes[attr] = namespace[attr] = getattr(module, attr)
    return classes
06167fbb 4939
4940
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a:
                            - None:     Do nothing
                            - string:   A dictionary key
                            - int:      An index into a list
                            - tuple:    A list of keys all of which will be traversed
                            - Ellipsis: Fetch all values in the object
                            - Function: Takes the key and value as arguments
                                        and returns whether the key matches or not
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    # TODO: Write tests
    '''
    if not casesense:
        # Lowercase all string keys in every path up front
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        # `depth` (nonlocal) records how many branching keys (..., tuple,
        # callable) were hit, i.e. how deeply the result lists are nested
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            # None key, or nothing left to traverse: stop here
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                # Traverse every alternative key, then flatten via `...`
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                # Branch into all values of the current object
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                # Filter (key, value) pairs through the predicate
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    obj = str(obj)
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                # Plain dict lookup; falls back to a linear scan when casesense=False
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    # Convert user-supplied strings into int indices or slices
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        # `:` is equivalent to `...` here
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    # Normalize expected_type into a value -> value-or-None filter
    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    elif expected_type is not None:
        type_test = expected_type
    else:
        type_test = lambda val: val

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # Branching occurred: flatten the nested result lists and
                # drop values rejected by type_test
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
324ad820 5039
5040
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated thin wrapper around traverse_obj() kept for compatibility."""
    message = ('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
               'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    write_string(message)
    return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
6606817a 5045
5046
def get_first(obj, keys, **kwargs):
    """Return the first value matching `keys` from any item of `obj`."""
    path = (..., *variadic(keys))
    return traverse_obj(obj, path, **kwargs, get_all=False)
5049
5050
def variadic(x, allowed_types=(str, bytes, dict)):
    """Return `x` as-is if it is a non-atomic iterable, else wrap it in a tuple.

    `allowed_types` lists iterables that should nonetheless be treated as
    single values (strings, bytes and dicts by default).
    """
    if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types):
        return x
    return (x,)
bd50a52b
THD
5053
5054
3e9b66d7
LNO
def decode_base(value, digits):
    """Convert the base-N string `value` to an integer, N = len(digits).

    @param value   Digit string, most significant digit first
    @param digits  The digit alphabet; its length defines the base
    Raises KeyError if `value` contains a character not in `digits`.
    """
    table = {char: index for index, char in enumerate(digits)}
    base = len(digits)
    result = 0
    # Renamed the loop variable from `chr`, which shadowed the builtin
    for char in value:
        result = result * base + table[char]
    return result
5064
5065
def time_seconds(**kwargs):
    """Return the current epoch timestamp, evaluated in the UTC-offset
    timezone built from the given datetime.timedelta keyword arguments."""
    tz = datetime.timezone(datetime.timedelta(**kwargs))
    return datetime.datetime.now(tz).timestamp()
5069
5070
49fa4d9a
N
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers=None):
    """Create an HS256-signed JWT (JWS Compact Serialization) as bytes.

    @param payload_data  JSON-serializable payload (claims)
    @param key           Shared secret string used for the HMAC signature
    @param headers       Optional extra header fields merged over alg/typ
    """
    # NOTE: default changed from the mutable `headers={}` to None (same behavior)
    header_data = {
        'alg': 'HS256',
        'typ': 'JWT',
    }
    if headers:
        header_data.update(headers)
    header_b64 = base64.b64encode(json.dumps(header_data).encode())
    payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
    h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
    signature_b64 = base64.b64encode(h.digest())
    return header_b64 + b'.' + payload_b64 + b'.' + signature_b64
819e0531 5088
5089
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Decode and return the payload of a JWT without verifying its signature."""
    header_b64, payload_b64, signature_b64 = jwt.split('.')
    # BUGFIX: JWTs use unpadded base64url, but urlsafe_b64decode requires
    # correct '=' padding — restore it before decoding
    payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}{"=" * (-len(payload_b64) % 4)}'))
    return payload_data
5095
5096
def supports_terminal_sequences(stream):
    """Whether `stream` appears to be a terminal that accepts ANSI sequences."""
    if compat_os_name == 'nt':
        from .compat import WINDOWS_VT_MODE  # Must be imported locally
        # VT processing needs Windows 10 TH2 (build 10586) or later
        vt_available = WINDOWS_VT_MODE and get_windows_version() >= (10, 0, 10586)
        if not vt_available:
            return False
    elif not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        # Streams without a working isatty() are treated as non-terminals
        return False
5108
5109
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Strip ANSI SGR (color/style) escape sequences from `string`."""
    return re.sub(_terminal_sequences_re, '', string)
5115
5116
def number_of_digits(number):
    """Length of `number` formatted as a decimal integer (sign included)."""
    # Keep '%d' rather than str(): it also truncates floats like the original
    rendered = '%d' % number
    return len(rendered)
34921b43 5119
5120
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the string forms of all truthy `values` with `delim`.

    When `from_dict` is given, each value is first used as a lookup key into it.
    """
    if from_dict is not None:
        values = map(from_dict.get, values)
    return delim.join(str(val) for val in values if val)
06e57990 5125
5126
27231526
ZM
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    max_dimensions = max(
        (tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats),
        default=(0, 0))
    if not max_dimensions[0]:
        # No format reports a width: nothing to scale against
        return thumbnails
    max_width = str(max_dimensions[0])
    return [
        merge_dicts(
            {'url': re.sub(url_width_re, max_width, thumb['url'])},
            dict(zip(_keys, max_dimensions)), thumb)
        for thumb in thumbnails
    ]
5147
5148
93c8410d
LNO
def parse_http_range(range):
    """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
    match = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range) if range else None
    if not match:
        return None, None, None
    start, end, total = match.groups()
    return int(start), int_or_none(end), int_or_none(total)
5157
5158
class Config:
    """A parsed configuration source (command line or config file).

    Instances form a tree: a Config created from the CLI args appends one
    child Config per `--config-locations` file it references (recursively),
    while `_loaded_paths` is shared across the tree to break cycles.
    """
    # Raw argument list this config was initialized from (None until init())
    own_args = None
    # Path of the config file these args came from (None for CLI args)
    filename = None
    # Guards against calling init() twice on the same instance
    __initialized = False

    def __init__(self, parser, label=None):
        self._parser, self.label = parser, label
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Parse `args`, loading any referenced config files recursively.

        Returns False (and does nothing) if `filename` was already loaded,
        True otherwise.
        """
        assert not self.__initialized
        directory = ''
        if filename:
            location = os.path.realpath(filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                # Already processed elsewhere in the tree - avoid loops
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        self.own_args, self.filename = args, filename
        for location in self._parser.parse_args(args)[0].config_locations or []:
            # Config paths are resolved relative to the file that names them
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self._parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        """Multi-line dump of this config and its children, secrets hidden."""
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read a config file and split it into an argument list.

        Returns `default` if the file cannot be opened.
        """
        try:
            optionf = open(filename)
        except OSError:
            return default  # silently skip if file is not present
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read()
            res = shlex.split(contents, comments=True)
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of `opts` with values of credential options redacted."""
        PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            # Redacts the '--opt=value' form
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        # Redacts the '--opt value' (two-token) form
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        """Create a child Config for a referenced file and attach it if new."""
        config = type(self)(self._parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """Yield all args, children first so own args take precedence last."""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.own_args or []

    def parse_args(self):
        """Parse the merged argument list of this config tree."""
        return self._parser.parse_args(self.all_args)
da42679b
LNO
5244
5245
class WebSocketsWrapper():
    """Wraps websockets module to use in non-async scopes"""
    # The active websocket connection (the result of conn.__aenter__)
    pool = None

    def __init__(self, url, headers=None, connect=True):
        # A private event loop drives all async operations synchronously
        self.loop = asyncio.new_event_loop()
        # XXX: "loop" is deprecated
        self.conn = websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        if connect:
            self.__enter__()
        # Ensure the connection and loop are torn down at interpreter exit
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        if not self.pool:
            self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        """Send a message over the websocket (blocking)."""
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        """Receive a message from the websocket (blocking)."""
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            self.loop.close()
            self._cancel_all_tasks(self.loop)

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        """Run the coroutine `main` to completion on `loop` and return its result."""
        if not asyncio.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            # Mirror asyncio.run()'s cleanup of async generators/executors
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        """Cancel all pending tasks on `loop` and report unhandled exceptions."""
        to_cancel = asyncio.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        # XXX: "loop" is removed in python 3.10+
        loop.run_until_complete(
            asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5315
5316
def merge_headers(*dicts):
    """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
    merged = {}
    for headers in dicts:
        for key, value in headers.items():
            # Title-casing normalizes e.g. 'user-agent' -> 'User-Agent'
            merged[key.title()] = value
    return merged
28787f16 5320
5321
class classproperty:
    """Descriptor like @property, but the getter receives the owning class."""

    def __init__(self, func):
        # Stored as `f` (public) to preserve the original attribute name
        self.f = func

    def __get__(self, instance, cls):
        return self.f(cls)
19a03940 5328
5329
def Namespace(**kwargs):
    """Build an immutable attribute namespace from the given keyword args."""
    cls = collections.namedtuple('Namespace', kwargs)
    return cls(**kwargs)
9b8ee23b 5332
5333
# Deprecated
# Boolean aliases for optional-dependency availability; kept for backwards
# compatibility — new code should check the modules in .dependencies directly
has_certifi = bool(certifi)
has_websockets = bool(websockets)