import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import datetime
import email.header
import email.utils
import errno
import gzip
import hashlib
import hmac
import html.entities
import html.parser
import http.client
import http.cookiejar
import importlib.util
import inspect
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import time
import traceback
import types
import unicodedata
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree
import zlib

from .compat import functools  # isort: split
from .compat import (
    compat_etree_fromstring,
    compat_expanduser,
    compat_HTMLParseError,
    compat_os_name,
    compat_shlex_quote,
)
from .dependencies import brotli, certifi, websockets, xattr
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in urllib.parse.uses_netloc:
            urllib.parse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()
IDENTITY = lambda x: x

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42
TIMEZONE_NAMES = {
    'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
    'AST': -4, 'ADT': -3,  # Atlantic (used in Canada)
    'EST': -5, 'EDT': -4,  # Eastern
    'CST': -6, 'CDT': -5,  # Central
    'MST': -7, 'MDT': -6,  # Mountain
    'PST': -8, 'PDT': -7   # Pacific
}

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
    '%d-%m-%Y %H:%M',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])
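
# These format lists are consumed by the date/timestamp parsers defined later in
# this module (e.g. unified_strdate); an illustrative sketch of the kind of
# strings they are meant to match (the values below are made up):
# >>> datetime.datetime.strptime('05/01/2023 10:30:00', '%d/%m/%Y %H:%M:%S')
# datetime.datetime(2023, 1, 5, 10, 30)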

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?})\s*</script>'

NUMBER_RE = r'\d+(?:\.\d+)?'


@functools.cache
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
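
# Illustrative sketch (not part of the original module; the namespace URL is an
# example value):
# >>> xpath_with_ns('media:thumbnail/media:url', {'media': 'http://search.yahoo.com/mrss/'})
# '{http://search.yahoo.com/mrss/}thumbnail/{http://search.yahoo.com/mrss/}url'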


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, str):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text
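
# Illustrative sketch (hypothetical document):
# >>> doc = xml.etree.ElementTree.fromstring('<root><title>Hello</title></root>')
# >>> xpath_text(doc, './title')
# 'Hello'
# >>> xpath_text(doc, './missing', default=None) is None
# True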


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html, **kwargs):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html, **kwargs)


def get_element_html_by_id(id, html, **kwargs):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html, **kwargs)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, **kwargs):
    retval = get_elements_by_attribute(attribute, value, html, **kwargs)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, **kargs):
    retval = get_elements_html_by_attribute(attribute, value, html, **kargs)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html, **kargs):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
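
# Illustrative sketch (made-up markup):
# >>> get_elements_by_class('chapter', '<div class="chapter">One</div><div class="chapter">Two</div>')
# ['One', 'Two']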


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags with the specified attribute in the passed HTML document as a list"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document as a list"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(html.parser.HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        html.parser.HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
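
# Illustrative sketch (made-up markup): returns (content, whole element)
# >>> get_element_text_and_html_by_tag('span', '<p><span>Hi <b>there</b></span></p>')
# ('Hi <b>there</b>', '<span>Hi <b>there</b></span>')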


class HTMLAttributeParser(html.parser.HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        html.parser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(html.parser.HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        html.parser.HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    """
    parser = HTMLAttributeParser()
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs
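
# Illustrative sketch (made-up element):
# >>> extract_attributes('<a href="/video?id=42" class="watch">Watch</a>')
# {'href': '/video?id=42', 'class': 'watch'}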


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
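
# Illustrative sketch:
# >>> clean_html('<p>First line<br>Second &amp; third</p>')
# 'First line\nSecond & third'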


class LenientJSONDecoder(json.JSONDecoder):
    def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs):
        self.transform_source, self.ignore_extra = transform_source, ignore_extra
        super().__init__(*args, **kwargs)

    def decode(self, s):
        if self.transform_source:
            s = self.transform_source(s)
        try:
            if self.ignore_extra:
                return self.raw_decode(s.lstrip())[0]
            return super().decode(s)
        except json.JSONDecodeError as e:
            if e.pos is not None:
                raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos)
            raise
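
# Illustrative sketch: with ignore_extra=True, trailing garbage after the JSON
# document is tolerated
# >>> LenientJSONDecoder(ignore_extra=True).decode('{"a": 1} trailing garbage')
# {'a': 1}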


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt

            # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout
            with contextlib.suppress(io.UnsupportedOperation):
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except OSError:
                stream = open(filename, open_mode)
            return stream, filename
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
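
# Illustrative sketch:
# >>> timeconvert('Mon, 01 Jan 2024 00:00:00 +0000')
# 1704067200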


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\':
            # Replace with their full-width unicode counterparts
            return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0))
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    if restricted and is_id is NO_DEFAULT:
        s = unicodedata.normalize('NFKC', s)
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = r'(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
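
# Illustrative sketch:
# >>> sanitize_filename('A/B: C', restricted=True)
# 'A_B_-_C'
# (without restricted=True, characters that are illegal in filenames, such as
#  "/" and ":", are instead replaced by their full-width Unicode counterparts)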


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url, *, scheme='http'):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url is None:
        return
    elif url.startswith('//'):
        return f'{scheme}:{url}'
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
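
# Illustrative sketch:
# >>> sanitize_url('//example.com/video')
# 'http://example.com/video'
# >>> sanitize_url('httpss://example.com/video')
# 'https://example.com/video'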


def extract_basic_auth(url):
    parts = urllib.parse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = urllib.parse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode())
    return url, f'Basic {auth_payload.decode()}'
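
# Illustrative sketch (made-up credentials):
# >>> extract_basic_auth('http://user:pass@example.com/feed')
# ('http://example.com/feed', 'Basic dXNlcjpwYXNz')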


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return urllib.request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable, *, lazy=False):
    """Remove all duplicates from the input iterable"""
    def _iter():
        seen = []  # Do not use set since the items can be unhashable
        for x in iterable:
            if x not in seen:
                seen.append(x)
                yield x

    return _iter() if lazy else list(_iter())
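
# Illustrative sketch: order of first occurrence is preserved
# >>> orderedSet([1, 2, 1, 3, 2])
# [1, 2, 3]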


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in html.entities.name2codepoint:
        return chr(html.entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon.
    # E.g. '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in html.entities.html5:
        return html.entities.html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed '
                        f'in a future version. Use "{__name__}.Popen.communicate_or_kill" instead')
    return Popen.communicate_or_kill(p, *args, **kwargs)


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    @staticmethod
    def _fix_pyinstaller_ld_path(env):
        """Restore LD_LIBRARY_PATH when using PyInstaller
            Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations
                 https://github.com/yt-dlp/yt-dlp/issues/4573
        """
        if not hasattr(sys, '_MEIPASS'):
            return

        def _fix(key):
            orig = env.get(f'{key}_ORIG')
            if orig is None:
                env.pop(key, None)
            else:
                env[key] = orig

        _fix('LD_LIBRARY_PATH')  # Linux
        _fix('DYLD_LIBRARY_PATH')  # macOS

    def __init__(self, *args, env=None, text=False, **kwargs):
        if env is None:
            env = os.environ.copy()
        self._fix_pyinstaller_ld_path(env)

        if text is True:
            kwargs['universal_newlines'] = True  # For 3.6 compatibility
            kwargs.setdefault('encoding', 'utf-8')
            kwargs.setdefault('errors', 'replace')
        super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        try:
            return self.communicate(*args, **kwargs)
        except BaseException:  # Including KeyboardInterrupt
            self.kill(timeout=None)
            raise

    def kill(self, *, timeout=0):
        super().kill()
        if timeout != 0:
            self.wait(timeout=timeout)

    @classmethod
    def run(cls, *args, timeout=None, **kwargs):
        with cls(*args, **kwargs) as proc:
            stdout, stderr = proc.communicate_or_kill(timeout=timeout)
            return stdout or '', stderr or '', proc.returncode
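
# Illustrative sketch (the command is an example value; decoded output relies on
# text=True as implemented above):
# >>> Popen.run(['echo', 'hi'], text=True, stdout=subprocess.PIPE)  # doctest: +SKIP
# ('hi\n', '', 0)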


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
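
# Illustrative sketch:
# >>> formatSeconds(3661)
# '1:01:01'
# >>> formatSeconds(75.5, msec=True)
# '1:15.500'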


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
        # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998
        context.set_ciphers('DEFAULT')

    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()

    client_certfile = params.get('client_certificate')
    if client_certfile:
        try:
            context.load_cert_chain(
                client_certfile, keyfile=params.get('client_certificate_key'),
                password=params.get('client_certificate_password'))
        except ssl.SSLError:
            raise YoutubeDLError('Unable to load client certificate')

    # Some servers may reject requests if ALPN extension is not sent. See:
    # https://github.com/python/cpython/issues/85140
    # https://github.com/yt-dlp/yt-dlp/issues/3878
    with contextlib.suppress(NotImplementedError):
        context.set_alpn_protocols(['http/1.1'])

    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    from .update import REPOSITORY

    msg = (f'please report this issue on https://github.com/{REPOSITORY}/issues?q= , '
           'filling out the appropriate issue template. Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception
        if isinstance(self.exc_info[1], ExtractorError):
            self.exc_info = self.exc_info[1].exc_info

        super().__init__(''.join((
            format_field(ie, None, '[%s] '),
            format_field(video_id, None, '%s: '),
            msg,
            format_field(cause, None, ' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class UserNotLive(ExtractorError):
    """Error when a channel/user is not live"""

    def __init__(self, msg=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg or 'The channel is not currently live', **kwargs)


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to work around _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers
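
# Illustrative sketch: the internal marker header disables compression by
# dropping Accept-Encoding and is itself removed
# >>> handle_youtubedl_headers({'User-Agent': 'UA', 'Accept-Encoding': 'gzip', 'Youtubedl-no-compression': '1'})
# {'User-Agent': 'UA'}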


class YoutubeDLHandler(urllib.request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        urllib.request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = http.client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters; however, this is not
        # always respected by websites - some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return super().do_request_(req)

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = urllib.request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode()
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        http.client.HTTPConnection, http.client.HTTPSConnection))

    url_components = urllib.parse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return urllib.parse.unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, http.client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        urllib.request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or http.client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        try:
            return self.do_open(
                functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs)
        except urllib.error.URLError as e:
            if (isinstance(e.reason, ssl.SSLError)
                    and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'):
                raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect')
            raise


def is_path_like(f):
    return isinstance(f, (str, bytes, os.PathLike))
1507
1508
ac668111 1509class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar):
f1a8511f
S
1510 """
1511 See [1] for cookie file format.
1512
1513 1. https://curl.haxx.se/docs/http-cookies.html
1514 """
e7e62441 1515 _HTTPONLY_PREFIX = '#HttpOnly_'
c380cc28
S
1516 _ENTRY_LEN = 7
1517 _HEADER = '''# Netscape HTTP Cookie File
7a5c1cfe 1518# This file is generated by yt-dlp. Do not edit.
c380cc28
S
1519
1520'''
1521 _CookieFileEntry = collections.namedtuple(
1522 'CookieFileEntry',
1523 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
e7e62441 1524
d76fa1f3 1525 def __init__(self, filename=None, *args, **kwargs):
1526 super().__init__(None, *args, **kwargs)
941e881e 1527 if is_path_like(filename):
d76fa1f3 1528 filename = os.fspath(filename)
1529 self.filename = filename
1530
24146491 1531 @staticmethod
1532 def _true_or_false(cndn):
1533 return 'TRUE' if cndn else 'FALSE'
1534
d76fa1f3 1535 @contextlib.contextmanager
1536 def open(self, file, *, write=False):
941e881e 1537 if is_path_like(file):
d76fa1f3 1538 with open(file, 'w' if write else 'r', encoding='utf-8') as f:
1539 yield f
1540 else:
1541 if write:
1542 file.truncate(0)
1543 yield file
1544
24146491 1545 def _really_save(self, f, ignore_discard=False, ignore_expires=False):
1546 now = time.time()
1547 for cookie in self:
1548 if (not ignore_discard and cookie.discard
1549 or not ignore_expires and cookie.is_expired(now)):
1550 continue
1551 name, value = cookie.name, cookie.value
1552 if value is None:
1553 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1554 # with no name, whereas http.cookiejar regards it as a
1555 # cookie with no value.
1556 name, value = '', name
1557 f.write('%s\n' % '\t'.join((
1558 cookie.domain,
1559 self._true_or_false(cookie.domain.startswith('.')),
1560 cookie.path,
1561 self._true_or_false(cookie.secure),
1562 str_or_none(cookie.expires, default=''),
1563 name, value
1564 )))
1565
1566 def save(self, filename=None, *args, **kwargs):
c380cc28
S
1567 """
1568 Save cookies to a file.
24146491 1569 Code is taken from CPython 3.6
1570 https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """
c380cc28 1571
c380cc28
S
1572 if filename is None:
1573 if self.filename is not None:
1574 filename = self.filename
1575 else:
ac668111 1576 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
c380cc28 1577
24146491 1578 # Store session cookies with `expires` set to 0 instead of an empty string
1bab3437
S
1579 for cookie in self:
1580 if cookie.expires is None:
1581 cookie.expires = 0
c380cc28 1582
d76fa1f3 1583 with self.open(filename, write=True) as f:
c380cc28 1584 f.write(self._HEADER)
24146491 1585 self._really_save(f, *args, **kwargs)
1bab3437
S
1586
1587 def load(self, filename=None, ignore_discard=False, ignore_expires=False):
e7e62441 1588 """Load cookies from a file."""
1589 if filename is None:
1590 if self.filename is not None:
1591 filename = self.filename
1592 else:
ac668111 1593 raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT)
e7e62441 1594
c380cc28
S
1595 def prepare_line(line):
1596 if line.startswith(self._HTTPONLY_PREFIX):
1597 line = line[len(self._HTTPONLY_PREFIX):]
1598 # comments and empty lines are fine
1599 if line.startswith('#') or not line.strip():
1600 return line
1601 cookie_list = line.split('\t')
1602 if len(cookie_list) != self._ENTRY_LEN:
ac668111 1603 raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list))
c380cc28
S
1604 cookie = self._CookieFileEntry(*cookie_list)
1605 if cookie.expires_at and not cookie.expires_at.isdigit():
ac668111 1606 raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
c380cc28
S
1607 return line
1608
e7e62441 1609 cf = io.StringIO()
d76fa1f3 1610 with self.open(filename) as f:
e7e62441 1611 for line in f:
c380cc28
S
1612 try:
1613 cf.write(prepare_line(line))
ac668111 1614 except http.cookiejar.LoadError as e:
94aa0644 1615 if f'{line.strip()} '[0] in '[{"':
ac668111 1616 raise http.cookiejar.LoadError(
94aa0644 1617 'Cookies file must be Netscape formatted, not JSON. See '
17ffed18 1618 'https://github.com/yt-dlp/yt-dlp/wiki/FAQ#how-do-i-pass-cookies-to-yt-dlp')
19a03940 1619 write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
c380cc28 1620 continue
e7e62441 1621 cf.seek(0)
1622 self._really_load(cf, filename, ignore_discard, ignore_expires)
1bab3437
S
1623 # Session cookies are denoted by either `expires` field set to
1624 # an empty string or 0. MozillaCookieJar only recognizes the former
1625 # (see [1]). So we need to force the latter to be recognized as session
1626 # cookies on our own.
1627 # Session cookies may be important for cookie-based authentication,
1628 # e.g. usually, when the user does not tick the 'Remember me' checkbox
1629 # while logging in on a site, some important cookies are stored as session
1630 # cookies, so failing to recognize them would result in a failed login.
1631 # 1. https://bugs.python.org/issue17164
1632 for cookie in self:
1633 # Treat `expires=0` cookies as session cookies
1634 if cookie.expires == 0:
1635 cookie.expires = None
1636 cookie.discard = True
1637
1638
ac668111 1639class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor):
a6420bf5 1640 def __init__(self, cookiejar=None):
ac668111 1641 urllib.request.HTTPCookieProcessor.__init__(self, cookiejar)
a6420bf5
S
1642
1643 def http_response(self, request, response):
ac668111 1644 return urllib.request.HTTPCookieProcessor.http_response(self, request, response)
a6420bf5 1645
ac668111 1646 https_request = urllib.request.HTTPCookieProcessor.http_request
a6420bf5
S
1647 https_response = http_response
1648
1649
ac668111 1650class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler):
201c1459 1651 """YoutubeDL redirect handler
1652
1653 The code is based on HTTPRedirectHandler implementation from CPython [1].
1654
1655 This redirect handler solves two issues:
1656 - ensures redirect URL is always unicode under python 2
1657 - introduces support for experimental HTTP response status code
1658 308 Permanent Redirect [2] used by some sites [3]
1659
1660 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1661 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1662 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1663 """
1664
ac668111 1665 http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302
201c1459 1666
1667 def redirect_request(self, req, fp, code, msg, headers, newurl):
1668 """Return a Request or None in response to a redirect.
1669
1670 This is called by the http_error_30x methods when a
1671 redirection response is received. If a redirection should
1672 take place, return a new Request to allow http_error_30x to
1673 perform the redirect. Otherwise, raise HTTPError if no-one
1674 else should try to handle this url. Return None if you can't
1675 but another Handler might.
1676 """
1677 m = req.get_method()
1678 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1679 or code in (301, 302, 303) and m == "POST")):
14f25df2 1680 raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp)
201c1459 1681 # Strictly (according to RFC 2616), 301 or 302 in response to
1682 # a POST MUST NOT cause a redirection without confirmation
1683 # from the user (of urllib.request, in this case). In practice,
1684 # essentially all clients do redirect in this case, so we do
1685 # the same.
1686
201c1459 1687 # Be conciliant with URIs containing a space. This is mainly
1688 # redundant with the more complete encoding done in http_error_302(),
1689 # but it is kept for compatibility with other callers.
1690 newurl = newurl.replace(' ', '%20')
1691
1692 CONTENT_HEADERS = ("content-length", "content-type")
1693 # The content headers refer to the request body, which is not resent on redirect
86e5f3ed 1694 newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
afac4caa 1695
1696 # A 303 must either use GET or HEAD for subsequent request
1697 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
1698 if code == 303 and m != 'HEAD':
1699 m = 'GET'
1700 # 301 and 302 redirects are commonly turned into a GET from a POST
1701 # for subsequent requests by browsers, so we'll do the same.
1702 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
1703 # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
1704 if code in (301, 302) and m == 'POST':
1705 m = 'GET'
1706
ac668111 1707 return urllib.request.Request(
201c1459 1708 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
afac4caa 1709 unverifiable=True, method=m)
fca6dba8
S
1710
1711
46f59e89
S
1712def extract_timezone(date_str):
1713 m = re.search(
f137e4c2 1714 r'''(?x)
1715 ^.{8,}? # >=8 char non-TZ prefix, if present
1716 (?P<tz>Z| # just the UTC Z, or
1717 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1718 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1719 [ ]? # optional space
1720 (?P<sign>\+|-) # +/-
1721 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1722 $)
1723 ''', date_str)
46f59e89 1724 if not m:
8f53dc44 1725 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1726 timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip())
1727 if timezone is not None:
1728 date_str = date_str[:-len(m.group('tz'))]
1729 timezone = datetime.timedelta(hours=timezone or 0)
46f59e89
S
1730 else:
1731 date_str = date_str[:-len(m.group('tz'))]
1732 if not m.group('sign'):
1733 timezone = datetime.timedelta()
1734 else:
1735 sign = 1 if m.group('sign') == '+' else -1
1736 timezone = datetime.timedelta(
1737 hours=sign * int(m.group('hours')),
1738 minutes=sign * int(m.group('minutes')))
1739 return timezone, date_str
1740
1741
08b38d54 1742def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1743 """ Return a UNIX timestamp from the given date """
1744
1745 if date_str is None:
1746 return None
1747
52c3a6e4
S
1748 date_str = re.sub(r'\.[0-9]+', '', date_str)
1749
08b38d54 1750 if timezone is None:
46f59e89
S
1751 timezone, date_str = extract_timezone(date_str)
1752
19a03940 1753 with contextlib.suppress(ValueError):
86e5f3ed 1754 date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
52c3a6e4
S
1755 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1756 return calendar.timegm(dt.timetuple())
912b38b4
PH
1757
1758
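# Editorial usage sketch (not part of the original module): parse_iso8601 above
# returns a UNIX timestamp; the values below were worked out by hand and are
# given as assumptions, not taken from the test suite.
#   parse_iso8601('2014-02-05T14:30:00+0100')           # -> 1391607000
#   parse_iso8601('2014-02-05T13:30:00Z')                # -> 1391607000 (same instant)
#   parse_iso8601('2014-02-05 13:30:00', delimiter=' ')  # -> 1391607000 (no TZ -> assumed UTC)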
46f59e89
S
1759def date_formats(day_first=True):
1760 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1761
1762
42bdd9d0 1763def unified_strdate(date_str, day_first=True):
bf50b038 1764 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1765
1766 if date_str is None:
1767 return None
bf50b038 1768 upload_date = None
5f6a1245 1769 # Replace commas
026fcc04 1770 date_str = date_str.replace(',', ' ')
42bdd9d0 1771 # Remove AM/PM + timezone
9bb8e0a3 1772 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1773 _, date_str = extract_timezone(date_str)
42bdd9d0 1774
46f59e89 1775 for expression in date_formats(day_first):
19a03940 1776 with contextlib.suppress(ValueError):
bf50b038 1777 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
42393ce2
PH
1778 if upload_date is None:
1779 timetuple = email.utils.parsedate_tz(date_str)
1780 if timetuple:
19a03940 1781 with contextlib.suppress(ValueError):
c6b9cf05 1782 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
6a750402 1783 if upload_date is not None:
14f25df2 1784 return str(upload_date)
bf50b038 1785
5f6a1245 1786
46f59e89
S
1787def unified_timestamp(date_str, day_first=True):
1788 if date_str is None:
1789 return None
1790
8f53dc44 1791 date_str = re.sub(r'\s+', ' ', re.sub(
1792 r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str))
46f59e89 1793
7dc2a74e 1794 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1795 timezone, date_str = extract_timezone(date_str)
1796
1797 # Remove AM/PM + timezone
1798 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1799
deef3195
S
1800 # Remove unrecognized timezones from ISO 8601 alike timestamps
1801 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1802 if m:
1803 date_str = date_str[:-len(m.group('tz'))]
1804
f226880c
PH
1805 # Python only supports microseconds, so remove nanoseconds
1806 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1807 if m:
1808 date_str = m.group(1)
1809
46f59e89 1810 for expression in date_formats(day_first):
19a03940 1811 with contextlib.suppress(ValueError):
7dc2a74e 1812 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89 1813 return calendar.timegm(dt.timetuple())
8f53dc44 1814
46f59e89
S
1815 timetuple = email.utils.parsedate_tz(date_str)
1816 if timetuple:
8f53dc44 1817 return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds()
46f59e89
S
1818
1819
28e614de 1820def determine_ext(url, default_ext='unknown_video'):
85750f89 1821 if url is None or '.' not in url:
f4776371 1822 return default_ext
9cb9a5df 1823 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1824 if re.match(r'^[A-Za-z0-9]+$', guess):
1825 return guess
a7aaa398
S
1826 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1827 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1828 return guess.rstrip('/')
73e79f2a 1829 else:
cbdbb766 1830 return default_ext
73e79f2a 1831
5f6a1245 1832
824fa511
S
1833def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1834 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
d4051a8e 1835
5f6a1245 1836
9e62f283 1837def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
3d38b2d6 1838 R"""
1839 Return a datetime object from a string.
1840 Supported format:
1841 (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)?
1842
1843 @param format strftime format of DATE
1844 @param precision Round the datetime object: auto|microsecond|second|minute|hour|day
1845 auto: round to the unit provided in date_str (if applicable).
9e62f283 1846 """
1847 auto_precision = False
1848 if precision == 'auto':
1849 auto_precision = True
1850 precision = 'microsecond'
396a76f7 1851 today = datetime_round(datetime.datetime.utcnow(), precision)
f8795e10 1852 if date_str in ('now', 'today'):
37254abc 1853 return today
f8795e10
PH
1854 if date_str == 'yesterday':
1855 return today - datetime.timedelta(days=1)
9e62f283 1856 match = re.match(
3d38b2d6 1857 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?',
9e62f283 1858 date_str)
37254abc 1859 if match is not None:
9e62f283 1860 start_time = datetime_from_str(match.group('start'), precision, format)
1861 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
37254abc 1862 unit = match.group('unit')
9e62f283 1863 if unit == 'month' or unit == 'year':
1864 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
37254abc 1865 unit = 'day'
9e62f283 1866 else:
1867 if unit == 'week':
1868 unit = 'day'
1869 time *= 7
1870 delta = datetime.timedelta(**{unit + 's': time})
1871 new_date = start_time + delta
1872 if auto_precision:
1873 return datetime_round(new_date, unit)
1874 return new_date
1875
1876 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1877
1878
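# Editorial usage sketch (not part of the original module): datetime_from_str
# accepts absolute dates in `format` as well as relative expressions. With the
# default precision='auto', the result is rounded to the unit of the offset.
#   datetime_from_str('now-1day')    # yesterday, rounded to day precision
#   datetime_from_str('now-2hours')  # two hours ago, rounded to the hour
#   datetime_from_str('20220315')    # absolute date parsed with the given format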
d49f8db3 1879def date_from_str(date_str, format='%Y%m%d', strict=False):
3d38b2d6 1880 R"""
1881 Return a date object from a string using datetime_from_str
9e62f283 1882
3d38b2d6 1883 @param strict Restrict allowed patterns to "YYYYMMDD" and
1884 (now|today|yesterday)(-\d+(day|week|month|year)s?)?
9e62f283 1885 """
3d38b2d6 1886 if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str):
1887 raise ValueError(f'Invalid date format "{date_str}"')
9e62f283 1888 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1889
1890
1891def datetime_add_months(dt, months):
1892 """Increment/Decrement a datetime object by months."""
1893 month = dt.month + months - 1
1894 year = dt.year + month // 12
1895 month = month % 12 + 1
1896 day = min(dt.day, calendar.monthrange(year, month)[1])
1897 return dt.replace(year, month, day)
1898
1899
1900def datetime_round(dt, precision='day'):
1901 """
1902 Round a datetime object's time to a specific precision
1903 """
1904 if precision == 'microsecond':
1905 return dt
1906
1907 unit_seconds = {
1908 'day': 86400,
1909 'hour': 3600,
1910 'minute': 60,
1911 'second': 1,
1912 }
1913 roundto = lambda x, n: ((x + n / 2) // n) * n
1914 timestamp = calendar.timegm(dt.timetuple())
1915 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
5f6a1245
JW
1916
1917
e63fc1be 1918def hyphenate_date(date_str):
1919 """
1920 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1921 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1922 if match is not None:
1923 return '-'.join(match.groups())
1924 else:
1925 return date_str
1926
5f6a1245 1927
86e5f3ed 1928class DateRange:
bd558525 1929 """Represents a time interval between two dates"""
5f6a1245 1930
bd558525
JMF
1931 def __init__(self, start=None, end=None):
1932 """start and end must be strings in the format accepted by date"""
1933 if start is not None:
d49f8db3 1934 self.start = date_from_str(start, strict=True)
bd558525
JMF
1935 else:
1936 self.start = datetime.datetime.min.date()
1937 if end is not None:
d49f8db3 1938 self.end = date_from_str(end, strict=True)
bd558525
JMF
1939 else:
1940 self.end = datetime.datetime.max.date()
37254abc 1941 if self.start > self.end:
bd558525 1942 raise ValueError('Date range: "%s", the start date must be before the end date' % self)
5f6a1245 1943
bd558525
JMF
1944 @classmethod
1945 def day(cls, day):
1946 """Returns a range that only contains the given day"""
5f6a1245
JW
1947 return cls(day, day)
1948
bd558525
JMF
1949 def __contains__(self, date):
1950 """Check if the date is in the range"""
37254abc
JMF
1951 if not isinstance(date, datetime.date):
1952 date = date_from_str(date)
1953 return self.start <= date <= self.end
5f6a1245 1954
bd558525 1955 def __str__(self):
86e5f3ed 1956 return f'{self.start.isoformat()} - {self.end.isoformat()}'
c496ca96 1957
f2df4071 1958 def __eq__(self, other):
1959 return (isinstance(other, DateRange)
1960 and self.start == other.start and self.end == other.end)
1961
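# Editorial usage sketch (not part of the original module; dates are arbitrary):
#   DateRange('20220101', '20220630')      # the first half of 2022, inclusive
#   '20220315' in DateRange('20220101')    # True - the end defaults to date.max
#   DateRange.day('20220101')              # a range containing a single day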
c496ca96
PH
1962
1963def platform_name():
14f25df2 1964 """ Returns the platform name as a str """
da4db748 1965 deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead')
b1f94422 1966 return platform.platform()
c496ca96 1967
b1f94422 1968
1969@functools.cache
1970def system_identifier():
1971 python_implementation = platform.python_implementation()
1972 if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'):
1973 python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3]
dab284f8 1974 libc_ver = []
1975 with contextlib.suppress(OSError): # We may not have access to the executable
1976 libc_ver = platform.libc_ver()
b1f94422 1977
1978 return 'Python %s (%s %s) - %s %s' % (
1979 platform.python_version(),
1980 python_implementation,
1981 platform.architecture()[0],
1982 platform.platform(),
dab284f8 1983 format_field(join_nonempty(*libc_ver, delim=' '), None, '(%s)'),
b1f94422 1984 )
c257baff
PH
1985
1986
0b9c08b4 1987@functools.cache
49fa4d9a 1988def get_windows_version():
8a82af35 1989 ''' Get Windows version. Returns () if not running on Windows '''
49fa4d9a
N
1990 if compat_os_name == 'nt':
1991 return version_tuple(platform.win32_ver()[1])
1992 else:
8a82af35 1993 return ()
49fa4d9a
N
1994
1995
734f90bb 1996def write_string(s, out=None, encoding=None):
19a03940 1997 assert isinstance(s, str)
1998 out = out or sys.stderr
7459e3a2 1999
fe1daad3 2000 if compat_os_name == 'nt' and supports_terminal_sequences(out):
3fe75fdc 2001 s = re.sub(r'([\r\n]+)', r' \1', s)
59f943cd 2002
8a82af35 2003 enc, buffer = None, out
cfb0511d 2004 if 'b' in getattr(out, 'mode', ''):
c487cf00 2005 enc = encoding or preferredencoding()
104aa738 2006 elif hasattr(out, 'buffer'):
8a82af35 2007 buffer = out.buffer
104aa738 2008 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
c487cf00 2009
8a82af35 2010 buffer.write(s.encode(enc, 'ignore') if enc else s)
7459e3a2
PH
2011 out.flush()
2012
2013
da4db748 2014def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs):
2015 from . import _IN_CLI
2016 if _IN_CLI:
2017 if msg in deprecation_warning._cache:
2018 return
2019 deprecation_warning._cache.add(msg)
2020 if printer:
2021 return printer(f'{msg}{bug_reports_message()}', **kwargs)
2022 return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs)
2023 else:
2024 import warnings
2025 warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3)
2026
2027
2028deprecation_warning._cache = set()
2029
2030
48ea9cea
PH
2031def bytes_to_intlist(bs):
2032 if not bs:
2033 return []
2034 if isinstance(bs[0], int): # Python 3
2035 return list(bs)
2036 else:
2037 return [ord(c) for c in bs]
2038
c257baff 2039
cba892fa 2040def intlist_to_bytes(xs):
2041 if not xs:
2042 return b''
ac668111 2043 return struct.pack('%dB' % len(xs), *xs)
c38b1e77
PH
2044
2045
8a82af35 2046class LockingUnsupportedError(OSError):
1890fc63 2047 msg = 'File locking is not supported'
0edb3e33 2048
2049 def __init__(self):
2050 super().__init__(self.msg)
2051
2052
c1c9a79c
PH
2053# Cross-platform file locking
2054if sys.platform == 'win32':
fe0918bb 2055 import ctypes
c1c9a79c
PH
2056 import ctypes.wintypes
2057 import msvcrt
2058
2059 class OVERLAPPED(ctypes.Structure):
2060 _fields_ = [
2061 ('Internal', ctypes.wintypes.LPVOID),
2062 ('InternalHigh', ctypes.wintypes.LPVOID),
2063 ('Offset', ctypes.wintypes.DWORD),
2064 ('OffsetHigh', ctypes.wintypes.DWORD),
2065 ('hEvent', ctypes.wintypes.HANDLE),
2066 ]
2067
2068 kernel32 = ctypes.windll.kernel32
2069 LockFileEx = kernel32.LockFileEx
2070 LockFileEx.argtypes = [
2071 ctypes.wintypes.HANDLE, # hFile
2072 ctypes.wintypes.DWORD, # dwFlags
2073 ctypes.wintypes.DWORD, # dwReserved
2074 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2075 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2076 ctypes.POINTER(OVERLAPPED) # Overlapped
2077 ]
2078 LockFileEx.restype = ctypes.wintypes.BOOL
2079 UnlockFileEx = kernel32.UnlockFileEx
2080 UnlockFileEx.argtypes = [
2081 ctypes.wintypes.HANDLE, # hFile
2082 ctypes.wintypes.DWORD, # dwReserved
2083 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2084 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2085 ctypes.POINTER(OVERLAPPED) # Overlapped
2086 ]
2087 UnlockFileEx.restype = ctypes.wintypes.BOOL
2088 whole_low = 0xffffffff
2089 whole_high = 0x7fffffff
2090
747c0bd1 2091 def _lock_file(f, exclusive, block):
c1c9a79c
PH
2092 overlapped = OVERLAPPED()
2093 overlapped.Offset = 0
2094 overlapped.OffsetHigh = 0
2095 overlapped.hEvent = 0
2096 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
747c0bd1 2097
2098 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2099 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2100 0, whole_low, whole_high, f._lock_file_overlapped_p):
2cb19820 2101 # NB: the no-argument form of "ctypes.FormatError" does not work on PyPy, hence the explicit GetLastError()
2102 raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}')
c1c9a79c
PH
2103
2104 def _unlock_file(f):
2105 assert f._lock_file_overlapped_p
2106 handle = msvcrt.get_osfhandle(f.fileno())
747c0bd1 2107 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
c1c9a79c
PH
2108 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2109
2110else:
399a76e6
YCH
2111 try:
2112 import fcntl
c1c9a79c 2113
a3125791 2114 def _lock_file(f, exclusive, block):
b63837bc 2115 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
2116 if not block:
2117 flags |= fcntl.LOCK_NB
acea8d7c 2118 try:
b63837bc 2119 fcntl.flock(f, flags)
acea8d7c
JK
2120 except BlockingIOError:
2121 raise
2122 except OSError: # AOSP does not have flock()
b63837bc 2123 fcntl.lockf(f, flags)
c1c9a79c 2124
399a76e6 2125 def _unlock_file(f):
acea8d7c
JK
2126 try:
2127 fcntl.flock(f, fcntl.LOCK_UN)
2128 except OSError:
2129 fcntl.lockf(f, fcntl.LOCK_UN)
a3125791 2130
399a76e6 2131 except ImportError:
399a76e6 2132
a3125791 2133 def _lock_file(f, exclusive, block):
0edb3e33 2134 raise LockingUnsupportedError()
399a76e6
YCH
2135
2136 def _unlock_file(f):
0edb3e33 2137 raise LockingUnsupportedError()
c1c9a79c
PH
2138
2139
86e5f3ed 2140class locked_file:
0edb3e33 2141 locked = False
747c0bd1 2142
a3125791 2143 def __init__(self, filename, mode, block=True, encoding=None):
fcfa8853
JK
2144 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2145 raise NotImplementedError(mode)
2146 self.mode, self.block = mode, block
2147
2148 writable = any(f in mode for f in 'wax+')
2149 readable = any(f in mode for f in 'r+')
2150 flags = functools.reduce(operator.ior, (
2151 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2152 getattr(os, 'O_BINARY', 0), # Windows only
2153 getattr(os, 'O_NOINHERIT', 0), # Windows only
2154 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2155 os.O_APPEND if 'a' in mode else 0,
2156 os.O_EXCL if 'x' in mode else 0,
2157 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2158 ))
2159
98804d03 2160 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
c1c9a79c
PH
2161
2162 def __enter__(self):
a3125791 2163 exclusive = 'r' not in self.mode
c1c9a79c 2164 try:
a3125791 2165 _lock_file(self.f, exclusive, self.block)
0edb3e33 2166 self.locked = True
86e5f3ed 2167 except OSError:
c1c9a79c
PH
2168 self.f.close()
2169 raise
fcfa8853 2170 if 'w' in self.mode:
131e14dc
JK
2171 try:
2172 self.f.truncate()
2173 except OSError as e:
1890fc63 2174 if e.errno not in (
2175 errno.ESPIPE, # Illegal seek - expected for FIFO
2176 errno.EINVAL, # Invalid argument - expected for /dev/null
2177 ):
2178 raise
c1c9a79c
PH
2179 return self
2180
0edb3e33 2181 def unlock(self):
2182 if not self.locked:
2183 return
c1c9a79c 2184 try:
0edb3e33 2185 _unlock_file(self.f)
c1c9a79c 2186 finally:
0edb3e33 2187 self.locked = False
c1c9a79c 2188
0edb3e33 2189 def __exit__(self, *_):
2190 try:
2191 self.unlock()
2192 finally:
2193 self.f.close()
4eb7f1d1 2194
0edb3e33 2195 open = __enter__
2196 close = __exit__
a3125791 2197
0edb3e33 2198 def __getattr__(self, attr):
2199 return getattr(self.f, attr)
a3125791 2200
0edb3e33 2201 def __iter__(self):
2202 return iter(self.f)
a3125791 2203
4eb7f1d1 2204
0b9c08b4 2205@functools.cache
4644ac55
S
2206def get_filesystem_encoding():
2207 encoding = sys.getfilesystemencoding()
2208 return encoding if encoding is not None else 'utf-8'
2209
2210
4eb7f1d1 2211def shell_quote(args):
a6a173c2 2212 quoted_args = []
4644ac55 2213 encoding = get_filesystem_encoding()
a6a173c2
JMF
2214 for a in args:
2215 if isinstance(a, bytes):
2216 # We may get a filename encoded with 'encodeFilename'
2217 a = a.decode(encoding)
aefce8e6 2218 quoted_args.append(compat_shlex_quote(a))
28e614de 2219 return ' '.join(quoted_args)
9d4660ca
PH
2220
2221
2222def smuggle_url(url, data):
2223 """ Pass additional data in a URL for internal use. """
2224
81953d1a
RA
2225 url, idata = unsmuggle_url(url, {})
2226 data.update(idata)
14f25df2 2227 sdata = urllib.parse.urlencode(
28e614de
PH
2228 {'__youtubedl_smuggle': json.dumps(data)})
2229 return url + '#' + sdata
9d4660ca
PH
2230
2231
79f82953 2232def unsmuggle_url(smug_url, default=None):
83e865a3 2233 if '#__youtubedl_smuggle' not in smug_url:
79f82953 2234 return smug_url, default
28e614de 2235 url, _, sdata = smug_url.rpartition('#')
14f25df2 2236 jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
2237 data = json.loads(jsond)
2238 return url, data
02dbf93f
PH
2239
2240
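# Editorial usage sketch (not part of the original module; URL and payload are
# made up). smuggle_url/unsmuggle_url round-trip extra data through the fragment:
#   smuggled = smuggle_url('https://example.com/video', {'referer': 'https://example.com/'})
#   # -> 'https://example.com/video#__youtubedl_smuggle=...'
#   unsmuggle_url(smuggled)
#   # -> ('https://example.com/video', {'referer': 'https://example.com/'})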
e0fd9573 2241def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2242 """ Formats numbers with decimal suffixes like K, M, etc """
2243 num, factor = float_or_none(num), float(factor)
4c3f8c3f 2244 if num is None or num < 0:
e0fd9573 2245 return None
eeb2a770 2246 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2247 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2248 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
abbeeebc 2249 if factor == 1024:
2250 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
e0fd9573 2251 converted = num / (factor ** exponent)
abbeeebc 2252 return fmt % (converted, suffix)
e0fd9573 2253
2254
02dbf93f 2255def format_bytes(bytes):
f02d24d8 2256 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
f53c966a 2257
1c088fa8 2258
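# Editorial usage sketch (not part of the original module); results follow from
# the definitions above:
#   format_decimal_suffix(1500, '%d%s')                   # -> '1k'
#   format_decimal_suffix(1536, '%.2f%sB', factor=1024)   # -> '1.50KiB'
#   format_bytes(1536)                                     # -> '1.50KiB'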
fb47597b
S
2259def lookup_unit_table(unit_table, s):
2260 units_re = '|'.join(re.escape(u) for u in unit_table)
2261 m = re.match(
782b1b5b 2262 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
fb47597b
S
2263 if not m:
2264 return None
2265 num_str = m.group('num').replace(',', '.')
2266 mult = unit_table[m.group('unit')]
2267 return int(float(num_str) * mult)
2268
2269
be64b5b0
PH
2270def parse_filesize(s):
2271 if s is None:
2272 return None
2273
dfb1b146 2274 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
2275 # but we support those too
2276 _UNIT_TABLE = {
2277 'B': 1,
2278 'b': 1,
70852b47 2279 'bytes': 1,
be64b5b0
PH
2280 'KiB': 1024,
2281 'KB': 1000,
2282 'kB': 1024,
2283 'Kb': 1000,
13585d76 2284 'kb': 1000,
70852b47
YCH
2285 'kilobytes': 1000,
2286 'kibibytes': 1024,
be64b5b0
PH
2287 'MiB': 1024 ** 2,
2288 'MB': 1000 ** 2,
2289 'mB': 1024 ** 2,
2290 'Mb': 1000 ** 2,
13585d76 2291 'mb': 1000 ** 2,
70852b47
YCH
2292 'megabytes': 1000 ** 2,
2293 'mebibytes': 1024 ** 2,
be64b5b0
PH
2294 'GiB': 1024 ** 3,
2295 'GB': 1000 ** 3,
2296 'gB': 1024 ** 3,
2297 'Gb': 1000 ** 3,
13585d76 2298 'gb': 1000 ** 3,
70852b47
YCH
2299 'gigabytes': 1000 ** 3,
2300 'gibibytes': 1024 ** 3,
be64b5b0
PH
2301 'TiB': 1024 ** 4,
2302 'TB': 1000 ** 4,
2303 'tB': 1024 ** 4,
2304 'Tb': 1000 ** 4,
13585d76 2305 'tb': 1000 ** 4,
70852b47
YCH
2306 'terabytes': 1000 ** 4,
2307 'tebibytes': 1024 ** 4,
be64b5b0
PH
2308 'PiB': 1024 ** 5,
2309 'PB': 1000 ** 5,
2310 'pB': 1024 ** 5,
2311 'Pb': 1000 ** 5,
13585d76 2312 'pb': 1000 ** 5,
70852b47
YCH
2313 'petabytes': 1000 ** 5,
2314 'pebibytes': 1024 ** 5,
be64b5b0
PH
2315 'EiB': 1024 ** 6,
2316 'EB': 1000 ** 6,
2317 'eB': 1024 ** 6,
2318 'Eb': 1000 ** 6,
13585d76 2319 'eb': 1000 ** 6,
70852b47
YCH
2320 'exabytes': 1000 ** 6,
2321 'exbibytes': 1024 ** 6,
be64b5b0
PH
2322 'ZiB': 1024 ** 7,
2323 'ZB': 1000 ** 7,
2324 'zB': 1024 ** 7,
2325 'Zb': 1000 ** 7,
13585d76 2326 'zb': 1000 ** 7,
70852b47
YCH
2327 'zettabytes': 1000 ** 7,
2328 'zebibytes': 1024 ** 7,
be64b5b0
PH
2329 'YiB': 1024 ** 8,
2330 'YB': 1000 ** 8,
2331 'yB': 1024 ** 8,
2332 'Yb': 1000 ** 8,
13585d76 2333 'yb': 1000 ** 8,
70852b47
YCH
2334 'yottabytes': 1000 ** 8,
2335 'yobibytes': 1024 ** 8,
be64b5b0
PH
2336 }
2337
fb47597b
S
2338 return lookup_unit_table(_UNIT_TABLE, s)
2339
2340
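# Editorial usage sketch (not part of the original module):
#   parse_filesize('1.5GB')    # -> 1500000000 (decimal unit)
#   parse_filesize('1.5GiB')   # -> 1610612736 (binary unit)
#   parse_filesize('500 kB')   # -> 512000 (lower-case 'kB' is mapped to 1024 above)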
2341def parse_count(s):
2342 if s is None:
be64b5b0
PH
2343 return None
2344
352d5da8 2345 s = re.sub(r'^[^\d]+\s', '', s).strip()
fb47597b
S
2346
2347 if re.match(r'^[\d,.]+$', s):
2348 return str_to_int(s)
2349
2350 _UNIT_TABLE = {
2351 'k': 1000,
2352 'K': 1000,
2353 'm': 1000 ** 2,
2354 'M': 1000 ** 2,
2355 'kk': 1000 ** 2,
2356 'KK': 1000 ** 2,
352d5da8 2357 'b': 1000 ** 3,
2358 'B': 1000 ** 3,
fb47597b 2359 }
be64b5b0 2360
352d5da8 2361 ret = lookup_unit_table(_UNIT_TABLE, s)
2362 if ret is not None:
2363 return ret
2364
2365 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2366 if mobj:
2367 return str_to_int(mobj.group(1))
be64b5b0 2368
2f7ae819 2369
5d45484c 2370def parse_resolution(s, *, lenient=False):
b871d7e9
S
2371 if s is None:
2372 return {}
2373
5d45484c
LNO
2374 if lenient:
2375 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2376 else:
2377 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
b871d7e9
S
2378 if mobj:
2379 return {
2380 'width': int(mobj.group('w')),
2381 'height': int(mobj.group('h')),
2382 }
2383
17ec8bcf 2384 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
b871d7e9
S
2385 if mobj:
2386 return {'height': int(mobj.group(1))}
2387
2388 mobj = re.search(r'\b([48])[kK]\b', s)
2389 if mobj:
2390 return {'height': int(mobj.group(1)) * 540}
2391
2392 return {}
2393
2394
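# Editorial usage sketch (not part of the original module):
#   parse_count('1.5M views')      # -> 1500000
#   parse_count('1,000')           # -> 1000
#   parse_resolution('1920x1080')  # -> {'width': 1920, 'height': 1080}
#   parse_resolution('720p')       # -> {'height': 720}
#   parse_resolution('4k')         # -> {'height': 2160}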
0dc41787 2395def parse_bitrate(s):
14f25df2 2396 if not isinstance(s, str):
0dc41787
S
2397 return
2398 mobj = re.search(r'\b(\d+)\s*kbps', s)
2399 if mobj:
2400 return int(mobj.group(1))
2401
2402
a942d6cb 2403def month_by_name(name, lang='en'):
caefb1de
PH
2404 """ Return the number of a month by (locale-independently) English name """
2405
f6717dec 2406 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 2407
caefb1de 2408 try:
f6717dec 2409 return month_names.index(name) + 1
7105440c
YCH
2410 except ValueError:
2411 return None
2412
2413
2414def month_by_abbreviation(abbrev):
2415 """ Return the number of a month by (locale-independently) English
2416 abbreviations """
2417
2418 try:
2419 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
2420 except ValueError:
2421 return None
18258362
JMF
2422
2423
5aafe895 2424def fix_xml_ampersands(xml_str):
18258362 2425 """Replace all '&' with '&amp;' in XML"""
5aafe895
PH
2426 return re.sub(
2427 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 2428 '&amp;',
5aafe895 2429 xml_str)
e3946f98
PH
2430
2431
2432def setproctitle(title):
14f25df2 2433 assert isinstance(title, str)
c1c05c67 2434
fe0918bb 2435 # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4541
2436 try:
2437 import ctypes
2438 except ImportError:
c1c05c67
YCH
2439 return
2440
e3946f98 2441 try:
611c1dd9 2442 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
2443 except OSError:
2444 return
2f49bcd6
RC
2445 except TypeError:
2446 # LoadLibrary in Windows Python 2.7.13 only expects
2447 # a bytestring, but since unicode_literals turns
2448 # every string into a unicode string, it fails.
2449 return
0f06bcd7 2450 title_bytes = title.encode()
6eefe533
PH
2451 buf = ctypes.create_string_buffer(len(title_bytes))
2452 buf.value = title_bytes
e3946f98 2453 try:
6eefe533 2454 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
2455 except AttributeError:
2456 return # Strange libc, just skip this
d7dda168
PH
2457
2458
2459def remove_start(s, start):
46bc9b7d 2460 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
2461
2462
2b9faf55 2463def remove_end(s, end):
46bc9b7d 2464 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
2465
2466
31b2051e
S
2467def remove_quotes(s):
2468 if s is None or len(s) < 2:
2469 return s
2470 for quote in ('"', "'", ):
2471 if s[0] == quote and s[-1] == quote:
2472 return s[1:-1]
2473 return s
2474
2475
b6e0c7d2 2476def get_domain(url):
ebf99aaf 2477 """
2478 This implementation is inconsistent, but is kept for compatibility.
2479 Use this only for "webpage_url_domain"
2480 """
2481 return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None
b6e0c7d2
U
2482
2483
29eb5174 2484def url_basename(url):
14f25df2 2485 path = urllib.parse.urlparse(url).path
28e614de 2486 return path.strip('/').split('/')[-1]
aa94a6d3
PH
2487
2488
02dc0a36 2489def base_url(url):
7657ec7e 2490 return re.match(r'https?://[^?#]+/', url).group()
02dc0a36
S
2491
2492
e34c3361 2493def urljoin(base, path):
4b5de77b 2494 if isinstance(path, bytes):
0f06bcd7 2495 path = path.decode()
14f25df2 2496 if not isinstance(path, str) or not path:
e34c3361 2497 return None
fad4ceb5 2498 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
e34c3361 2499 return path
4b5de77b 2500 if isinstance(base, bytes):
0f06bcd7 2501 base = base.decode()
14f25df2 2502 if not isinstance(base, str) or not re.match(
4b5de77b 2503 r'^(?:https?:)?//', base):
e34c3361 2504 return None
14f25df2 2505 return urllib.parse.urljoin(base, path)
e34c3361
S
2506
2507
ac668111 2508class HEADRequest(urllib.request.Request):
aa94a6d3 2509 def get_method(self):
611c1dd9 2510 return 'HEAD'
7217e148
PH
2511
2512
ac668111 2513class PUTRequest(urllib.request.Request):
95cf60e8
S
2514 def get_method(self):
2515 return 'PUT'
2516
2517
9732d77e 2518def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
9e907ebd 2519 if get_attr and v is not None:
2520 v = getattr(v, get_attr, None)
1812afb7
S
2521 try:
2522 return int(v) * invscale // scale
31c49255 2523 except (ValueError, TypeError, OverflowError):
af98f8ff 2524 return default
9732d77e 2525
9572013d 2526
40a90862 2527def str_or_none(v, default=None):
14f25df2 2528 return default if v is None else str(v)
40a90862 2529
9732d77e
PH
2530
2531def str_to_int(int_str):
48d4681e 2532 """ A more relaxed version of int_or_none """
f9934b96 2533 if isinstance(int_str, int):
348c6bf1 2534 return int_str
14f25df2 2535 elif isinstance(int_str, str):
42db58ec
S
2536 int_str = re.sub(r'[,\.\+]', '', int_str)
2537 return int_or_none(int_str)
608d11f5
PH
2538
2539
9732d77e 2540def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
2541 if v is None:
2542 return default
2543 try:
2544 return float(v) * invscale / scale
5e1271c5 2545 except (ValueError, TypeError):
caf80631 2546 return default
43f775e4
PH
2547
2548
c7e327c4
S
2549def bool_or_none(v, default=None):
2550 return v if isinstance(v, bool) else default
2551
2552
53cd37ba 2553def strip_or_none(v, default=None):
14f25df2 2554 return v.strip() if isinstance(v, str) else default
b72b4431
S
2555
2556
af03000a 2557def url_or_none(url):
14f25df2 2558 if not url or not isinstance(url, str):
af03000a
S
2559 return None
2560 url = url.strip()
29f7c58a 2561 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
2562
2563
3e9b66d7 2564def request_to_url(req):
ac668111 2565 if isinstance(req, urllib.request.Request):
3e9b66d7
LNO
2566 return req.get_full_url()
2567 else:
2568 return req
2569
2570
e29663c6 2571def strftime_or_none(timestamp, date_format, default=None):
2572 datetime_object = None
2573 try:
f9934b96 2574 if isinstance(timestamp, (int, float)): # unix timestamp
e29663c6 2575 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
14f25df2 2576 elif isinstance(timestamp, str): # assume YYYYMMDD
e29663c6 2577 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
9665f15a 2578 date_format = re.sub( # Support %s on windows
2579 r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format)
e29663c6 2580 return datetime_object.strftime(date_format)
2581 except (ValueError, TypeError, AttributeError):
2582 return default
2583
2584
608d11f5 2585def parse_duration(s):
f9934b96 2586 if not isinstance(s, str):
608d11f5 2587 return None
ca7b3246 2588 s = s.strip()
38d79fd1 2589 if not s:
2590 return None
ca7b3246 2591
acaff495 2592 days, hours, mins, secs, ms = [None] * 5
8bd1c00b 2593 m = re.match(r'''(?x)
2594 (?P<before_secs>
2595 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2596 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2597 (?P<ms>[.:][0-9]+)?Z?$
2598 ''', s)
acaff495 2599 if m:
8bd1c00b 2600 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
acaff495 2601 else:
2602 m = re.match(
056653bb
S
2603 r'''(?ix)(?:P?
2604 (?:
1c1b2f96 2605 [0-9]+\s*y(?:ears?)?,?\s*
056653bb
S
2606 )?
2607 (?:
1c1b2f96 2608 [0-9]+\s*m(?:onths?)?,?\s*
056653bb
S
2609 )?
2610 (?:
1c1b2f96 2611 [0-9]+\s*w(?:eeks?)?,?\s*
056653bb 2612 )?
8f4b58d7 2613 (?:
1c1b2f96 2614 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
8f4b58d7 2615 )?
056653bb 2616 T)?
acaff495 2617 (?:
1c1b2f96 2618 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
acaff495 2619 )?
2620 (?:
1c1b2f96 2621 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
acaff495 2622 )?
2623 (?:
2624 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2625 )?Z?$''', s)
acaff495 2626 if m:
2627 days, hours, mins, secs, ms = m.groups()
2628 else:
15846398 2629 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2630 if m:
2631 hours, mins = m.groups()
2632 else:
2633 return None
2634
acaff495 2635 if ms:
19a03940 2636 ms = ms.replace(':', '.')
2637 return sum(float(part or 0) * mult for part, mult in (
2638 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
91d7d0b3
JMF
2639
2640
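# Editorial usage sketch (not part of the original module); durations are
# returned as seconds (float):
#   parse_duration('1:23:45')       # -> 5025.0
#   parse_duration('PT1H30M')       # -> 5400.0 (ISO 8601 style)
#   parse_duration('3 min 15 sec')  # -> 195.0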
e65e4c88 2641def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 2642 name, real_ext = os.path.splitext(filename)
e65e4c88 2643 return (
86e5f3ed 2644 f'{name}.{ext}{real_ext}'
e65e4c88 2645 if not expected_real_ext or real_ext[1:] == expected_real_ext
86e5f3ed 2646 else f'{filename}.{ext}')
d70ad093
PH
2647
2648
b3ed15b7
S
2649def replace_extension(filename, ext, expected_real_ext=None):
2650 name, real_ext = os.path.splitext(filename)
86e5f3ed 2651 return '{}.{}'.format(
b3ed15b7
S
2652 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2653 ext)
2654
2655
d70ad093
PH
2656def check_executable(exe, args=[]):
2657 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2658 args can be a list of arguments for a short output (like -version) """
2659 try:
f0c9fb96 2660 Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
d70ad093
PH
2661 except OSError:
2662 return False
2663 return exe
b7ab0590
PH
2664
2665
8a7f68d0 2666def _get_exe_version_output(exe, args, *, to_screen=None):
2667 if to_screen:
2668 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
95807118 2669 try:
b64d04c1 2670 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
7a5c1cfe 2671 # SIGTTOU if yt-dlp is run in the background.
067aa17e 2672 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
f0c9fb96 2673 stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True,
2674 stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
95807118
PH
2675 except OSError:
2676 return False
f0c9fb96 2677 return stdout
cae97f65
PH
2678
2679
2680def detect_exe_version(output, version_re=None, unrecognized='present'):
14f25df2 2681 assert isinstance(output, str)
cae97f65
PH
2682 if version_re is None:
2683 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2684 m = re.search(version_re, output)
95807118
PH
2685 if m:
2686 return m.group(1)
2687 else:
2688 return unrecognized
2689
2690
9af98e17 2691def get_exe_version(exe, args=['--version'],
2692 version_re=None, unrecognized='present'):
2693 """ Returns the version of the specified executable,
2694 or False if the executable is not present """
2695 out = _get_exe_version_output(exe, args)
2696 return detect_exe_version(out, version_re, unrecognized) if out else False
2697
2698
7e88d7d7 2699def frange(start=0, stop=None, step=1):
2700 """Float range"""
2701 if stop is None:
2702 start, stop = 0, start
2703 sign = [-1, 1][step > 0] if step else 0
2704 while sign * start < sign * stop:
2705 yield start
2706 start += step
2707
2708
cb89cfc1 2709class LazyList(collections.abc.Sequence):
0f06bcd7 2710 """Lazy immutable list from an iterable
2711 Note that slices of a LazyList are lists and not LazyList"""
483336e7 2712
8e5fecc8 2713 class IndexError(IndexError):
2714 pass
2715
282f5709 2716 def __init__(self, iterable, *, reverse=False, _cache=None):
0f06bcd7 2717 self._iterable = iter(iterable)
2718 self._cache = [] if _cache is None else _cache
2719 self._reversed = reverse
483336e7 2720
2721 def __iter__(self):
0f06bcd7 2722 if self._reversed:
28419ca2 2723 # We need to consume the entire iterable to iterate in reverse
981052c9 2724 yield from self.exhaust()
28419ca2 2725 return
0f06bcd7 2726 yield from self._cache
2727 for item in self._iterable:
2728 self._cache.append(item)
483336e7 2729 yield item
2730
0f06bcd7 2731 def _exhaust(self):
2732 self._cache.extend(self._iterable)
2733 self._iterable = [] # Discard the emptied iterable to make it pickle-able
2734 return self._cache
28419ca2 2735
981052c9 2736 def exhaust(self):
0f06bcd7 2737 """Evaluate the entire iterable"""
2738 return self._exhaust()[::-1 if self._reversed else 1]
981052c9 2739
28419ca2 2740 @staticmethod
0f06bcd7 2741 def _reverse_index(x):
f2df4071 2742 return None if x is None else ~x
483336e7 2743
2744 def __getitem__(self, idx):
2745 if isinstance(idx, slice):
0f06bcd7 2746 if self._reversed:
2747 idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1))
e0f2b4b4 2748 start, stop, step = idx.start, idx.stop, idx.step or 1
483336e7 2749 elif isinstance(idx, int):
0f06bcd7 2750 if self._reversed:
2751 idx = self._reverse_index(idx)
e0f2b4b4 2752 start, stop, step = idx, idx, 0
483336e7 2753 else:
2754 raise TypeError('indices must be integers or slices')
e0f2b4b4 2755 if ((start or 0) < 0 or (stop or 0) < 0
2756 or (start is None and step < 0)
2757 or (stop is None and step > 0)):
483336e7 2758 # We need to consume the entire iterable to be able to slice from the end
2759 # Obviously, never use this with infinite iterables
0f06bcd7 2760 self._exhaust()
8e5fecc8 2761 try:
0f06bcd7 2762 return self._cache[idx]
8e5fecc8 2763 except IndexError as e:
2764 raise self.IndexError(e) from e
0f06bcd7 2765 n = max(start or 0, stop or 0) - len(self._cache) + 1
28419ca2 2766 if n > 0:
0f06bcd7 2767 self._cache.extend(itertools.islice(self._iterable, n))
8e5fecc8 2768 try:
0f06bcd7 2769 return self._cache[idx]
8e5fecc8 2770 except IndexError as e:
2771 raise self.IndexError(e) from e
483336e7 2772
2773 def __bool__(self):
2774 try:
0f06bcd7 2775 self[-1] if self._reversed else self[0]
8e5fecc8 2776 except self.IndexError:
483336e7 2777 return False
2778 return True
2779
2780 def __len__(self):
0f06bcd7 2781 self._exhaust()
2782 return len(self._cache)
483336e7 2783
282f5709 2784 def __reversed__(self):
0f06bcd7 2785 return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache)
282f5709 2786
2787 def __copy__(self):
0f06bcd7 2788 return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache)
282f5709 2789
28419ca2 2790 def __repr__(self):
2791 # repr and str should mimic a list. So we exhaust the iterable
2792 return repr(self.exhaust())
2793
2794 def __str__(self):
2795 return repr(self.exhaust())
2796
483336e7 2797
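# Editorial usage sketch (not part of the original module). LazyList only pulls
# items from the underlying iterator as far as the requested index:
#   lazy = LazyList(itertools.count())  # an infinite iterator is fine
#   lazy[5]    # -> 5; items 0..5 are now cached
#   lazy[:3]   # -> [0, 1, 2]; note that slices are plain lists
#   len(lazy)  # would exhaust the iterable - avoid on infinite input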
7be9ccff 2798class PagedList:
c07a39ae 2799
2800 class IndexError(IndexError):
2801 pass
2802
dd26ced1
PH
2803 def __len__(self):
2804 # This is only useful for tests
2805 return len(self.getslice())
2806
7be9ccff 2807 def __init__(self, pagefunc, pagesize, use_cache=True):
2808 self._pagefunc = pagefunc
2809 self._pagesize = pagesize
f1d13090 2810 self._pagecount = float('inf')
7be9ccff 2811 self._use_cache = use_cache
2812 self._cache = {}
2813
2814 def getpage(self, pagenum):
d8cf8d97 2815 page_results = self._cache.get(pagenum)
2816 if page_results is None:
f1d13090 2817 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
7be9ccff 2818 if self._use_cache:
2819 self._cache[pagenum] = page_results
2820 return page_results
2821
2822 def getslice(self, start=0, end=None):
2823 return list(self._getslice(start, end))
2824
2825 def _getslice(self, start, end):
55575225 2826 raise NotImplementedError('This method must be implemented by subclasses')
2827
2828 def __getitem__(self, idx):
f1d13090 2829 assert self._use_cache, 'Indexing PagedList requires cache'
55575225 2830 if not isinstance(idx, int) or idx < 0:
2831 raise TypeError('indices must be non-negative integers')
2832 entries = self.getslice(idx, idx + 1)
d8cf8d97 2833 if not entries:
c07a39ae 2834 raise self.IndexError()
d8cf8d97 2835 return entries[0]
55575225 2836
9c44d242
PH
2837
2838class OnDemandPagedList(PagedList):
a44ca5a4 2839 """Download pages until a page with fewer than the maximum number of results"""
86e5f3ed 2840
7be9ccff 2841 def _getslice(self, start, end):
b7ab0590
PH
2842 for pagenum in itertools.count(start // self._pagesize):
2843 firstid = pagenum * self._pagesize
2844 nextfirstid = pagenum * self._pagesize + self._pagesize
2845 if start >= nextfirstid:
2846 continue
2847
b7ab0590
PH
2848 startv = (
2849 start % self._pagesize
2850 if firstid <= start < nextfirstid
2851 else 0)
b7ab0590
PH
2852 endv = (
2853 ((end - 1) % self._pagesize) + 1
2854 if (end is not None and firstid <= end <= nextfirstid)
2855 else None)
2856
f1d13090 2857 try:
2858 page_results = self.getpage(pagenum)
2859 except Exception:
2860 self._pagecount = pagenum - 1
2861 raise
b7ab0590
PH
2862 if startv != 0 or endv is not None:
2863 page_results = page_results[startv:endv]
7be9ccff 2864 yield from page_results
b7ab0590
PH
2865
2866 # A little optimization - if the current page is not "full", i.e. does
2867 # not contain page_size videos, then we can assume that this page
2868 # is the last one - there are no more ids on further pages -
2869 # so there is no need to query again.
2870 if len(page_results) + startv < self._pagesize:
2871 break
2872
2873 # If we got the whole page, but the next page is not interesting,
2874 # break out early as well
2875 if end == nextfirstid:
2876 break
81c2f20b
PH
2877
2878
9c44d242 2879class InAdvancePagedList(PagedList):
a44ca5a4 2880 """PagedList with total number of pages known in advance"""
86e5f3ed 2881
9c44d242 2882 def __init__(self, pagefunc, pagecount, pagesize):
7be9ccff 2883 PagedList.__init__(self, pagefunc, pagesize, True)
f1d13090 2884 self._pagecount = pagecount
9c44d242 2885
7be9ccff 2886 def _getslice(self, start, end):
9c44d242 2887 start_page = start // self._pagesize
d37707bd 2888 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
9c44d242
PH
2889 skip_elems = start - start_page * self._pagesize
2890 only_more = None if end is None else end - start
2891 for pagenum in range(start_page, end_page):
7be9ccff 2892 page_results = self.getpage(pagenum)
9c44d242 2893 if skip_elems:
7be9ccff 2894 page_results = page_results[skip_elems:]
9c44d242
PH
2895 skip_elems = None
2896 if only_more is not None:
7be9ccff 2897 if len(page_results) < only_more:
2898 only_more -= len(page_results)
9c44d242 2899 else:
7be9ccff 2900 yield from page_results[:only_more]
9c44d242 2901 break
7be9ccff 2902 yield from page_results
9c44d242
PH
2903
2904
7e88d7d7 2905class PlaylistEntries:
2906 MissingEntry = object()
2907 is_exhausted = False
2908
2909 def __init__(self, ydl, info_dict):
7e9a6125 2910 self.ydl = ydl
2911
2912 # _entries must be assigned now since infodict can change during iteration
2913 entries = info_dict.get('entries')
2914 if entries is None:
2915 raise EntryNotInPlaylist('There are no entries')
2916 elif isinstance(entries, list):
2917 self.is_exhausted = True
2918
2919 requested_entries = info_dict.get('requested_entries')
2920 self.is_incomplete = bool(requested_entries)
2921 if self.is_incomplete:
2922 assert self.is_exhausted
2923 self._entries = [self.MissingEntry] * max(requested_entries)
2924 for i, entry in zip(requested_entries, entries):
2925 self._entries[i - 1] = entry
2926 elif isinstance(entries, (list, PagedList, LazyList)):
2927 self._entries = entries
2928 else:
2929 self._entries = LazyList(entries)
7e88d7d7 2930
2931 PLAYLIST_ITEMS_RE = re.compile(r'''(?x)
2932 (?P<start>[+-]?\d+)?
2933 (?P<range>[:-]
2934 (?P<end>[+-]?\d+|inf(?:inite)?)?
2935 (?::(?P<step>[+-]?\d+))?
2936 )?''')
2937
2938 @classmethod
2939 def parse_playlist_items(cls, string):
2940 for segment in string.split(','):
2941 if not segment:
2942 raise ValueError('There are two or more consecutive commas')
2943 mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
2944 if not mobj:
2945 raise ValueError(f'{segment!r} is not a valid specification')
2946 start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
2947 if int_or_none(step) == 0:
2948 raise ValueError(f'Step in {segment!r} cannot be zero')
2949 yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
2950
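    # Editorial note (not part of the original module): each comma-separated
    # segment of a --playlist-items value becomes either an int or a slice, e.g.
    #   '1,3,5'  -> 1, 3, 5
    #   '2:10'   -> slice(2, 10.0, None)
    #   '::2'    -> slice(None, None, 2)
    #   '-5:'    -> slice(-5, None, None)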
2951 def get_requested_items(self):
2952 playlist_items = self.ydl.params.get('playlist_items')
2953 playlist_start = self.ydl.params.get('playliststart', 1)
2954 playlist_end = self.ydl.params.get('playlistend')
2955 # For backwards compatibility, interpret -1 as whole list
2956 if playlist_end in (-1, None):
2957 playlist_end = ''
2958 if not playlist_items:
2959 playlist_items = f'{playlist_start}:{playlist_end}'
2960 elif playlist_start != 1 or playlist_end:
2961 self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
2962
2963 for index in self.parse_playlist_items(playlist_items):
2964 for i, entry in self[index]:
2965 yield i, entry
1ac4fd80 2966 if not entry:
2967 continue
7e88d7d7 2968 try:
2969 # TODO: Add auto-generated fields
2970 self.ydl._match_entry(entry, incomplete=True, silent=True)
2971 except (ExistingVideoReached, RejectedVideoReached):
2972 return
2973
7e9a6125 2974 def get_full_count(self):
2975 if self.is_exhausted and not self.is_incomplete:
7e88d7d7 2976 return len(self)
2977 elif isinstance(self._entries, InAdvancePagedList):
2978 if self._entries._pagesize == 1:
2979 return self._entries._pagecount
2980
7e88d7d7 2981 @functools.cached_property
2982 def _getter(self):
2983 if isinstance(self._entries, list):
2984 def get_entry(i):
2985 try:
2986 entry = self._entries[i]
2987 except IndexError:
2988 entry = self.MissingEntry
2989 if not self.is_incomplete:
2990 raise self.IndexError()
2991 if entry is self.MissingEntry:
2992 raise EntryNotInPlaylist(f'Entry {i} cannot be found')
2993 return entry
2994 else:
2995 def get_entry(i):
2996 try:
2997 return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
2998 except (LazyList.IndexError, PagedList.IndexError):
2999 raise self.IndexError()
3000 return get_entry
3001
3002 def __getitem__(self, idx):
3003 if isinstance(idx, int):
3004 idx = slice(idx, idx)
3005
3006 # NB: PlaylistEntries[1:10] => (0, 1, ... 9)
3007 step = 1 if idx.step is None else idx.step
3008 if idx.start is None:
3009 start = 0 if step > 0 else len(self) - 1
3010 else:
3011 start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start
3012
3013 # NB: Do not call len(self) when idx == [:]
3014 if idx.stop is None:
3015 stop = 0 if step < 0 else float('inf')
3016 else:
3017 stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop
3018 stop += [-1, 1][step > 0]
3019
3020 for i in frange(start, stop, step):
3021 if i < 0:
3022 continue
3023 try:
7e9a6125 3024 entry = self._getter(i)
3025 except self.IndexError:
3026 self.is_exhausted = True
3027 if step > 0:
7e88d7d7 3028 break
7e9a6125 3029 continue
7e88d7d7 3030 yield i + 1, entry
3031
3032 def __len__(self):
3033 return len(tuple(self[:]))
3034
3035 class IndexError(IndexError):
3036 pass
3037
3038
81c2f20b 3039def uppercase_escape(s):
676eb3f2 3040 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 3041 return re.sub(
a612753d 3042 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
3043 lambda m: unicode_escape(m.group(0))[0],
3044 s)
0fe2ff78
YCH
3045
3046
3047def lowercase_escape(s):
3048 unicode_escape = codecs.getdecoder('unicode_escape')
3049 return re.sub(
3050 r'\\u[0-9a-fA-F]{4}',
3051 lambda m: unicode_escape(m.group(0))[0],
3052 s)
b53466e1 3053
d05cfe06
S
3054
3055def escape_rfc3986(s):
3056 """Escape non-ASCII characters as suggested by RFC 3986"""
f9934b96 3057 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
3058
3059
3060def escape_url(url):
3061 """Escape URL as suggested by RFC 3986"""
14f25df2 3062 url_parsed = urllib.parse.urlparse(url)
d05cfe06 3063 return url_parsed._replace(
efbed08d 3064 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
3065 path=escape_rfc3986(url_parsed.path),
3066 params=escape_rfc3986(url_parsed.params),
3067 query=escape_rfc3986(url_parsed.query),
3068 fragment=escape_rfc3986(url_parsed.fragment)
3069 ).geturl()
3070
62e609ab 3071
4dfbf869 3072def parse_qs(url):
14f25df2 3073 return urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
4dfbf869 3074
3075
62e609ab
PH
3076def read_batch_urls(batch_fd):
3077 def fixup(url):
14f25df2 3078 if not isinstance(url, str):
62e609ab 3079 url = url.decode('utf-8', 'replace')
8c04f0be 3080 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
3081 for bom in BOM_UTF8:
3082 if url.startswith(bom):
3083 url = url[len(bom):]
3084 url = url.lstrip()
3085 if not url or url.startswith(('#', ';', ']')):
62e609ab 3086 return False
8c04f0be 3087 # "#" cannot be stripped out since it is part of the URI
962ffcf8 3088 # However, it can be safely stripped out if it follows a whitespace
8c04f0be 3089 return re.split(r'\s#', url, 1)[0].rstrip()
62e609ab
PH
3090
3091 with contextlib.closing(batch_fd) as fd:
3092 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
3093
3094
3095def urlencode_postdata(*args, **kargs):
14f25df2 3096 return urllib.parse.urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
3097
3098
38f9ef31 3099def update_url_query(url, query):
cacd9966
YCH
3100 if not query:
3101 return url
14f25df2 3102 parsed_url = urllib.parse.urlparse(url)
3103 qs = urllib.parse.parse_qs(parsed_url.query)
38f9ef31 3104 qs.update(query)
14f25df2 3105 return urllib.parse.urlunparse(parsed_url._replace(
3106 query=urllib.parse.urlencode(qs, True)))
16392824 3107
8e60dc75 3108
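# Editorial usage sketch (not part of the original module; the URL is made up):
#   update_url_query('https://example.com/path?a=1', {'b': '2'})
#   # -> 'https://example.com/path?a=1&b=2'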
c043c246 3109def update_Request(req, url=None, data=None, headers=None, query=None):
ed0291d1 3110 req_headers = req.headers.copy()
c043c246 3111 req_headers.update(headers or {})
ed0291d1
S
3112 req_data = data or req.data
3113 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
3114 req_get_method = req.get_method()
3115 if req_get_method == 'HEAD':
3116 req_type = HEADRequest
3117 elif req_get_method == 'PUT':
3118 req_type = PUTRequest
3119 else:
ac668111 3120 req_type = urllib.request.Request
ed0291d1
S
3121 new_req = req_type(
3122 req_url, data=req_data, headers=req_headers,
3123 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3124 if hasattr(req, 'timeout'):
3125 new_req.timeout = req.timeout
3126 return new_req
3127
3128
10c87c15 3129def _multipart_encode_impl(data, boundary):
0c265486
YCH
3130 content_type = 'multipart/form-data; boundary=%s' % boundary
3131
3132 out = b''
3133 for k, v in data.items():
3134 out += b'--' + boundary.encode('ascii') + b'\r\n'
14f25df2 3135 if isinstance(k, str):
0f06bcd7 3136 k = k.encode()
14f25df2 3137 if isinstance(v, str):
0f06bcd7 3138 v = v.encode()
0c265486
YCH
3139 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3140 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 3141 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
3142 if boundary.encode('ascii') in content:
3143 raise ValueError('Boundary overlaps with data')
3144 out += content
3145
3146 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3147
3148 return out, content_type
3149
3150
3151def multipart_encode(data, boundary=None):
3152 '''
3153 Encode a dict to RFC 7578-compliant form-data
3154
3155 data:
3156 A dict where keys and values can be either Unicode or bytes-like
3157 objects.
3158 boundary:
 3159 If specified, it must be a Unicode object and is used as the boundary. Otherwise
3160 a random boundary is generated.
3161
3162 Reference: https://tools.ietf.org/html/rfc7578
3163 '''
3164 has_specified_boundary = boundary is not None
3165
3166 while True:
3167 if boundary is None:
3168 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3169
3170 try:
10c87c15 3171 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
3172 break
3173 except ValueError:
3174 if has_specified_boundary:
3175 raise
3176 boundary = None
3177
3178 return out, content_type
3179
3180
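# Illustrative sketch (not part of the original source): multipart_encode turns a
# dict into an RFC 7578 form-data body plus the matching Content-Type header
# value. The boundary below is a hypothetical example.
def _example_multipart_encode():
    body, content_type = multipart_encode({'field': 'value'}, boundary='----12345')
    assert content_type == 'multipart/form-data; boundary=----12345'
    assert body == (b'------12345\r\n'
                    b'Content-Disposition: form-data; name="field"\r\n\r\n'
                    b'value\r\n'
                    b'------12345--\r\n')
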
86296ad2 3181def dict_get(d, key_or_keys, default=None, skip_false_values=True):
a44ca5a4 3182 for val in map(d.get, variadic(key_or_keys)):
3183 if val is not None and (val or not skip_false_values):
3184 return val
3185 return default
cbecc9b9
S
3186
3187
c4f60dd7 3188def try_call(*funcs, expected_type=None, args=[], kwargs={}):
3189 for f in funcs:
a32a9a7e 3190 try:
c4f60dd7 3191 val = f(*args, **kwargs)
3192 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
a32a9a7e
S
3193 pass
3194 else:
c4f60dd7 3195 if expected_type is None or isinstance(val, expected_type):
3196 return val
3197
3198
3199def try_get(src, getter, expected_type=None):
3200 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
329ca3be
S
3201
3202
90137ca4 3203def filter_dict(dct, cndn=lambda _, v: v is not None):
3204 return {k: v for k, v in dct.items() if cndn(k, v)}
3205
3206
6cc62232
S
3207def merge_dicts(*dicts):
3208 merged = {}
3209 for a_dict in dicts:
3210 for k, v in a_dict.items():
90137ca4 3211 if (v is not None and k not in merged
3212 or isinstance(v, str) and merged[k] == ''):
6cc62232
S
3213 merged[k] = v
3214 return merged
3215
3216
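# Illustrative sketch (not part of the original source) of the small dict helpers
# above, with hypothetical data: dict_get returns the first usable value among
# several keys, try_get swallows lookup errors, and merge_dicts lets earlier
# dicts take precedence while still filling in missing/None values.
def _example_dict_helpers():
    meta = {'title': '', 'name': 'clip', 'id': None}
    assert dict_get(meta, ('title', 'name'), default='?') == 'clip'
    assert try_get(meta, lambda x: x['missing']['key'], str) is None
    assert merge_dicts({'title': 'A', 'id': None}, {'title': 'B', 'id': '42'}) == {'title': 'A', 'id': '42'}
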
8e60dc75 3217def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
14f25df2 3218 return string if isinstance(string, str) else str(string, encoding, errors)
8e60dc75 3219
16392824 3220
a1a530b0
PH
3221US_RATINGS = {
3222 'G': 0,
3223 'PG': 10,
3224 'PG-13': 13,
3225 'R': 16,
3226 'NC': 18,
3227}
fac55558
PH
3228
3229
a8795327 3230TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
3231 'TV-Y': 0,
3232 'TV-Y7': 7,
3233 'TV-G': 0,
3234 'TV-PG': 0,
3235 'TV-14': 14,
3236 'TV-MA': 17,
a8795327
S
3237}
3238
3239
146c80e2 3240def parse_age_limit(s):
19a03940 3241 # isinstance(False, int) is True. So type() must be used instead
c487cf00 3242 if type(s) is int: # noqa: E721
a8795327 3243 return s if 0 <= s <= 21 else None
19a03940 3244 elif not isinstance(s, str):
d838b1bd 3245 return None
146c80e2 3246 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
3247 if m:
3248 return int(m.group('age'))
5c5fae6d 3249 s = s.upper()
a8795327
S
3250 if s in US_RATINGS:
3251 return US_RATINGS[s]
5a16c9d9 3252 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 3253 if m:
5a16c9d9 3254 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 3255 return None
146c80e2
S
3256
3257
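# Illustrative sketch (not part of the original source): parse_age_limit accepts
# plain integers, "NN+" strings, US movie ratings and US TV parental guidelines.
def _example_parse_age_limit():
    assert parse_age_limit(18) == 18
    assert parse_age_limit('18+') == 18
    assert parse_age_limit('PG-13') == 13
    assert parse_age_limit('TV-MA') == 17
    assert parse_age_limit('not a rating') is None
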
fac55558 3258def strip_jsonp(code):
609a61e3 3259 return re.sub(
5552c9eb 3260 r'''(?sx)^
e9c671d5 3261 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
3262 (?:\s*&&\s*(?P=func_name))?
3263 \s*\(\s*(?P<callback_data>.*)\);?
3264 \s*?(?://[^\n]*)*$''',
3265 r'\g<callback_data>', code)
478c2c61
PH
3266
3267
8f53dc44 3268def js_to_json(code, vars={}, *, strict=False):
5c610515 3269 # vars is a dict of var, val pairs to substitute
c843e685 3270 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
86e5f3ed 3271 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
4195096e 3272 INTEGER_TABLE = (
86e5f3ed 3273 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
3274 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
4195096e
S
3275 )
3276
e05f6939 3277 def fix_kv(m):
e7b6d122
PH
3278 v = m.group(0)
3279 if v in ('true', 'false', 'null'):
3280 return v
421ddcb8
C
3281 elif v in ('undefined', 'void 0'):
3282 return 'null'
8bdd16b4 3283 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
bd1e4844 3284 return ""
3285
3286 if v[0] in ("'", '"'):
3287 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 3288 '"': '\\"',
bd1e4844 3289 "\\'": "'",
3290 '\\\n': '',
3291 '\\x': '\\u00',
3292 }.get(m.group(0), m.group(0)), v[1:-1])
8bdd16b4 3293 else:
3294 for regex, base in INTEGER_TABLE:
3295 im = re.match(regex, v)
3296 if im:
3297 i = int(im.group(1), base)
3298 return '"%d":' % i if v.endswith(':') else '%d' % i
89ac4a19 3299
5c610515 3300 if v in vars:
3301 return vars[v]
8f53dc44 3302 if strict:
3303 raise ValueError(f'Unknown value: {v}')
5c610515 3304
e7b6d122 3305 return '"%s"' % v
e05f6939 3306
8072ef2b 3307 def create_map(mobj):
3308 return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars))))
3309
8072ef2b 3310 code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code)
8f53dc44 3311 if not strict:
3312 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
febff4c1 3313
bd1e4844 3314 return re.sub(r'''(?sx)
3315 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3316 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4195096e 3317 {comment}|,(?={skip}[\]}}])|
421ddcb8 3318 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
4195096e 3319 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
8bdd16b4 3320 [0-9]+(?={skip}:)|
3321 !+
4195096e 3322 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
e05f6939
PH
3323
3324
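# Illustrative sketch (not part of the original source): js_to_json rewrites a
# JavaScript object literal into strict JSON (bare keys quoted, single quotes
# converted, undefined/void 0 mapped to null, hex/octal integers decoded).
# Hypothetical input data.
def _example_js_to_json():
    assert js_to_json("{a: 'b', c: undefined}") == '{"a": "b", "c": null}'
    assert json.loads(js_to_json('{count: 0x1A}')) == {'count': 26}
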
478c2c61
PH
3325def qualities(quality_ids):
3326 """ Get a numeric quality value out of a list of possible values """
3327 def q(qid):
3328 try:
3329 return quality_ids.index(qid)
3330 except ValueError:
3331 return -1
3332 return q
3333
acd69589 3334
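# Illustrative sketch (not part of the original source): qualities() maps a known
# ordering of format identifiers to comparable integers (-1 for unknown values).
def _example_qualities():
    q = qualities(['240p', '480p', '720p'])
    assert q('720p') == 2 and q('240p') == 0 and q('4320p') == -1
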
8aa0e7cd 3335POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist')
1e43a6f7 3336
3337
de6000d9 3338DEFAULT_OUTTMPL = {
3339 'default': '%(title)s [%(id)s].%(ext)s',
72755351 3340 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
de6000d9 3341}
3342OUTTMPL_TYPES = {
72755351 3343 'chapter': None,
de6000d9 3344 'subtitle': None,
3345 'thumbnail': None,
3346 'description': 'description',
3347 'annotation': 'annotations.xml',
3348 'infojson': 'info.json',
08438d2c 3349 'link': None,
3b603dbd 3350 'pl_video': None,
5112f26a 3351 'pl_thumbnail': None,
de6000d9 3352 'pl_description': 'description',
3353 'pl_infojson': 'info.json',
3354}
0a871f68 3355
143db31d 3356# As of [1] format syntax is:
3357# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3358# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
901130bb 3359STR_FORMAT_RE_TMPL = r'''(?x)
3360 (?<!%)(?P<prefix>(?:%%)*)
143db31d 3361 %
524e2e4f 3362 (?P<has_key>\((?P<key>{0})\))?
752cda38 3363 (?P<format>
524e2e4f 3364 (?P<conversion>[#0\-+ ]+)?
3365 (?P<min_width>\d+)?
3366 (?P<precision>\.\d+)?
3367 (?P<len_mod>[hlL])? # unused in python
901130bb 3368 {1} # conversion type
752cda38 3369 )
143db31d 3370'''
3371
7d1eb38a 3372
901130bb 3373STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3374
7d1eb38a 3375
a020a0dc
PH
3376def limit_length(s, length):
3377 """ Add ellipses to overly long strings """
3378 if s is None:
3379 return None
3380 ELLIPSES = '...'
3381 if len(s) > length:
3382 return s[:length - len(ELLIPSES)] + ELLIPSES
3383 return s
48844745
PH
3384
3385
3386def version_tuple(v):
5f9b8394 3387 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
3388
3389
3390def is_outdated_version(version, limit, assume_new=True):
3391 if not version:
3392 return not assume_new
3393 try:
3394 return version_tuple(version) < version_tuple(limit)
3395 except ValueError:
3396 return not assume_new
732ea2f0
PH
3397
3398
3399def ytdl_is_updateable():
7a5c1cfe 3400 """ Returns whether yt-dlp can be updated with -U """
735d865e 3401
5d535b4a 3402 from .update import is_non_updateable
732ea2f0 3403
5d535b4a 3404 return not is_non_updateable()
7d4111ed
PH
3405
3406
3407def args_to_str(args):
3408 # Get a short string representation for a subprocess command
702ccf2d 3409 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
3410
3411
9b9c5355 3412def error_to_compat_str(err):
cfb0511d 3413 return str(err)
fdae2358
S
3414
3415
a44ca5a4 3416def error_to_str(err):
3417 return f'{type(err).__name__}: {err}'
3418
3419
c460bdd5 3420def mimetype2ext(mt):
eb9ee194
S
3421 if mt is None:
3422 return None
3423
9359f3d4
F
3424 mt, _, params = mt.partition(';')
3425 mt = mt.strip()
3426
3427 FULL_MAP = {
765ac263 3428 'audio/mp4': 'm4a',
6c33d24b
YCH
3429 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3430 # it's the most popular one
3431 'audio/mpeg': 'mp3',
ba39289d 3432 'audio/x-wav': 'wav',
9359f3d4
F
3433 'audio/wav': 'wav',
3434 'audio/wave': 'wav',
3435 }
3436
3437 ext = FULL_MAP.get(mt)
765ac263
JMF
3438 if ext is not None:
3439 return ext
3440
9359f3d4 3441 SUBTYPE_MAP = {
f6861ec9 3442 '3gpp': '3gp',
cafcf657 3443 'smptett+xml': 'tt',
cafcf657 3444 'ttaf+xml': 'dfxp',
a0d8d704 3445 'ttml+xml': 'ttml',
f6861ec9 3446 'x-flv': 'flv',
a0d8d704 3447 'x-mp4-fragmented': 'mp4',
d4f05d47 3448 'x-ms-sami': 'sami',
a0d8d704 3449 'x-ms-wmv': 'wmv',
b4173f15
RA
3450 'mpegurl': 'm3u8',
3451 'x-mpegurl': 'm3u8',
3452 'vnd.apple.mpegurl': 'm3u8',
3453 'dash+xml': 'mpd',
b4173f15 3454 'f4m+xml': 'f4m',
f164b971 3455 'hds+xml': 'f4m',
e910fe2f 3456 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 3457 'quicktime': 'mov',
98ce1a3f 3458 'mp2t': 'ts',
39e7107d 3459 'x-wav': 'wav',
9359f3d4
F
3460 'filmstrip+json': 'fs',
3461 'svg+xml': 'svg',
3462 }
3463
3464 _, _, subtype = mt.rpartition('/')
3465 ext = SUBTYPE_MAP.get(subtype.lower())
3466 if ext is not None:
3467 return ext
3468
3469 SUFFIX_MAP = {
3470 'json': 'json',
3471 'xml': 'xml',
3472 'zip': 'zip',
3473 'gzip': 'gz',
3474 }
3475
3476 _, _, suffix = subtype.partition('+')
3477 ext = SUFFIX_MAP.get(suffix)
3478 if ext is not None:
3479 return ext
3480
3481 return subtype.replace('+', '.')
c460bdd5
PH
3482
3483
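# Illustrative sketch (not part of the original source): mimetype2ext first tries
# an explicit map, then the subtype, then any "+suffix", and finally falls back
# to the subtype itself.
def _example_mimetype2ext():
    assert mimetype2ext('audio/mp4') == 'm4a'
    assert mimetype2ext('application/x-mpegURL') == 'm3u8'
    assert mimetype2ext('application/dash+xml') == 'mpd'
    assert mimetype2ext('text/vtt') == 'vtt'
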
2814f12b
THD
3484def ext2mimetype(ext_or_url):
3485 if not ext_or_url:
3486 return None
3487 if '.' not in ext_or_url:
3488 ext_or_url = f'file.{ext_or_url}'
3489 return mimetypes.guess_type(ext_or_url)[0]
3490
3491
4f3c5e06 3492def parse_codecs(codecs_str):
3493 # http://tools.ietf.org/html/rfc6381
3494 if not codecs_str:
3495 return {}
a0566bbf 3496 split_codecs = list(filter(None, map(
dbf5416a 3497 str.strip, codecs_str.strip().strip(',').split(','))))
3fe75fdc 3498 vcodec, acodec, scodec, hdr = None, None, None, None
a0566bbf 3499 for full_codec in split_codecs:
d816f61f 3500 parts = re.sub(r'0+(?=\d)', '', full_codec).split('.')
3501 if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3502 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3503 if vcodec:
3504 continue
3505 vcodec = full_codec
3506 if parts[0] in ('dvh1', 'dvhe'):
3507 hdr = 'DV'
3508 elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10':
3509 hdr = 'HDR10'
3510 elif parts[:2] == ['vp9', '2']:
3511 hdr = 'HDR10'
3512 elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac',
3513 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3514 acodec = acodec or full_codec
3515 elif parts[0] in ('stpp', 'wvtt'):
3516 scodec = scodec or full_codec
4f3c5e06 3517 else:
19a03940 3518 write_string(f'WARNING: Unknown codec {full_codec}\n')
3fe75fdc 3519 if vcodec or acodec or scodec:
4f3c5e06 3520 return {
3521 'vcodec': vcodec or 'none',
3522 'acodec': acodec or 'none',
176f1866 3523 'dynamic_range': hdr,
3fe75fdc 3524 **({'scodec': scodec} if scodec is not None else {}),
4f3c5e06 3525 }
b69fd25c 3526 elif len(split_codecs) == 2:
3527 return {
3528 'vcodec': split_codecs[0],
3529 'acodec': split_codecs[1],
3530 }
4f3c5e06 3531 return {}
3532
3533
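# Illustrative sketch (not part of the original source): parse_codecs splits an
# RFC 6381 codecs string into vcodec/acodec (and HDR information when it can be
# detected). Hypothetical codec strings.
def _example_parse_codecs():
    assert parse_codecs('avc1.64001f, mp4a.40.2') == {
        'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}
    assert parse_codecs('dvh1.05.06')['dynamic_range'] == 'DV'
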
fc61aff4
LL
3534def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None):
3535 assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts)
3536
3537 allow_mkv = not preferences or 'mkv' in preferences
3538
3539 if allow_mkv and max(len(acodecs), len(vcodecs)) > 1:
3540 return 'mkv' # TODO: any other format allows this?
3541
 3542 # TODO: Not all codecs supported by parse_codecs are handled here
3543 COMPATIBLE_CODECS = {
3544 'mp4': {
3545 'av1', 'hevc', 'avc1', 'mp4a', # fourcc (m3u8, mpd)
3546 'h264', 'aacl', # Set in ISM
3547 },
3548 'webm': {
3549 'av1', 'vp9', 'vp8', 'opus', 'vrbs',
3550 'vp9x', 'vp8x', # in the webm spec
3551 },
3552 }
3553
8f84770a 3554 sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', ''))
3555 vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs)
fc61aff4
LL
3556
3557 for ext in preferences or COMPATIBLE_CODECS.keys():
3558 codec_set = COMPATIBLE_CODECS.get(ext, set())
3559 if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)):
3560 return ext
3561
3562 COMPATIBLE_EXTS = (
3563 {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'},
3564 {'webm'},
3565 )
3566 for ext in preferences or vexts:
3567 current_exts = {ext, *vexts, *aexts}
3568 if ext == 'mkv' or current_exts == {ext} or any(
3569 ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS):
3570 return ext
3571 return 'mkv' if allow_mkv else preferences[-1]
3572
3573
2ccd1b10 3574def urlhandle_detect_ext(url_handle):
79298173 3575 getheader = url_handle.headers.get
2ccd1b10 3576
b55ee18f
PH
3577 cd = getheader('Content-Disposition')
3578 if cd:
3579 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3580 if m:
3581 e = determine_ext(m.group('filename'), default_ext=None)
3582 if e:
3583 return e
3584
c460bdd5 3585 return mimetype2ext(getheader('Content-Type'))
05900629
PH
3586
3587
1e399778
YCH
3588def encode_data_uri(data, mime_type):
3589 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3590
3591
05900629 3592def age_restricted(content_limit, age_limit):
6ec6cb4e 3593 """ Returns True iff the content should be blocked """
05900629
PH
3594
3595 if age_limit is None: # No limit set
3596 return False
3597 if content_limit is None:
3598 return False # Content available for everyone
3599 return age_limit < content_limit
61ca9a80
PH
3600
3601
88f60feb 3602# List of known byte-order-marks (BOM)
a904a7f8
L
3603BOMS = [
3604 (b'\xef\xbb\xbf', 'utf-8'),
3605 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3606 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3607 (b'\xff\xfe', 'utf-16-le'),
3608 (b'\xfe\xff', 'utf-16-be'),
3609]
a904a7f8
L
3610
3611
61ca9a80
PH
3612def is_html(first_bytes):
3613 """ Detect whether a file contains HTML by examining its first bytes. """
3614
80e8493e 3615 encoding = 'utf-8'
61ca9a80 3616 for bom, enc in BOMS:
80e8493e 3617 while first_bytes.startswith(bom):
3618 encoding, first_bytes = enc, first_bytes[len(bom):]
61ca9a80 3619
80e8493e 3620 return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace'))
a055469f
PH
3621
3622
3623def determine_protocol(info_dict):
3624 protocol = info_dict.get('protocol')
3625 if protocol is not None:
3626 return protocol
3627
7de837a5 3628 url = sanitize_url(info_dict['url'])
a055469f
PH
3629 if url.startswith('rtmp'):
3630 return 'rtmp'
3631 elif url.startswith('mms'):
3632 return 'mms'
3633 elif url.startswith('rtsp'):
3634 return 'rtsp'
3635
3636 ext = determine_ext(url)
3637 if ext == 'm3u8':
deae7c17 3638 return 'm3u8' if info_dict.get('is_live') else 'm3u8_native'
a055469f
PH
3639 elif ext == 'f4m':
3640 return 'f4m'
3641
14f25df2 3642 return urllib.parse.urlparse(url).scheme
cfb56d1a
PH
3643
3644
c5e3f849 3645def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3646 """ Render a list of rows, each as a list of values.
3647 Text after a \t will be right aligned """
ec11a9f4 3648 def width(string):
c5e3f849 3649 return len(remove_terminal_sequences(string).replace('\t', ''))
76d321f6 3650
3651 def get_max_lens(table):
ec11a9f4 3652 return [max(width(str(v)) for v in col) for col in zip(*table)]
76d321f6 3653
3654 def filter_using_list(row, filterArray):
d16df59d 3655 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
76d321f6 3656
d16df59d 3657 max_lens = get_max_lens(data) if hide_empty else []
3658 header_row = filter_using_list(header_row, max_lens)
3659 data = [filter_using_list(row, max_lens) for row in data]
76d321f6 3660
cfb56d1a 3661 table = [header_row] + data
76d321f6 3662 max_lens = get_max_lens(table)
c5e3f849 3663 extra_gap += 1
76d321f6 3664 if delim:
c5e3f849 3665 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
1ed7953a 3666 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
ec11a9f4 3667 for row in table:
3668 for pos, text in enumerate(map(str, row)):
c5e3f849 3669 if '\t' in text:
3670 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3671 else:
3672 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3673 ret = '\n'.join(''.join(row).rstrip() for row in table)
ec11a9f4 3674 return ret
347de493
PH
3675
3676
8f18aca8 3677def _match_one(filter_part, dct, incomplete):
77b87f05 3678 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3679 STRING_OPERATORS = {
3680 '*=': operator.contains,
3681 '^=': lambda attr, value: attr.startswith(value),
3682 '$=': lambda attr, value: attr.endswith(value),
3683 '~=': lambda attr, value: re.search(value, attr),
3684 }
347de493 3685 COMPARISON_OPERATORS = {
a047eeb6 3686 **STRING_OPERATORS,
3687 '<=': operator.le, # "<=" must be defined above "<"
347de493 3688 '<': operator.lt,
347de493 3689 '>=': operator.ge,
a047eeb6 3690 '>': operator.gt,
347de493 3691 '=': operator.eq,
347de493 3692 }
a047eeb6 3693
6db9c4d5 3694 if isinstance(incomplete, bool):
3695 is_incomplete = lambda _: incomplete
3696 else:
3697 is_incomplete = lambda k: k in incomplete
3698
64fa820c 3699 operator_rex = re.compile(r'''(?x)
347de493 3700 (?P<key>[a-z_]+)
77b87f05 3701 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3702 (?:
a047eeb6 3703 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3704 (?P<strval>.+?)
347de493 3705 )
347de493 3706 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
64fa820c 3707 m = operator_rex.fullmatch(filter_part.strip())
347de493 3708 if m:
18f96d12 3709 m = m.groupdict()
3710 unnegated_op = COMPARISON_OPERATORS[m['op']]
3711 if m['negation']:
77b87f05
MT
3712 op = lambda attr, value: not unnegated_op(attr, value)
3713 else:
3714 op = unnegated_op
18f96d12 3715 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3716 if m['quote']:
3717 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3718 actual_value = dct.get(m['key'])
3719 numeric_comparison = None
f9934b96 3720 if isinstance(actual_value, (int, float)):
e5a088dc
S
 3721 # If the original field is a string and the matching comparison value is
 3722 # a number, we should respect the origin of the original field
3723 # and process comparison value as a string (see
18f96d12 3724 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3725 try:
18f96d12 3726 numeric_comparison = int(comparison_value)
347de493 3727 except ValueError:
18f96d12 3728 numeric_comparison = parse_filesize(comparison_value)
3729 if numeric_comparison is None:
3730 numeric_comparison = parse_filesize(f'{comparison_value}B')
3731 if numeric_comparison is None:
3732 numeric_comparison = parse_duration(comparison_value)
3733 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3734 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3735 if actual_value is None:
6db9c4d5 3736 return is_incomplete(m['key']) or m['none_inclusive']
18f96d12 3737 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3738
3739 UNARY_OPERATORS = {
1cc47c66
S
3740 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3741 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493 3742 }
64fa820c 3743 operator_rex = re.compile(r'''(?x)
347de493 3744 (?P<op>%s)\s*(?P<key>[a-z_]+)
347de493 3745 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
64fa820c 3746 m = operator_rex.fullmatch(filter_part.strip())
347de493
PH
3747 if m:
3748 op = UNARY_OPERATORS[m.group('op')]
3749 actual_value = dct.get(m.group('key'))
6db9c4d5 3750 if is_incomplete(m.group('key')) and actual_value is None:
8f18aca8 3751 return True
347de493
PH
3752 return op(actual_value)
3753
3754 raise ValueError('Invalid filter part %r' % filter_part)
3755
3756
8f18aca8 3757def match_str(filter_str, dct, incomplete=False):
6db9c4d5 3758 """ Filter a dictionary with a simple string syntax.
3759 @returns Whether the filter passes
3760 @param incomplete Set of keys that is expected to be missing from dct.
3761 Can be True/False to indicate all/none of the keys may be missing.
3762 All conditions on incomplete keys pass if the key is missing
8f18aca8 3763 """
347de493 3764 return all(
8f18aca8 3765 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
a047eeb6 3766 for filter_part in re.split(r'(?<!\\)&', filter_str))
347de493
PH
3767
3768
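# Illustrative sketch (not part of the original source): match_str evaluates the
# same filter syntax as --match-filters against an info dict. '&' joins
# conditions and '?' makes a condition pass when the field is missing.
def _example_match_str():
    info = {'duration': 300, 'title': 'example clip'}
    assert match_str('duration < 600 & title ~= clip', info)
    assert match_str('like_count >? 100', info)
    assert not match_str('duration > 600', info)
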
b1a7cd05 3769def match_filter_func(filters):
3770 if not filters:
d1b5f70b 3771 return None
492272fe 3772 filters = set(variadic(filters))
d1b5f70b 3773
492272fe 3774 interactive = '-' in filters
3775 if interactive:
3776 filters.remove('-')
3777
3778 def _match_func(info_dict, incomplete=False):
3779 if not filters or any(match_str(f, info_dict, incomplete) for f in filters):
3780 return NO_DEFAULT if interactive and not incomplete else None
347de493 3781 else:
3bec830a 3782 video_title = info_dict.get('title') or info_dict.get('id') or 'entry'
b1a7cd05 3783 filter_str = ') | ('.join(map(str.strip, filters))
3784 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
347de493 3785 return _match_func
91410c9b
PH
3786
3787
f2df4071 3788class download_range_func:
3789 def __init__(self, chapters, ranges):
3790 self.chapters, self.ranges = chapters, ranges
3791
3792 def __call__(self, info_dict, ydl):
5ec1b6b7 3793 warning = ('There are no chapters matching the regex' if info_dict.get('chapters')
56ba69e4 3794 else 'Cannot match chapters since chapter information is unavailable')
f2df4071 3795 for regex in self.chapters or []:
5ec1b6b7 3796 for i, chapter in enumerate(info_dict.get('chapters') or []):
3797 if re.search(regex, chapter['title']):
3798 warning = None
3799 yield {**chapter, 'index': i}
f2df4071 3800 if self.chapters and warning:
5ec1b6b7 3801 ydl.to_screen(f'[info] {info_dict["id"]}: {warning}')
3802
f2df4071 3803 yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or [])
5ec1b6b7 3804
f2df4071 3805 def __eq__(self, other):
3806 return (isinstance(other, download_range_func)
3807 and self.chapters == other.chapters and self.ranges == other.ranges)
5ec1b6b7 3808
3809
bf6427d2
YCH
3810def parse_dfxp_time_expr(time_expr):
3811 if not time_expr:
d631d5f9 3812 return
bf6427d2 3813
1d485a1a 3814 mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr)
bf6427d2
YCH
3815 if mobj:
3816 return float(mobj.group('time_offset'))
3817
db2fe38b 3818 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 3819 if mobj:
db2fe38b 3820 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
3821
3822
c1c924ab 3823def srt_subtitles_timecode(seconds):
aa7785f8 3824 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3825
3826
3827def ass_subtitles_timecode(seconds):
3828 time = timetuple_from_msec(seconds * 1000)
3829 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
bf6427d2
YCH
3830
3831
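# Illustrative sketch (not part of the original source): parse_dfxp_time_expr
# accepts both "12.5s"-style offsets and clock values, while the two *_timecode
# helpers format a number of seconds for SRT/ASS output. Expected results are
# shown as comments rather than asserts since they depend on
# timetuple_from_msec defined elsewhere in this module.
def _example_subtitle_timecodes():
    assert parse_dfxp_time_expr('00:01:02.5') == 62.5
    srt = srt_subtitles_timecode(62.5)  # '00:01:02,500'
    ass = ass_subtitles_timecode(62.5)  # '0:01:02.50'
    return srt, ass
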
3832def dfxp2srt(dfxp_data):
3869028f
YCH
3833 '''
3834 @param dfxp_data A bytes-like object containing DFXP data
3835 @returns A unicode object containing converted SRT data
3836 '''
5b995f71 3837 LEGACY_NAMESPACES = (
3869028f
YCH
3838 (b'http://www.w3.org/ns/ttml', [
3839 b'http://www.w3.org/2004/11/ttaf1',
3840 b'http://www.w3.org/2006/04/ttaf1',
3841 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 3842 ]),
3869028f
YCH
3843 (b'http://www.w3.org/ns/ttml#styling', [
3844 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
3845 ]),
3846 )
3847
3848 SUPPORTED_STYLING = [
3849 'color',
3850 'fontFamily',
3851 'fontSize',
3852 'fontStyle',
3853 'fontWeight',
3854 'textDecoration'
3855 ]
3856
4e335771 3857 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 3858 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 3859 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 3860 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 3861 })
bf6427d2 3862
5b995f71
RA
3863 styles = {}
3864 default_style = {}
3865
86e5f3ed 3866 class TTMLPElementParser:
5b995f71
RA
3867 _out = ''
3868 _unclosed_elements = []
3869 _applied_styles = []
bf6427d2 3870
2b14cb56 3871 def start(self, tag, attrib):
5b995f71
RA
3872 if tag in (_x('ttml:br'), 'br'):
3873 self._out += '\n'
3874 else:
3875 unclosed_elements = []
3876 style = {}
3877 element_style_id = attrib.get('style')
3878 if default_style:
3879 style.update(default_style)
3880 if element_style_id:
3881 style.update(styles.get(element_style_id, {}))
3882 for prop in SUPPORTED_STYLING:
3883 prop_val = attrib.get(_x('tts:' + prop))
3884 if prop_val:
3885 style[prop] = prop_val
3886 if style:
3887 font = ''
3888 for k, v in sorted(style.items()):
3889 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3890 continue
3891 if k == 'color':
3892 font += ' color="%s"' % v
3893 elif k == 'fontSize':
3894 font += ' size="%s"' % v
3895 elif k == 'fontFamily':
3896 font += ' face="%s"' % v
3897 elif k == 'fontWeight' and v == 'bold':
3898 self._out += '<b>'
3899 unclosed_elements.append('b')
3900 elif k == 'fontStyle' and v == 'italic':
3901 self._out += '<i>'
3902 unclosed_elements.append('i')
3903 elif k == 'textDecoration' and v == 'underline':
3904 self._out += '<u>'
3905 unclosed_elements.append('u')
3906 if font:
3907 self._out += '<font' + font + '>'
3908 unclosed_elements.append('font')
3909 applied_style = {}
3910 if self._applied_styles:
3911 applied_style.update(self._applied_styles[-1])
3912 applied_style.update(style)
3913 self._applied_styles.append(applied_style)
3914 self._unclosed_elements.append(unclosed_elements)
bf6427d2 3915
2b14cb56 3916 def end(self, tag):
5b995f71
RA
3917 if tag not in (_x('ttml:br'), 'br'):
3918 unclosed_elements = self._unclosed_elements.pop()
3919 for element in reversed(unclosed_elements):
3920 self._out += '</%s>' % element
3921 if unclosed_elements and self._applied_styles:
3922 self._applied_styles.pop()
bf6427d2 3923
2b14cb56 3924 def data(self, data):
5b995f71 3925 self._out += data
2b14cb56 3926
3927 def close(self):
5b995f71 3928 return self._out.strip()
2b14cb56 3929
3930 def parse_node(node):
3931 target = TTMLPElementParser()
3932 parser = xml.etree.ElementTree.XMLParser(target=target)
3933 parser.feed(xml.etree.ElementTree.tostring(node))
3934 return parser.close()
bf6427d2 3935
5b995f71
RA
3936 for k, v in LEGACY_NAMESPACES:
3937 for ns in v:
3938 dfxp_data = dfxp_data.replace(ns, k)
3939
3869028f 3940 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 3941 out = []
5b995f71 3942 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
3943
3944 if not paras:
3945 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 3946
5b995f71
RA
3947 repeat = False
3948 while True:
3949 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
3950 style_id = style.get('id') or style.get(_x('xml:id'))
3951 if not style_id:
3952 continue
5b995f71
RA
3953 parent_style_id = style.get('style')
3954 if parent_style_id:
3955 if parent_style_id not in styles:
3956 repeat = True
3957 continue
3958 styles[style_id] = styles[parent_style_id].copy()
3959 for prop in SUPPORTED_STYLING:
3960 prop_val = style.get(_x('tts:' + prop))
3961 if prop_val:
3962 styles.setdefault(style_id, {})[prop] = prop_val
3963 if repeat:
3964 repeat = False
3965 else:
3966 break
3967
3968 for p in ('body', 'div'):
3969 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3970 if ele is None:
3971 continue
3972 style = styles.get(ele.get('style'))
3973 if not style:
3974 continue
3975 default_style.update(style)
3976
bf6427d2 3977 for para, index in zip(paras, itertools.count(1)):
d631d5f9 3978 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 3979 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
3980 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3981 if begin_time is None:
3982 continue
7dff0363 3983 if not end_time:
d631d5f9
YCH
3984 if not dur:
3985 continue
3986 end_time = begin_time + dur
bf6427d2
YCH
3987 out.append('%d\n%s --> %s\n%s\n\n' % (
3988 index,
c1c924ab
YCH
3989 srt_subtitles_timecode(begin_time),
3990 srt_subtitles_timecode(end_time),
bf6427d2
YCH
3991 parse_node(para)))
3992
3993 return ''.join(out)
3994
3995
c487cf00 3996def cli_option(params, command_option, param, separator=None):
66e289ba 3997 param = params.get(param)
c487cf00 3998 return ([] if param is None
3999 else [command_option, str(param)] if separator is None
4000 else [f'{command_option}{separator}{param}'])
66e289ba
S
4001
4002
4003def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
4004 param = params.get(param)
c487cf00 4005 assert param in (True, False, None)
4006 return cli_option({True: true_value, False: false_value}, command_option, param, separator)
66e289ba
S
4007
4008
4009def cli_valueless_option(params, command_option, param, expected_value=True):
c487cf00 4010 return [command_option] if params.get(param) == expected_value else []
66e289ba
S
4011
4012
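# Illustrative sketch (not part of the original source): the cli_* helpers above
# translate option dicts into argv fragments for external programs. The
# parameter names used here are hypothetical.
def _example_cli_helpers():
    params = {'proxy': 'socks5://127.0.0.1:1080', 'verbose': True}
    assert cli_option(params, '--proxy', 'proxy') == ['--proxy', 'socks5://127.0.0.1:1080']
    assert cli_bool_option(params, '--verbose-mode', 'verbose') == ['--verbose-mode', 'true']
    assert cli_valueless_option(params, '--verbose', 'verbose') == ['--verbose']
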
e92caff5 4013def cli_configuration_args(argdict, keys, default=[], use_compat=True):
eab9b2bc 4014 if isinstance(argdict, (list, tuple)): # for backward compatibility
e92caff5 4015 if use_compat:
5b1ecbb3 4016 return argdict
4017 else:
4018 argdict = None
eab9b2bc 4019 if argdict is None:
5b1ecbb3 4020 return default
eab9b2bc 4021 assert isinstance(argdict, dict)
4022
e92caff5 4023 assert isinstance(keys, (list, tuple))
4024 for key_list in keys:
e92caff5 4025 arg_list = list(filter(
4026 lambda x: x is not None,
6606817a 4027 [argdict.get(key.lower()) for key in variadic(key_list)]))
e92caff5 4028 if arg_list:
4029 return [arg for args in arg_list for arg in args]
4030 return default
66e289ba 4031
6251555f 4032
330690a2 4033def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
4034 main_key, exe = main_key.lower(), exe.lower()
4035 root_key = exe if main_key == exe else f'{main_key}+{exe}'
4036 keys = [f'{root_key}{k}' for k in (keys or [''])]
4037 if root_key in keys:
4038 if main_key != exe:
4039 keys.append((main_key, exe))
4040 keys.append('default')
4041 else:
4042 use_compat = False
4043 return cli_configuration_args(argdict, keys, default, use_compat)
4044
66e289ba 4045
86e5f3ed 4046class ISO639Utils:
39672624
YCH
4047 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
4048 _lang_map = {
4049 'aa': 'aar',
4050 'ab': 'abk',
4051 'ae': 'ave',
4052 'af': 'afr',
4053 'ak': 'aka',
4054 'am': 'amh',
4055 'an': 'arg',
4056 'ar': 'ara',
4057 'as': 'asm',
4058 'av': 'ava',
4059 'ay': 'aym',
4060 'az': 'aze',
4061 'ba': 'bak',
4062 'be': 'bel',
4063 'bg': 'bul',
4064 'bh': 'bih',
4065 'bi': 'bis',
4066 'bm': 'bam',
4067 'bn': 'ben',
4068 'bo': 'bod',
4069 'br': 'bre',
4070 'bs': 'bos',
4071 'ca': 'cat',
4072 'ce': 'che',
4073 'ch': 'cha',
4074 'co': 'cos',
4075 'cr': 'cre',
4076 'cs': 'ces',
4077 'cu': 'chu',
4078 'cv': 'chv',
4079 'cy': 'cym',
4080 'da': 'dan',
4081 'de': 'deu',
4082 'dv': 'div',
4083 'dz': 'dzo',
4084 'ee': 'ewe',
4085 'el': 'ell',
4086 'en': 'eng',
4087 'eo': 'epo',
4088 'es': 'spa',
4089 'et': 'est',
4090 'eu': 'eus',
4091 'fa': 'fas',
4092 'ff': 'ful',
4093 'fi': 'fin',
4094 'fj': 'fij',
4095 'fo': 'fao',
4096 'fr': 'fra',
4097 'fy': 'fry',
4098 'ga': 'gle',
4099 'gd': 'gla',
4100 'gl': 'glg',
4101 'gn': 'grn',
4102 'gu': 'guj',
4103 'gv': 'glv',
4104 'ha': 'hau',
4105 'he': 'heb',
b7acc835 4106 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
4107 'hi': 'hin',
4108 'ho': 'hmo',
4109 'hr': 'hrv',
4110 'ht': 'hat',
4111 'hu': 'hun',
4112 'hy': 'hye',
4113 'hz': 'her',
4114 'ia': 'ina',
4115 'id': 'ind',
b7acc835 4116 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
4117 'ie': 'ile',
4118 'ig': 'ibo',
4119 'ii': 'iii',
4120 'ik': 'ipk',
4121 'io': 'ido',
4122 'is': 'isl',
4123 'it': 'ita',
4124 'iu': 'iku',
4125 'ja': 'jpn',
4126 'jv': 'jav',
4127 'ka': 'kat',
4128 'kg': 'kon',
4129 'ki': 'kik',
4130 'kj': 'kua',
4131 'kk': 'kaz',
4132 'kl': 'kal',
4133 'km': 'khm',
4134 'kn': 'kan',
4135 'ko': 'kor',
4136 'kr': 'kau',
4137 'ks': 'kas',
4138 'ku': 'kur',
4139 'kv': 'kom',
4140 'kw': 'cor',
4141 'ky': 'kir',
4142 'la': 'lat',
4143 'lb': 'ltz',
4144 'lg': 'lug',
4145 'li': 'lim',
4146 'ln': 'lin',
4147 'lo': 'lao',
4148 'lt': 'lit',
4149 'lu': 'lub',
4150 'lv': 'lav',
4151 'mg': 'mlg',
4152 'mh': 'mah',
4153 'mi': 'mri',
4154 'mk': 'mkd',
4155 'ml': 'mal',
4156 'mn': 'mon',
4157 'mr': 'mar',
4158 'ms': 'msa',
4159 'mt': 'mlt',
4160 'my': 'mya',
4161 'na': 'nau',
4162 'nb': 'nob',
4163 'nd': 'nde',
4164 'ne': 'nep',
4165 'ng': 'ndo',
4166 'nl': 'nld',
4167 'nn': 'nno',
4168 'no': 'nor',
4169 'nr': 'nbl',
4170 'nv': 'nav',
4171 'ny': 'nya',
4172 'oc': 'oci',
4173 'oj': 'oji',
4174 'om': 'orm',
4175 'or': 'ori',
4176 'os': 'oss',
4177 'pa': 'pan',
4178 'pi': 'pli',
4179 'pl': 'pol',
4180 'ps': 'pus',
4181 'pt': 'por',
4182 'qu': 'que',
4183 'rm': 'roh',
4184 'rn': 'run',
4185 'ro': 'ron',
4186 'ru': 'rus',
4187 'rw': 'kin',
4188 'sa': 'san',
4189 'sc': 'srd',
4190 'sd': 'snd',
4191 'se': 'sme',
4192 'sg': 'sag',
4193 'si': 'sin',
4194 'sk': 'slk',
4195 'sl': 'slv',
4196 'sm': 'smo',
4197 'sn': 'sna',
4198 'so': 'som',
4199 'sq': 'sqi',
4200 'sr': 'srp',
4201 'ss': 'ssw',
4202 'st': 'sot',
4203 'su': 'sun',
4204 'sv': 'swe',
4205 'sw': 'swa',
4206 'ta': 'tam',
4207 'te': 'tel',
4208 'tg': 'tgk',
4209 'th': 'tha',
4210 'ti': 'tir',
4211 'tk': 'tuk',
4212 'tl': 'tgl',
4213 'tn': 'tsn',
4214 'to': 'ton',
4215 'tr': 'tur',
4216 'ts': 'tso',
4217 'tt': 'tat',
4218 'tw': 'twi',
4219 'ty': 'tah',
4220 'ug': 'uig',
4221 'uk': 'ukr',
4222 'ur': 'urd',
4223 'uz': 'uzb',
4224 've': 'ven',
4225 'vi': 'vie',
4226 'vo': 'vol',
4227 'wa': 'wln',
4228 'wo': 'wol',
4229 'xh': 'xho',
4230 'yi': 'yid',
e9a50fba 4231 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
4232 'yo': 'yor',
4233 'za': 'zha',
4234 'zh': 'zho',
4235 'zu': 'zul',
4236 }
4237
4238 @classmethod
4239 def short2long(cls, code):
4240 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4241 return cls._lang_map.get(code[:2])
4242
4243 @classmethod
4244 def long2short(cls, code):
4245 """Convert language code from ISO 639-2/T to ISO 639-1"""
4246 for short_name, long_name in cls._lang_map.items():
4247 if long_name == code:
4248 return short_name
4249
4250
86e5f3ed 4251class ISO3166Utils:
4eb10f66
YCH
4252 # From http://data.okfn.org/data/core/country-list
4253 _country_map = {
4254 'AF': 'Afghanistan',
4255 'AX': 'Åland Islands',
4256 'AL': 'Albania',
4257 'DZ': 'Algeria',
4258 'AS': 'American Samoa',
4259 'AD': 'Andorra',
4260 'AO': 'Angola',
4261 'AI': 'Anguilla',
4262 'AQ': 'Antarctica',
4263 'AG': 'Antigua and Barbuda',
4264 'AR': 'Argentina',
4265 'AM': 'Armenia',
4266 'AW': 'Aruba',
4267 'AU': 'Australia',
4268 'AT': 'Austria',
4269 'AZ': 'Azerbaijan',
4270 'BS': 'Bahamas',
4271 'BH': 'Bahrain',
4272 'BD': 'Bangladesh',
4273 'BB': 'Barbados',
4274 'BY': 'Belarus',
4275 'BE': 'Belgium',
4276 'BZ': 'Belize',
4277 'BJ': 'Benin',
4278 'BM': 'Bermuda',
4279 'BT': 'Bhutan',
4280 'BO': 'Bolivia, Plurinational State of',
4281 'BQ': 'Bonaire, Sint Eustatius and Saba',
4282 'BA': 'Bosnia and Herzegovina',
4283 'BW': 'Botswana',
4284 'BV': 'Bouvet Island',
4285 'BR': 'Brazil',
4286 'IO': 'British Indian Ocean Territory',
4287 'BN': 'Brunei Darussalam',
4288 'BG': 'Bulgaria',
4289 'BF': 'Burkina Faso',
4290 'BI': 'Burundi',
4291 'KH': 'Cambodia',
4292 'CM': 'Cameroon',
4293 'CA': 'Canada',
4294 'CV': 'Cape Verde',
4295 'KY': 'Cayman Islands',
4296 'CF': 'Central African Republic',
4297 'TD': 'Chad',
4298 'CL': 'Chile',
4299 'CN': 'China',
4300 'CX': 'Christmas Island',
4301 'CC': 'Cocos (Keeling) Islands',
4302 'CO': 'Colombia',
4303 'KM': 'Comoros',
4304 'CG': 'Congo',
4305 'CD': 'Congo, the Democratic Republic of the',
4306 'CK': 'Cook Islands',
4307 'CR': 'Costa Rica',
4308 'CI': 'Côte d\'Ivoire',
4309 'HR': 'Croatia',
4310 'CU': 'Cuba',
4311 'CW': 'Curaçao',
4312 'CY': 'Cyprus',
4313 'CZ': 'Czech Republic',
4314 'DK': 'Denmark',
4315 'DJ': 'Djibouti',
4316 'DM': 'Dominica',
4317 'DO': 'Dominican Republic',
4318 'EC': 'Ecuador',
4319 'EG': 'Egypt',
4320 'SV': 'El Salvador',
4321 'GQ': 'Equatorial Guinea',
4322 'ER': 'Eritrea',
4323 'EE': 'Estonia',
4324 'ET': 'Ethiopia',
4325 'FK': 'Falkland Islands (Malvinas)',
4326 'FO': 'Faroe Islands',
4327 'FJ': 'Fiji',
4328 'FI': 'Finland',
4329 'FR': 'France',
4330 'GF': 'French Guiana',
4331 'PF': 'French Polynesia',
4332 'TF': 'French Southern Territories',
4333 'GA': 'Gabon',
4334 'GM': 'Gambia',
4335 'GE': 'Georgia',
4336 'DE': 'Germany',
4337 'GH': 'Ghana',
4338 'GI': 'Gibraltar',
4339 'GR': 'Greece',
4340 'GL': 'Greenland',
4341 'GD': 'Grenada',
4342 'GP': 'Guadeloupe',
4343 'GU': 'Guam',
4344 'GT': 'Guatemala',
4345 'GG': 'Guernsey',
4346 'GN': 'Guinea',
4347 'GW': 'Guinea-Bissau',
4348 'GY': 'Guyana',
4349 'HT': 'Haiti',
4350 'HM': 'Heard Island and McDonald Islands',
4351 'VA': 'Holy See (Vatican City State)',
4352 'HN': 'Honduras',
4353 'HK': 'Hong Kong',
4354 'HU': 'Hungary',
4355 'IS': 'Iceland',
4356 'IN': 'India',
4357 'ID': 'Indonesia',
4358 'IR': 'Iran, Islamic Republic of',
4359 'IQ': 'Iraq',
4360 'IE': 'Ireland',
4361 'IM': 'Isle of Man',
4362 'IL': 'Israel',
4363 'IT': 'Italy',
4364 'JM': 'Jamaica',
4365 'JP': 'Japan',
4366 'JE': 'Jersey',
4367 'JO': 'Jordan',
4368 'KZ': 'Kazakhstan',
4369 'KE': 'Kenya',
4370 'KI': 'Kiribati',
4371 'KP': 'Korea, Democratic People\'s Republic of',
4372 'KR': 'Korea, Republic of',
4373 'KW': 'Kuwait',
4374 'KG': 'Kyrgyzstan',
4375 'LA': 'Lao People\'s Democratic Republic',
4376 'LV': 'Latvia',
4377 'LB': 'Lebanon',
4378 'LS': 'Lesotho',
4379 'LR': 'Liberia',
4380 'LY': 'Libya',
4381 'LI': 'Liechtenstein',
4382 'LT': 'Lithuania',
4383 'LU': 'Luxembourg',
4384 'MO': 'Macao',
4385 'MK': 'Macedonia, the Former Yugoslav Republic of',
4386 'MG': 'Madagascar',
4387 'MW': 'Malawi',
4388 'MY': 'Malaysia',
4389 'MV': 'Maldives',
4390 'ML': 'Mali',
4391 'MT': 'Malta',
4392 'MH': 'Marshall Islands',
4393 'MQ': 'Martinique',
4394 'MR': 'Mauritania',
4395 'MU': 'Mauritius',
4396 'YT': 'Mayotte',
4397 'MX': 'Mexico',
4398 'FM': 'Micronesia, Federated States of',
4399 'MD': 'Moldova, Republic of',
4400 'MC': 'Monaco',
4401 'MN': 'Mongolia',
4402 'ME': 'Montenegro',
4403 'MS': 'Montserrat',
4404 'MA': 'Morocco',
4405 'MZ': 'Mozambique',
4406 'MM': 'Myanmar',
4407 'NA': 'Namibia',
4408 'NR': 'Nauru',
4409 'NP': 'Nepal',
4410 'NL': 'Netherlands',
4411 'NC': 'New Caledonia',
4412 'NZ': 'New Zealand',
4413 'NI': 'Nicaragua',
4414 'NE': 'Niger',
4415 'NG': 'Nigeria',
4416 'NU': 'Niue',
4417 'NF': 'Norfolk Island',
4418 'MP': 'Northern Mariana Islands',
4419 'NO': 'Norway',
4420 'OM': 'Oman',
4421 'PK': 'Pakistan',
4422 'PW': 'Palau',
4423 'PS': 'Palestine, State of',
4424 'PA': 'Panama',
4425 'PG': 'Papua New Guinea',
4426 'PY': 'Paraguay',
4427 'PE': 'Peru',
4428 'PH': 'Philippines',
4429 'PN': 'Pitcairn',
4430 'PL': 'Poland',
4431 'PT': 'Portugal',
4432 'PR': 'Puerto Rico',
4433 'QA': 'Qatar',
4434 'RE': 'Réunion',
4435 'RO': 'Romania',
4436 'RU': 'Russian Federation',
4437 'RW': 'Rwanda',
4438 'BL': 'Saint Barthélemy',
4439 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4440 'KN': 'Saint Kitts and Nevis',
4441 'LC': 'Saint Lucia',
4442 'MF': 'Saint Martin (French part)',
4443 'PM': 'Saint Pierre and Miquelon',
4444 'VC': 'Saint Vincent and the Grenadines',
4445 'WS': 'Samoa',
4446 'SM': 'San Marino',
4447 'ST': 'Sao Tome and Principe',
4448 'SA': 'Saudi Arabia',
4449 'SN': 'Senegal',
4450 'RS': 'Serbia',
4451 'SC': 'Seychelles',
4452 'SL': 'Sierra Leone',
4453 'SG': 'Singapore',
4454 'SX': 'Sint Maarten (Dutch part)',
4455 'SK': 'Slovakia',
4456 'SI': 'Slovenia',
4457 'SB': 'Solomon Islands',
4458 'SO': 'Somalia',
4459 'ZA': 'South Africa',
4460 'GS': 'South Georgia and the South Sandwich Islands',
4461 'SS': 'South Sudan',
4462 'ES': 'Spain',
4463 'LK': 'Sri Lanka',
4464 'SD': 'Sudan',
4465 'SR': 'Suriname',
4466 'SJ': 'Svalbard and Jan Mayen',
4467 'SZ': 'Swaziland',
4468 'SE': 'Sweden',
4469 'CH': 'Switzerland',
4470 'SY': 'Syrian Arab Republic',
4471 'TW': 'Taiwan, Province of China',
4472 'TJ': 'Tajikistan',
4473 'TZ': 'Tanzania, United Republic of',
4474 'TH': 'Thailand',
4475 'TL': 'Timor-Leste',
4476 'TG': 'Togo',
4477 'TK': 'Tokelau',
4478 'TO': 'Tonga',
4479 'TT': 'Trinidad and Tobago',
4480 'TN': 'Tunisia',
4481 'TR': 'Turkey',
4482 'TM': 'Turkmenistan',
4483 'TC': 'Turks and Caicos Islands',
4484 'TV': 'Tuvalu',
4485 'UG': 'Uganda',
4486 'UA': 'Ukraine',
4487 'AE': 'United Arab Emirates',
4488 'GB': 'United Kingdom',
4489 'US': 'United States',
4490 'UM': 'United States Minor Outlying Islands',
4491 'UY': 'Uruguay',
4492 'UZ': 'Uzbekistan',
4493 'VU': 'Vanuatu',
4494 'VE': 'Venezuela, Bolivarian Republic of',
4495 'VN': 'Viet Nam',
4496 'VG': 'Virgin Islands, British',
4497 'VI': 'Virgin Islands, U.S.',
4498 'WF': 'Wallis and Futuna',
4499 'EH': 'Western Sahara',
4500 'YE': 'Yemen',
4501 'ZM': 'Zambia',
4502 'ZW': 'Zimbabwe',
2f97cc61 4503 # Not ISO 3166 codes, but used for IP blocks
4504 'AP': 'Asia/Pacific Region',
4505 'EU': 'Europe',
4eb10f66
YCH
4506 }
4507
4508 @classmethod
4509 def short2full(cls, code):
4510 """Convert an ISO 3166-2 country code to the corresponding full name"""
4511 return cls._country_map.get(code.upper())
4512
4513
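# Illustrative sketch (not part of the original source): the two lookup classes
# above convert between short and long language codes and expand country codes.
def _example_iso_utils():
    assert ISO639Utils.short2long('en') == 'eng'
    assert ISO639Utils.long2short('deu') == 'de'
    assert ISO3166Utils.short2full('NL') == 'Netherlands'
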
86e5f3ed 4514class GeoUtils:
773f291d
S
4515 # Major IPv4 address blocks per country
4516 _country_ip_map = {
53896ca5 4517 'AD': '46.172.224.0/19',
773f291d
S
4518 'AE': '94.200.0.0/13',
4519 'AF': '149.54.0.0/17',
4520 'AG': '209.59.64.0/18',
4521 'AI': '204.14.248.0/21',
4522 'AL': '46.99.0.0/16',
4523 'AM': '46.70.0.0/15',
4524 'AO': '105.168.0.0/13',
53896ca5
S
4525 'AP': '182.50.184.0/21',
4526 'AQ': '23.154.160.0/24',
773f291d
S
4527 'AR': '181.0.0.0/12',
4528 'AS': '202.70.112.0/20',
53896ca5 4529 'AT': '77.116.0.0/14',
773f291d
S
4530 'AU': '1.128.0.0/11',
4531 'AW': '181.41.0.0/18',
53896ca5
S
4532 'AX': '185.217.4.0/22',
4533 'AZ': '5.197.0.0/16',
773f291d
S
4534 'BA': '31.176.128.0/17',
4535 'BB': '65.48.128.0/17',
4536 'BD': '114.130.0.0/16',
4537 'BE': '57.0.0.0/8',
53896ca5 4538 'BF': '102.178.0.0/15',
773f291d
S
4539 'BG': '95.42.0.0/15',
4540 'BH': '37.131.0.0/17',
4541 'BI': '154.117.192.0/18',
4542 'BJ': '137.255.0.0/16',
53896ca5 4543 'BL': '185.212.72.0/23',
773f291d
S
4544 'BM': '196.12.64.0/18',
4545 'BN': '156.31.0.0/16',
4546 'BO': '161.56.0.0/16',
4547 'BQ': '161.0.80.0/20',
53896ca5 4548 'BR': '191.128.0.0/12',
773f291d
S
4549 'BS': '24.51.64.0/18',
4550 'BT': '119.2.96.0/19',
4551 'BW': '168.167.0.0/16',
4552 'BY': '178.120.0.0/13',
4553 'BZ': '179.42.192.0/18',
4554 'CA': '99.224.0.0/11',
4555 'CD': '41.243.0.0/16',
53896ca5
S
4556 'CF': '197.242.176.0/21',
4557 'CG': '160.113.0.0/16',
773f291d 4558 'CH': '85.0.0.0/13',
53896ca5 4559 'CI': '102.136.0.0/14',
773f291d
S
4560 'CK': '202.65.32.0/19',
4561 'CL': '152.172.0.0/14',
53896ca5 4562 'CM': '102.244.0.0/14',
773f291d
S
4563 'CN': '36.128.0.0/10',
4564 'CO': '181.240.0.0/12',
4565 'CR': '201.192.0.0/12',
4566 'CU': '152.206.0.0/15',
4567 'CV': '165.90.96.0/19',
4568 'CW': '190.88.128.0/17',
53896ca5 4569 'CY': '31.153.0.0/16',
773f291d
S
4570 'CZ': '88.100.0.0/14',
4571 'DE': '53.0.0.0/8',
4572 'DJ': '197.241.0.0/17',
4573 'DK': '87.48.0.0/12',
4574 'DM': '192.243.48.0/20',
4575 'DO': '152.166.0.0/15',
4576 'DZ': '41.96.0.0/12',
4577 'EC': '186.68.0.0/15',
4578 'EE': '90.190.0.0/15',
4579 'EG': '156.160.0.0/11',
4580 'ER': '196.200.96.0/20',
4581 'ES': '88.0.0.0/11',
4582 'ET': '196.188.0.0/14',
4583 'EU': '2.16.0.0/13',
4584 'FI': '91.152.0.0/13',
4585 'FJ': '144.120.0.0/16',
53896ca5 4586 'FK': '80.73.208.0/21',
773f291d
S
4587 'FM': '119.252.112.0/20',
4588 'FO': '88.85.32.0/19',
4589 'FR': '90.0.0.0/9',
4590 'GA': '41.158.0.0/15',
4591 'GB': '25.0.0.0/8',
4592 'GD': '74.122.88.0/21',
4593 'GE': '31.146.0.0/16',
4594 'GF': '161.22.64.0/18',
4595 'GG': '62.68.160.0/19',
53896ca5
S
4596 'GH': '154.160.0.0/12',
4597 'GI': '95.164.0.0/16',
773f291d
S
4598 'GL': '88.83.0.0/19',
4599 'GM': '160.182.0.0/15',
4600 'GN': '197.149.192.0/18',
4601 'GP': '104.250.0.0/19',
4602 'GQ': '105.235.224.0/20',
4603 'GR': '94.64.0.0/13',
4604 'GT': '168.234.0.0/16',
4605 'GU': '168.123.0.0/16',
4606 'GW': '197.214.80.0/20',
4607 'GY': '181.41.64.0/18',
4608 'HK': '113.252.0.0/14',
4609 'HN': '181.210.0.0/16',
4610 'HR': '93.136.0.0/13',
4611 'HT': '148.102.128.0/17',
4612 'HU': '84.0.0.0/14',
4613 'ID': '39.192.0.0/10',
4614 'IE': '87.32.0.0/12',
4615 'IL': '79.176.0.0/13',
4616 'IM': '5.62.80.0/20',
4617 'IN': '117.192.0.0/10',
4618 'IO': '203.83.48.0/21',
4619 'IQ': '37.236.0.0/14',
4620 'IR': '2.176.0.0/12',
4621 'IS': '82.221.0.0/16',
4622 'IT': '79.0.0.0/10',
4623 'JE': '87.244.64.0/18',
4624 'JM': '72.27.0.0/17',
4625 'JO': '176.29.0.0/16',
53896ca5 4626 'JP': '133.0.0.0/8',
773f291d
S
4627 'KE': '105.48.0.0/12',
4628 'KG': '158.181.128.0/17',
4629 'KH': '36.37.128.0/17',
4630 'KI': '103.25.140.0/22',
4631 'KM': '197.255.224.0/20',
53896ca5 4632 'KN': '198.167.192.0/19',
773f291d
S
4633 'KP': '175.45.176.0/22',
4634 'KR': '175.192.0.0/10',
4635 'KW': '37.36.0.0/14',
4636 'KY': '64.96.0.0/15',
4637 'KZ': '2.72.0.0/13',
4638 'LA': '115.84.64.0/18',
4639 'LB': '178.135.0.0/16',
53896ca5 4640 'LC': '24.92.144.0/20',
773f291d
S
4641 'LI': '82.117.0.0/19',
4642 'LK': '112.134.0.0/15',
53896ca5 4643 'LR': '102.183.0.0/16',
773f291d
S
4644 'LS': '129.232.0.0/17',
4645 'LT': '78.56.0.0/13',
4646 'LU': '188.42.0.0/16',
4647 'LV': '46.109.0.0/16',
4648 'LY': '41.252.0.0/14',
4649 'MA': '105.128.0.0/11',
4650 'MC': '88.209.64.0/18',
4651 'MD': '37.246.0.0/16',
4652 'ME': '178.175.0.0/17',
4653 'MF': '74.112.232.0/21',
4654 'MG': '154.126.0.0/17',
4655 'MH': '117.103.88.0/21',
4656 'MK': '77.28.0.0/15',
4657 'ML': '154.118.128.0/18',
4658 'MM': '37.111.0.0/17',
4659 'MN': '49.0.128.0/17',
4660 'MO': '60.246.0.0/16',
4661 'MP': '202.88.64.0/20',
4662 'MQ': '109.203.224.0/19',
4663 'MR': '41.188.64.0/18',
4664 'MS': '208.90.112.0/22',
4665 'MT': '46.11.0.0/16',
4666 'MU': '105.16.0.0/12',
4667 'MV': '27.114.128.0/18',
53896ca5 4668 'MW': '102.70.0.0/15',
773f291d
S
4669 'MX': '187.192.0.0/11',
4670 'MY': '175.136.0.0/13',
4671 'MZ': '197.218.0.0/15',
4672 'NA': '41.182.0.0/16',
4673 'NC': '101.101.0.0/18',
4674 'NE': '197.214.0.0/18',
4675 'NF': '203.17.240.0/22',
4676 'NG': '105.112.0.0/12',
4677 'NI': '186.76.0.0/15',
4678 'NL': '145.96.0.0/11',
4679 'NO': '84.208.0.0/13',
4680 'NP': '36.252.0.0/15',
4681 'NR': '203.98.224.0/19',
4682 'NU': '49.156.48.0/22',
4683 'NZ': '49.224.0.0/14',
4684 'OM': '5.36.0.0/15',
4685 'PA': '186.72.0.0/15',
4686 'PE': '186.160.0.0/14',
4687 'PF': '123.50.64.0/18',
4688 'PG': '124.240.192.0/19',
4689 'PH': '49.144.0.0/13',
4690 'PK': '39.32.0.0/11',
4691 'PL': '83.0.0.0/11',
4692 'PM': '70.36.0.0/20',
4693 'PR': '66.50.0.0/16',
4694 'PS': '188.161.0.0/16',
4695 'PT': '85.240.0.0/13',
4696 'PW': '202.124.224.0/20',
4697 'PY': '181.120.0.0/14',
4698 'QA': '37.210.0.0/15',
53896ca5 4699 'RE': '102.35.0.0/16',
773f291d 4700 'RO': '79.112.0.0/13',
53896ca5 4701 'RS': '93.86.0.0/15',
773f291d 4702 'RU': '5.136.0.0/13',
53896ca5 4703 'RW': '41.186.0.0/16',
773f291d
S
4704 'SA': '188.48.0.0/13',
4705 'SB': '202.1.160.0/19',
4706 'SC': '154.192.0.0/11',
53896ca5 4707 'SD': '102.120.0.0/13',
773f291d 4708 'SE': '78.64.0.0/12',
53896ca5 4709 'SG': '8.128.0.0/10',
773f291d
S
4710 'SI': '188.196.0.0/14',
4711 'SK': '78.98.0.0/15',
53896ca5 4712 'SL': '102.143.0.0/17',
773f291d
S
4713 'SM': '89.186.32.0/19',
4714 'SN': '41.82.0.0/15',
53896ca5 4715 'SO': '154.115.192.0/18',
773f291d
S
4716 'SR': '186.179.128.0/17',
4717 'SS': '105.235.208.0/21',
4718 'ST': '197.159.160.0/19',
4719 'SV': '168.243.0.0/16',
4720 'SX': '190.102.0.0/20',
4721 'SY': '5.0.0.0/16',
4722 'SZ': '41.84.224.0/19',
4723 'TC': '65.255.48.0/20',
4724 'TD': '154.68.128.0/19',
4725 'TG': '196.168.0.0/14',
4726 'TH': '171.96.0.0/13',
4727 'TJ': '85.9.128.0/18',
4728 'TK': '27.96.24.0/21',
4729 'TL': '180.189.160.0/20',
4730 'TM': '95.85.96.0/19',
4731 'TN': '197.0.0.0/11',
4732 'TO': '175.176.144.0/21',
4733 'TR': '78.160.0.0/11',
4734 'TT': '186.44.0.0/15',
4735 'TV': '202.2.96.0/19',
4736 'TW': '120.96.0.0/11',
4737 'TZ': '156.156.0.0/14',
53896ca5
S
4738 'UA': '37.52.0.0/14',
4739 'UG': '102.80.0.0/13',
4740 'US': '6.0.0.0/8',
773f291d 4741 'UY': '167.56.0.0/13',
53896ca5 4742 'UZ': '84.54.64.0/18',
773f291d 4743 'VA': '212.77.0.0/19',
53896ca5 4744 'VC': '207.191.240.0/21',
773f291d 4745 'VE': '186.88.0.0/13',
53896ca5 4746 'VG': '66.81.192.0/20',
773f291d
S
4747 'VI': '146.226.0.0/16',
4748 'VN': '14.160.0.0/11',
4749 'VU': '202.80.32.0/20',
4750 'WF': '117.20.32.0/21',
4751 'WS': '202.4.32.0/19',
4752 'YE': '134.35.0.0/16',
4753 'YT': '41.242.116.0/22',
4754 'ZA': '41.0.0.0/11',
53896ca5
S
4755 'ZM': '102.144.0.0/13',
4756 'ZW': '102.177.192.0/18',
773f291d
S
4757 }
4758
4759 @classmethod
5f95927a
S
4760 def random_ipv4(cls, code_or_block):
4761 if len(code_or_block) == 2:
4762 block = cls._country_ip_map.get(code_or_block.upper())
4763 if not block:
4764 return None
4765 else:
4766 block = code_or_block
773f291d 4767 addr, preflen = block.split('/')
ac668111 4768 addr_min = struct.unpack('!L', socket.inet_aton(addr))[0]
773f291d 4769 addr_max = addr_min | (0xffffffff >> int(preflen))
14f25df2 4770 return str(socket.inet_ntoa(
ac668111 4771 struct.pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
4772
4773
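# Illustrative sketch (not part of the original source): GeoUtils.random_ipv4
# picks a random address either from a country's major IPv4 block or from an
# explicitly given CIDR block. Hypothetical example values.
def _example_geo_utils():
    assert GeoUtils.random_ipv4('US').startswith('6.')  # the US entry is 6.0.0.0/8
    assert GeoUtils.random_ipv4('192.0.2.0/24').startswith('192.0.2.')
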
ac668111 4774class PerRequestProxyHandler(urllib.request.ProxyHandler):
2461f79d
PH
4775 def __init__(self, proxies=None):
4776 # Set default handlers
4777 for type in ('http', 'https'):
4778 setattr(self, '%s_open' % type,
4779 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4780 meth(r, proxy, type))
ac668111 4781 urllib.request.ProxyHandler.__init__(self, proxies)
2461f79d 4782
91410c9b 4783 def proxy_open(self, req, proxy, type):
2461f79d 4784 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
4785 if req_proxy is not None:
4786 proxy = req_proxy
2461f79d
PH
4787 del req.headers['Ytdl-request-proxy']
4788
4789 if proxy == '__noproxy__':
4790 return None # No Proxy
14f25df2 4791 if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188 4792 req.add_header('Ytdl-socks-proxy', proxy)
7a5c1cfe 4793 # yt-dlp's http/https handlers do the wrapping of the socket with SOCKS
71aff188 4794 return None
ac668111 4795 return urllib.request.ProxyHandler.proxy_open(
91410c9b 4796 self, req, proxy, type)
5bc880b9
YCH
4797
4798
0a5445dd
YCH
4799# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4800# released into Public Domain
4801# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4802
4803def long_to_bytes(n, blocksize=0):
4804 """long_to_bytes(n:long, blocksize:int) : string
4805 Convert a long integer to a byte string.
4806
4807 If optional blocksize is given and greater than zero, pad the front of the
4808 byte string with binary zeros so that the length is a multiple of
4809 blocksize.
4810 """
4811 # after much testing, this algorithm was deemed to be the fastest
4812 s = b''
4813 n = int(n)
4814 while n > 0:
ac668111 4815 s = struct.pack('>I', n & 0xffffffff) + s
0a5445dd
YCH
4816 n = n >> 32
4817 # strip off leading zeros
4818 for i in range(len(s)):
4819 if s[i] != b'\000'[0]:
4820 break
4821 else:
4822 # only happens when n == 0
4823 s = b'\000'
4824 i = 0
4825 s = s[i:]
4826 # add back some pad bytes. this could be done more efficiently w.r.t. the
4827 # de-padding being done above, but sigh...
4828 if blocksize > 0 and len(s) % blocksize:
4829 s = (blocksize - len(s) % blocksize) * b'\000' + s
4830 return s
4831
4832
4833def bytes_to_long(s):
4834 """bytes_to_long(string) : long
4835 Convert a byte string to a long integer.
4836
4837 This is (essentially) the inverse of long_to_bytes().
4838 """
4839 acc = 0
4840 length = len(s)
4841 if length % 4:
4842 extra = (4 - length % 4)
4843 s = b'\000' * extra + s
4844 length = length + extra
4845 for i in range(0, length, 4):
ac668111 4846 acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0]
0a5445dd
YCH
4847 return acc
4848
4849
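# Illustrative sketch (not part of the original source): the two PyCrypto-derived
# helpers above convert between big-endian byte strings and Python integers.
def _example_long_bytes_roundtrip():
    assert bytes_to_long(b'\x01\x00') == 256
    assert long_to_bytes(256) == b'\x01\x00'
    assert long_to_bytes(1, blocksize=4) == b'\x00\x00\x00\x01'
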
5bc880b9
YCH
4850def ohdave_rsa_encrypt(data, exponent, modulus):
4851 '''
4852 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4853
4854 Input:
4855 data: data to encrypt, bytes-like object
4856 exponent, modulus: parameter e and N of RSA algorithm, both integer
4857 Output: hex string of encrypted data
4858
4859 Limitation: supports one block encryption only
4860 '''
4861
4862 payload = int(binascii.hexlify(data[::-1]), 16)
4863 encrypted = pow(payload, exponent, modulus)
4864 return '%x' % encrypted
81bdc8fd
YCH
4865
4866
f48409c7
YCH
4867def pkcs1pad(data, length):
4868 """
4869 Padding input data with PKCS#1 scheme
4870
4871 @param {int[]} data input data
4872 @param {int} length target length
4873 @returns {int[]} padded data
4874 """
4875 if len(data) > length - 11:
4876 raise ValueError('Input data too long for PKCS#1 padding')
4877
4878 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4879 return [0, 2] + pseudo_random + [0] + data
4880
4881
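# Illustrative sketch (not part of the original source): pkcs1pad prepends the
# PKCS#1 v1.5 header (0x00 0x02, random filler bytes, 0x00) to a list of byte
# values so that the padded block reaches the requested length.
def _example_pkcs1pad():
    padded = pkcs1pad([1, 2, 3], 16)
    assert len(padded) == 16 and padded[:2] == [0, 2] and padded[-4:] == [0, 1, 2, 3]
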
7b2c3f47 4882def _base_n_table(n, table):
4883 if not table and not n:
4884 raise ValueError('Either table or n must be specified')
612f2be5 4885 table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n]
4886
44f14eb4 4887 if n and n != len(table):
612f2be5 4888 raise ValueError(f'base {n} exceeds table length {len(table)}')
4889 return table
59f898b7 4890
5eb6bdce 4891
7b2c3f47 4892def encode_base_n(num, n=None, table=None):
4893 """Convert given int to a base-n string"""
612f2be5 4894 table = _base_n_table(n, table)
7b2c3f47 4895 if not num:
5eb6bdce
YCH
4896 return table[0]
4897
7b2c3f47 4898 result, base = '', len(table)
81bdc8fd 4899 while num:
7b2c3f47 4900 result = table[num % base] + result
612f2be5 4901 num = num // base
7b2c3f47 4902 return result
4903
4904
4905def decode_base_n(string, n=None, table=None):
4906 """Convert given base-n string to int"""
4907 table = {char: index for index, char in enumerate(_base_n_table(n, table))}
4908 result, base = 0, len(table)
4909 for char in string:
4910 result = result * base + table[char]
4911 return result
4912
4913
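# Illustrative sketch (not part of the original source): encode_base_n /
# decode_base_n convert integers to and from base-n strings, with an optional
# custom digit table.
def _example_base_n():
    assert encode_base_n(255, 16) == 'ff'
    assert decode_base_n('ff', 16) == 255
    assert encode_base_n(5, table='ab') == 'bab'  # binary with a custom alphabet
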
4914def decode_base(value, digits):
da4db748 4915 deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed '
4916 f'in a future version. Use {__name__}.decode_base_n instead')
7b2c3f47 4917 return decode_base_n(value, table=digits)
f52354a8
YCH
4918
4919
4920def decode_packed_codes(code):
06b3fe29 4921 mobj = re.search(PACKED_CODES_RE, code)
a0566bbf 4922 obfuscated_code, base, count, symbols = mobj.groups()
f52354a8
YCH
4923 base = int(base)
4924 count = int(count)
4925 symbols = symbols.split('|')
4926 symbol_table = {}
4927
4928 while count:
4929 count -= 1
5eb6bdce 4930 base_n_count = encode_base_n(count, base)
f52354a8
YCH
4931 symbol_table[base_n_count] = symbols[count] or base_n_count
4932
4933 return re.sub(
4934 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
a0566bbf 4935 obfuscated_code)
e154c651 4936
4937
1ced2221
S
4938def caesar(s, alphabet, shift):
4939 if shift == 0:
4940 return s
4941 l = len(alphabet)
4942 return ''.join(
4943 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4944 for c in s)
4945
4946
4947def rot47(s):
4948 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4949
4950
e154c651 4951def parse_m3u8_attributes(attrib):
4952 info = {}
4953 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4954 if val.startswith('"'):
4955 val = val[1:-1]
4956 info[key] = val
4957 return info
1143535d
YCH
4958
4959
4960def urshift(val, n):
4961 return val >> n if val >= 0 else (val + 0x100000000) >> n
d3f8e038
YCH
4962
4963
4964# Based on png2str() written by @gdkchan and improved by @yokrysty
067aa17e 4965# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
d3f8e038
YCH
4966def decode_png(png_data):
4967 # Reference: https://www.w3.org/TR/PNG/
4968 header = png_data[8:]
4969
4970 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
86e5f3ed 4971 raise OSError('Not a valid PNG file.')
d3f8e038
YCH
4972
4973 int_map = {1: '>B', 2: '>H', 4: '>I'}
ac668111 4974 unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0]
d3f8e038
YCH
4975
4976 chunks = []
4977
4978 while header:
4979 length = unpack_integer(header[:4])
4980 header = header[4:]
4981
4982 chunk_type = header[:4]
4983 header = header[4:]
4984
4985 chunk_data = header[:length]
4986 header = header[length:]
4987
4988 header = header[4:] # Skip CRC
4989
4990 chunks.append({
4991 'type': chunk_type,
4992 'length': length,
4993 'data': chunk_data
4994 })
4995
4996 ihdr = chunks[0]['data']
4997
4998 width = unpack_integer(ihdr[:4])
4999 height = unpack_integer(ihdr[4:8])
5000
5001 idat = b''
5002
5003 for chunk in chunks:
5004 if chunk['type'] == b'IDAT':
5005 idat += chunk['data']
5006
5007 if not idat:
86e5f3ed 5008 raise OSError('Unable to read PNG data.')
d3f8e038
YCH
5009
5010 decompressed_data = bytearray(zlib.decompress(idat))
5011
5012 stride = width * 3
5013 pixels = []
5014
5015 def _get_pixel(idx):
5016 x = idx % stride
5017 y = idx // stride
5018 return pixels[y][x]
5019
5020 for y in range(height):
5021 basePos = y * (1 + stride)
5022 filter_type = decompressed_data[basePos]
5023
5024 current_row = []
5025
5026 pixels.append(current_row)
5027
5028 for x in range(stride):
5029 color = decompressed_data[1 + basePos + x]
5030 basex = y * stride + x
5031 left = 0
5032 up = 0
5033
5034 if x > 2:
5035 left = _get_pixel(basex - 3)
5036 if y > 0:
5037 up = _get_pixel(basex - stride)
5038
5039 if filter_type == 1: # Sub
5040 color = (color + left) & 0xff
5041 elif filter_type == 2: # Up
5042 color = (color + up) & 0xff
5043 elif filter_type == 3: # Average
5044 color = (color + ((left + up) >> 1)) & 0xff
5045 elif filter_type == 4: # Paeth
5046 a = left
5047 b = up
5048 c = 0
5049
5050 if x > 2 and y > 0:
5051 c = _get_pixel(basex - stride - 3)
5052
5053 p = a + b - c
5054
5055 pa = abs(p - a)
5056 pb = abs(p - b)
5057 pc = abs(p - c)
5058
5059 if pa <= pb and pa <= pc:
5060 color = (color + a) & 0xff
5061 elif pb <= pc:
5062 color = (color + b) & 0xff
5063 else:
5064 color = (color + c) & 0xff
5065
5066 current_row.append(color)
5067
5068 return width, height, pixels
efa97bdc
YCH
5069
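# Illustrative access pattern (sketch; `png_bytes` is hypothetical): as written,
# the helper assumes 8-bit RGB data (stride is width * 3, no alpha), and `pixels`
# is a list of `height` rows, each a flat list of byte values with the filters undone.
#   width, height, pixels = decode_png(png_bytes)
#   r, g, b = pixels[0][0:3]  # top-left pixel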
5070
5071def write_xattr(path, key, value):
6f7563be 5072 # Windows: Write xattrs to NTFS Alternate Data Streams:
5073 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
5074 if compat_os_name == 'nt':
5075 assert ':' not in key
5076 assert os.path.exists(path)
efa97bdc
YCH
5077
5078 try:
6f7563be 5079 with open(f'{path}:{key}', 'wb') as f:
5080 f.write(value)
86e5f3ed 5081 except OSError as e:
efa97bdc 5082 raise XAttrMetadataError(e.errno, e.strerror)
6f7563be 5083 return
efa97bdc 5084
6f7563be 5085 # UNIX Method 1. Use xattrs/pyxattrs modules
efa97bdc 5086
6f7563be 5087 setxattr = None
5088 if getattr(xattr, '_yt_dlp__identifier', None) == 'pyxattr':
5089 # Unicode arguments are not supported in pyxattr until version 0.5.0
5090 # See https://github.com/ytdl-org/youtube-dl/issues/5498
5091 if version_tuple(xattr.__version__) >= (0, 5, 0):
5092 setxattr = xattr.set
5093 elif xattr:
5094 setxattr = xattr.setxattr
efa97bdc 5095
6f7563be 5096 if setxattr:
5097 try:
5098 setxattr(path, key, value)
5099 except OSError as e:
5100 raise XAttrMetadataError(e.errno, e.strerror)
5101 return
efa97bdc 5102
6f7563be 5103 # UNIX Method 2. Use setfattr/xattr executables
5104 exe = ('setfattr' if check_executable('setfattr', ['--version'])
5105 else 'xattr' if check_executable('xattr', ['-h']) else None)
5106 if not exe:
5107 raise XAttrUnavailableError(
5108 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the '
5109 + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)'))
efa97bdc 5110
0f06bcd7 5111 value = value.decode()
6f7563be 5112 try:
f0c9fb96 5113 _, stderr, returncode = Popen.run(
6f7563be 5114 [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path],
e121e3ce 5115 text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
6f7563be 5116 except OSError as e:
5117 raise XAttrMetadataError(e.errno, e.strerror)
f0c9fb96 5118 if returncode:
5119 raise XAttrMetadataError(returncode, stderr)
0c265486
YCH
5120
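# Illustrative call of write_xattr (sketch; the path is hypothetical and the key
# follows the common freedesktop 'user.' namespace convention). The value must be bytes.
#   write_xattr('video.mp4', 'user.xdg.referrer.url', b'https://example.com/watch')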
5121
5122def random_birthday(year_field, month_field, day_field):
aa374bc7
AS
5123 start_date = datetime.date(1950, 1, 1)
5124 end_date = datetime.date(1995, 12, 31)
5125 offset = random.randint(0, (end_date - start_date).days)
5126 random_date = start_date + datetime.timedelta(offset)
0c265486 5127 return {
aa374bc7
AS
5128 year_field: str(random_date.year),
5129 month_field: str(random_date.month),
5130 day_field: str(random_date.day),
0c265486 5131 }
732044af 5132
c76eb41b 5133
732044af 5134# Templates for internet shortcut files, which are plain text files.
e5a998f3 5135DOT_URL_LINK_TEMPLATE = '''\
732044af 5136[InternetShortcut]
5137URL=%(url)s
e5a998f3 5138'''
732044af 5139
e5a998f3 5140DOT_WEBLOC_LINK_TEMPLATE = '''\
732044af 5141<?xml version="1.0" encoding="UTF-8"?>
5142<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
5143<plist version="1.0">
5144<dict>
5145\t<key>URL</key>
5146\t<string>%(url)s</string>
5147</dict>
5148</plist>
e5a998f3 5149'''
732044af 5150
e5a998f3 5151DOT_DESKTOP_LINK_TEMPLATE = '''\
732044af 5152[Desktop Entry]
5153Encoding=UTF-8
5154Name=%(filename)s
5155Type=Link
5156URL=%(url)s
5157Icon=text-html
e5a998f3 5158'''
732044af 5159
08438d2c 5160LINK_TEMPLATES = {
5161 'url': DOT_URL_LINK_TEMPLATE,
5162 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
5163 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
5164}
5165
732044af 5166
5167def iri_to_uri(iri):
5168 """
5169 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
5170
5171 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-encodes (as UTF-8) only the characters that are not already escaped, leaving existing escape sequences intact.
5172 """
5173
14f25df2 5174 iri_parts = urllib.parse.urlparse(iri)
732044af 5175
5176 if '[' in iri_parts.netloc:
5177 raise ValueError('IPv6 URIs are not yet supported.')
5178 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
5179
5180 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5181
5182 net_location = ''
5183 if iri_parts.username:
f9934b96 5184 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
732044af 5185 if iri_parts.password is not None:
f9934b96 5186 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
732044af 5187 net_location += '@'
5188
0f06bcd7 5189 net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames.
732044af 5190 # The 'idna' encoding produces ASCII text.
5191 if iri_parts.port is not None and iri_parts.port != 80:
5192 net_location += ':' + str(iri_parts.port)
5193
f9934b96 5194 return urllib.parse.urlunparse(
732044af 5195 (iri_parts.scheme,
5196 net_location,
5197
f9934b96 5198 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5199
5200 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
f9934b96 5201 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
732044af 5202
5203 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
f9934b96 5204 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
732044af 5205
f9934b96 5206 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
732044af 5207
5208 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5209
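# Illustrative sketch: non-ASCII characters are percent-encoded as UTF-8 and the
# hostname would be punycoded, while existing %XX escapes are left untouched.
#   >>> iri_to_uri('http://example.com/résumé')
#   'http://example.com/r%C3%A9sum%C3%A9'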
5210
5211def to_high_limit_path(path):
5212 if sys.platform in ['win32', 'cygwin']:
5213 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
e5a998f3 5214 return '\\\\?\\' + os.path.abspath(path)
732044af 5215
5216 return path
76d321f6 5217
c76eb41b 5218
7b2c3f47 5219def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY):
e0ddbd02 5220 val = traverse_obj(obj, *variadic(field))
7b2c3f47 5221 if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore):
e0ddbd02 5222 return default
7b2c3f47 5223 return template % func(val)
00dd0cd5 5224
5225
5226def clean_podcast_url(url):
5227 return re.sub(r'''(?x)
5228 (?:
5229 (?:
5230 chtbl\.com/track|
5231 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5232 play\.podtrac\.com
5233 )/[^/]+|
5234 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5235 flex\.acast\.com|
5236 pd(?:
5237 cn\.co| # https://podcorn.com/analytics-prefix/
5238 st\.fm # https://podsights.com/docs/
5239 )/e
5240 )/''', '', url)
ffcb8191
THD
5241
5242
5243_HEX_TABLE = '0123456789abcdef'
5244
5245
5246def random_uuidv4():
5247 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 5248
5249
5250def make_dir(path, to_screen=None):
5251 try:
5252 dn = os.path.dirname(path)
5253 if dn and not os.path.exists(dn):
5254 os.makedirs(dn)
5255 return True
86e5f3ed 5256 except OSError as err:
0202b52a 5257 if callable(to_screen):
5258 to_screen('unable to create directory ' + error_to_compat_str(err))
5259 return False
f74980cb 5260
5261
5262def get_executable_path():
b5899f4f 5263 from .update import _get_variant_and_executable_path
c487cf00 5264
b5899f4f 5265 return os.path.dirname(os.path.abspath(_get_variant_and_executable_path()[1]))
f74980cb 5266
5267
2f567473 5268def load_plugins(name, suffix, namespace):
3ae5e797 5269 classes = {}
19a03940 5270 with contextlib.suppress(FileNotFoundError):
019a94f7
ÁS
5271 plugins_spec = importlib.util.spec_from_file_location(
5272 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5273 plugins = importlib.util.module_from_spec(plugins_spec)
5274 sys.modules[plugins_spec.name] = plugins
5275 plugins_spec.loader.exec_module(plugins)
f74980cb 5276 for name in dir(plugins):
2f567473 5277 if name in namespace:
5278 continue
5279 if not name.endswith(suffix):
f74980cb 5280 continue
5281 klass = getattr(plugins, name)
3ae5e797 5282 classes[name] = namespace[name] = klass
f74980cb 5283 return classes
06167fbb 5284
5285
325ebc17 5286def traverse_obj(
352d63fd 5287 obj, *path_list, default=None, expected_type=None, get_all=True,
325ebc17 5288 casesense=True, is_user_input=False, traverse_string=False):
324ad820 5289 ''' Traverse nested list/dict/tuple
8f334380 5290 @param path_list A list of paths which are checked one by one.
19a03940 5291 Each path is a list of keys where each key is a:
5292 - None: Do nothing
07a1250e 5293 - string: A dictionary key / regex group
19a03940 5294 - int: An index into a list
5295 - tuple: A list of keys all of which will be traversed
5296 - Ellipsis: Fetch all values in the object
5297 - Function: Takes the key and value as arguments
5298 and returns whether the key matches or not
325ebc17 5299 @param default Default value to return
352d63fd 5300 @param expected_type Only accept final value of this type (Can also be any callable)
5301 @param get_all Return all the values obtained from a path or only the first one
324ad820 5302 @param casesense Whether to consider dictionary keys as case sensitive
07a1250e 5303
5304 The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API
5305
5306 @param path_list In addition to the above,
5307 - dict: Given {k:v, ...}; return {k: traverse_obj(obj, v), ...}
324ad820 5308 @param is_user_input Whether the keys are generated from user input. If True,
5309 strings are converted to int/slice if necessary
5310 @param traverse_string Whether to traverse inside strings. If True, any
5311 non-compatible object will also be converted into a string
07a1250e 5312 ''' # TODO: Write tests
325ebc17 5313 if not casesense:
dbf5416a 5314 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
8f334380 5315 path_list = (map(_lower, variadic(path)) for path in path_list)
5316
5317 def _traverse_obj(obj, path, _current_depth=0):
5318 nonlocal depth
5319 path = tuple(variadic(path))
5320 for i, key in enumerate(path):
1797b073 5321 if None in (key, obj):
5322 return obj
8f334380 5323 if isinstance(key, (list, tuple)):
5324 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5325 key = ...
07a1250e 5326
8f334380 5327 if key is ...:
5328 obj = (obj.values() if isinstance(obj, dict)
5329 else obj if isinstance(obj, (list, tuple, LazyList))
5330 else str(obj) if traverse_string else [])
5331 _current_depth += 1
5332 depth = max(depth, _current_depth)
5333 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
07a1250e 5334 elif isinstance(key, dict):
5335 obj = filter_dict({k: _traverse_obj(obj, v, _current_depth) for k, v in key.items()})
2614f646 5336 elif callable(key):
5337 if isinstance(obj, (list, tuple, LazyList)):
5338 obj = enumerate(obj)
5339 elif isinstance(obj, dict):
5340 obj = obj.items()
5341 else:
5342 if not traverse_string:
5343 return None
5344 obj = str(obj)
5345 _current_depth += 1
5346 depth = max(depth, _current_depth)
e6f868a6 5347 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
575e17a1 5348 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
325ebc17 5349 obj = (obj.get(key) if casesense or (key in obj)
5350 else next((v for k, v in obj.items() if _lower(k) == key), None))
5351 else:
5352 if is_user_input:
5353 key = (int_or_none(key) if ':' not in key
5354 else slice(*map(int_or_none, key.split(':'))))
8f334380 5355 if key == slice(None):
575e17a1 5356 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
325ebc17 5357 if not isinstance(key, (int, slice)):
9fea350f 5358 return None
8f334380 5359 if not isinstance(obj, (list, tuple, LazyList)):
325ebc17 5360 if not traverse_string:
5361 return None
5362 obj = str(obj)
5363 try:
5364 obj = obj[key]
5365 except IndexError:
324ad820 5366 return None
325ebc17 5367 return obj
5368
352d63fd 5369 if isinstance(expected_type, type):
5370 type_test = lambda val: val if isinstance(val, expected_type) else None
352d63fd 5371 else:
7b2c3f47 5372 type_test = expected_type or IDENTITY
352d63fd 5373
8f334380 5374 for path in path_list:
5375 depth = 0
5376 val = _traverse_obj(obj, path)
325ebc17 5377 if val is not None:
8f334380 5378 if depth:
5379 for _ in range(depth - 1):
6586bca9 5380 val = itertools.chain.from_iterable(v for v in val if v is not None)
352d63fd 5381 val = [v for v in map(type_test, val) if v is not None]
8f334380 5382 if val:
352d63fd 5383 return val if get_all else val[0]
5384 else:
5385 val = type_test(val)
5386 if val is not None:
8f334380 5387 return val
325ebc17 5388 return default
324ad820 5389
5390
5391def traverse_dict(dictn, keys, casesense=True):
da4db748 5392 deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed '
5393 f'in a future version. Use "{__name__}.traverse_obj" instead')
ee8dd27a 5394 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
6606817a 5395
5396
ff91cf74 5397def get_first(obj, keys, **kwargs):
5398 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5399
5400
4b4b7f74 5401def variadic(x, allowed_types=(str, bytes, dict)):
cb89cfc1 5402 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
bd50a52b
THD
5403
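# Illustrative sketch: variadic() wraps scalars into a tuple but passes
# through iterables that are not in `allowed_types`.
#   >>> variadic('spam')
#   ('spam',)
#   >>> variadic(['spam', 'eggs'])
#   ['spam', 'eggs']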
5404
3e9b66d7
LNO
5405def time_seconds(**kwargs):
5406 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5407 return t.timestamp()
5408
5409
49fa4d9a
N
5410# create a JSON Web Signature (jws) with HS256 algorithm
5411# the resulting format is in JWS Compact Serialization
5412# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5413# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5414def jwt_encode_hs256(payload_data, key, headers={}):
5415 header_data = {
5416 'alg': 'HS256',
5417 'typ': 'JWT',
5418 }
5419 if headers:
5420 header_data.update(headers)
0f06bcd7 5421 header_b64 = base64.b64encode(json.dumps(header_data).encode())
5422 payload_b64 = base64.b64encode(json.dumps(payload_data).encode())
5423 h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256)
49fa4d9a
N
5424 signature_b64 = base64.b64encode(h.digest())
5425 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5426 return token
819e0531 5427
5428
16b0d7e6 5429# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5430def jwt_decode_hs256(jwt):
5431 header_b64, payload_b64, signature_b64 = jwt.split('.')
5432 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5433 return payload_data
5434
5435
53973b4d 5436WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None
5437
5438
0b9c08b4 5439@functools.cache
819e0531 5440def supports_terminal_sequences(stream):
5441 if compat_os_name == 'nt':
8a82af35 5442 if not WINDOWS_VT_MODE:
819e0531 5443 return False
5444 elif not os.getenv('TERM'):
5445 return False
5446 try:
5447 return stream.isatty()
5448 except BaseException:
5449 return False
5450
5451
53973b4d 5452def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075
8a82af35 5453 if get_windows_version() < (10, 0, 10586):
53973b4d 5454 return
5455 global WINDOWS_VT_MODE
53973b4d 5456 try:
f0c9fb96 5457 Popen.run('', shell=True)
53973b4d 5458 except Exception:
5459 return
5460
5461 WINDOWS_VT_MODE = True
5462 supports_terminal_sequences.cache_clear()
5463
5464
ec11a9f4 5465_terminal_sequences_re = re.compile('\033\\[[^m]+m')
5466
5467
5468def remove_terminal_sequences(string):
5469 return _terminal_sequences_re.sub('', string)
5470
5471
5472def number_of_digits(number):
5473 return len('%d' % number)
34921b43 5474
5475
5476def join_nonempty(*values, delim='-', from_dict=None):
5477 if from_dict is not None:
7b2c3f47 5478 values = (traverse_obj(from_dict, variadic(v)) for v in values)
34921b43 5479 return delim.join(map(str, filter(None, values)))
06e57990 5480
5481
27231526
ZM
5482def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5483 """
5484 Find the largest format dimensions in terms of video width and, for each thumbnail:
5485 * Modify the URL: Match the width with the provided regex and replace with the former width
5486 * Update dimensions
5487
5488 This function is useful with video services that scale the provided thumbnails on demand
5489 """
5490 _keys = ('width', 'height')
5491 max_dimensions = max(
86e5f3ed 5492 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
27231526
ZM
5493 default=(0, 0))
5494 if not max_dimensions[0]:
5495 return thumbnails
5496 return [
5497 merge_dicts(
5498 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5499 dict(zip(_keys, max_dimensions)), thumbnail)
5500 for thumbnail in thumbnails
5501 ]
5502
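# Illustrative sketch (the URLs and regex are hypothetical): the widest format is
# 1920 pixels, so the width component of the thumbnail URL is rewritten and the
# thumbnail dimensions are updated to match.
#   >>> scale_thumbnails_to_max_format_width(
#   ...     [{'width': 1920, 'height': 1080}],
#   ...     [{'url': 'https://example.com/thumb/640.jpg'}],
#   ...     r'(?<=/thumb/)\d+')
#   [{'url': 'https://example.com/thumb/1920.jpg', 'width': 1920, 'height': 1080}]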
5503
93c8410d
LNO
5504def parse_http_range(range):
5505 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5506 if not range:
5507 return None, None, None
5508 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5509 if not crg:
5510 return None, None, None
5511 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5512
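# Illustrative sketch: returns (start, end, document_size), with None for any
# part that is absent.
#   >>> parse_http_range('bytes=0-1023')
#   (0, 1023, None)
#   >>> parse_http_range('bytes 0-1023/146515')
#   (0, 1023, 146515)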
5513
6b9e832d 5514def read_stdin(what):
5515 eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D'
5516 write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n')
5517 return sys.stdin
5518
5519
a904a7f8
L
5520def determine_file_encoding(data):
5521 """
88f60feb 5522 Detect the text encoding used
a904a7f8
L
5523 @returns (encoding, bytes to skip)
5524 """
5525
88f60feb 5526 # BOM marks are given priority over declarations
a904a7f8 5527 for bom, enc in BOMS:
a904a7f8
L
5528 if data.startswith(bom):
5529 return enc, len(bom)
5530
88f60feb 5531 # Strip off all null bytes to match even when UTF-16 or UTF-32 is used.
5532 # We ignore the endianness to get a good enough match
a904a7f8 5533 data = data.replace(b'\0', b'')
88f60feb 5534 mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data)
5535 return mobj.group(1).decode() if mobj else None, 0
a904a7f8
L
5536
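# Illustrative sketch: an in-file coding declaration is honoured (a BOM, when
# present in BOMS, takes priority and also reports how many bytes to skip).
#   >>> determine_file_encoding(b'# coding: shift-jis\n--remux-video mkv\n')
#   ('shift-jis', 0)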
5537
06e57990 5538class Config:
5539 own_args = None
9e491463 5540 parsed_args = None
06e57990 5541 filename = None
5542 __initialized = False
5543
5544 def __init__(self, parser, label=None):
9e491463 5545 self.parser, self.label = parser, label
06e57990 5546 self._loaded_paths, self.configs = set(), []
5547
5548 def init(self, args=None, filename=None):
5549 assert not self.__initialized
284a60c5 5550 self.own_args, self.filename = args, filename
5551 return self.load_configs()
5552
5553 def load_configs(self):
65662dff 5554 directory = ''
284a60c5 5555 if self.filename:
5556 location = os.path.realpath(self.filename)
65662dff 5557 directory = os.path.dirname(location)
06e57990 5558 if location in self._loaded_paths:
5559 return False
5560 self._loaded_paths.add(location)
5561
284a60c5 5562 self.__initialized = True
5563 opts, _ = self.parser.parse_known_args(self.own_args)
5564 self.parsed_args = self.own_args
9e491463 5565 for location in opts.config_locations or []:
6b9e832d 5566 if location == '-':
1060f82f 5567 if location in self._loaded_paths:
5568 continue
5569 self._loaded_paths.add(location)
6b9e832d 5570 self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin')
5571 continue
65662dff 5572 location = os.path.join(directory, expand_path(location))
06e57990 5573 if os.path.isdir(location):
5574 location = os.path.join(location, 'yt-dlp.conf')
5575 if not os.path.exists(location):
9e491463 5576 self.parser.error(f'config location {location} does not exist')
06e57990 5577 self.append_config(self.read_file(location), location)
5578 return True
5579
5580 def __str__(self):
5581 label = join_nonempty(
5582 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5583 delim=' ')
5584 return join_nonempty(
5585 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5586 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5587 delim='\n')
5588
5589 @staticmethod
5590 def read_file(filename, default=[]):
5591 try:
a904a7f8 5592 optionf = open(filename, 'rb')
86e5f3ed 5593 except OSError:
06e57990 5594 return default # silently skip if file is not present
a904a7f8
L
5595 try:
5596 enc, skip = determine_file_encoding(optionf.read(512))
5597 optionf.seek(skip, io.SEEK_SET)
5598 except OSError:
5599 enc = None # silently skip read errors
06e57990 5600 try:
5601 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
a904a7f8 5602 contents = optionf.read().decode(enc or preferredencoding())
f9934b96 5603 res = shlex.split(contents, comments=True)
44a6fcff 5604 except Exception as err:
5605 raise ValueError(f'Unable to parse "{filename}": {err}')
06e57990 5606 finally:
5607 optionf.close()
5608 return res
5609
5610 @staticmethod
5611 def hide_login_info(opts):
86e5f3ed 5612 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
06e57990 5613 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5614
5615 def _scrub_eq(o):
5616 m = eqre.match(o)
5617 if m:
5618 return m.group('key') + '=PRIVATE'
5619 else:
5620 return o
5621
5622 opts = list(map(_scrub_eq, opts))
5623 for idx, opt in enumerate(opts):
5624 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5625 opts[idx + 1] = 'PRIVATE'
5626 return opts
5627
5628 def append_config(self, *args, label=None):
9e491463 5629 config = type(self)(self.parser, label)
06e57990 5630 config._loaded_paths = self._loaded_paths
5631 if config.init(*args):
5632 self.configs.append(config)
5633
5634 @property
5635 def all_args(self):
5636 for config in reversed(self.configs):
5637 yield from config.all_args
9e491463 5638 yield from self.parsed_args or []
5639
5640 def parse_known_args(self, **kwargs):
5641 return self.parser.parse_known_args(self.all_args, **kwargs)
06e57990 5642
5643 def parse_args(self):
9e491463 5644 return self.parser.parse_args(self.all_args)
da42679b
LNO
5645
5646
5647class WebSocketsWrapper():
5648 """Wraps websockets module to use in non-async scopes"""
abfecb7b 5649 pool = None
da42679b 5650
3cea3edd 5651 def __init__(self, url, headers=None, connect=True):
059bc4db 5652 self.loop = asyncio.new_event_loop()
9cd08050 5653 # XXX: "loop" is deprecated
5654 self.conn = websockets.connect(
5655 url, extra_headers=headers, ping_interval=None,
5656 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
3cea3edd
LNO
5657 if connect:
5658 self.__enter__()
15dfb392 5659 atexit.register(self.__exit__, None, None, None)
da42679b
LNO
5660
5661 def __enter__(self):
3cea3edd 5662 if not self.pool:
9cd08050 5663 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
da42679b
LNO
5664 return self
5665
5666 def send(self, *args):
5667 self.run_with_loop(self.pool.send(*args), self.loop)
5668
5669 def recv(self, *args):
5670 return self.run_with_loop(self.pool.recv(*args), self.loop)
5671
5672 def __exit__(self, type, value, traceback):
5673 try:
5674 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5675 finally:
5676 self.loop.close()
15dfb392 5677 self._cancel_all_tasks(self.loop)
da42679b
LNO
5678
5679 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5680 # for contributors: If any new library that uses asyncio needs to be run in non-async code, move these functions out of this class
5681 @staticmethod
5682 def run_with_loop(main, loop):
059bc4db 5683 if not asyncio.iscoroutine(main):
da42679b
LNO
5684 raise ValueError(f'a coroutine was expected, got {main!r}')
5685
5686 try:
5687 return loop.run_until_complete(main)
5688 finally:
5689 loop.run_until_complete(loop.shutdown_asyncgens())
5690 if hasattr(loop, 'shutdown_default_executor'):
5691 loop.run_until_complete(loop.shutdown_default_executor())
5692
5693 @staticmethod
5694 def _cancel_all_tasks(loop):
059bc4db 5695 to_cancel = asyncio.all_tasks(loop)
da42679b
LNO
5696
5697 if not to_cancel:
5698 return
5699
5700 for task in to_cancel:
5701 task.cancel()
5702
9cd08050 5703 # XXX: "loop" is removed in python 3.10+
da42679b 5704 loop.run_until_complete(
059bc4db 5705 asyncio.gather(*to_cancel, loop=loop, return_exceptions=True))
da42679b
LNO
5706
5707 for task in to_cancel:
5708 if task.cancelled():
5709 continue
5710 if task.exception() is not None:
5711 loop.call_exception_handler({
5712 'message': 'unhandled exception during asyncio.run() shutdown',
5713 'exception': task.exception(),
5714 'task': task,
5715 })
5716
5717
8b7539d2 5718def merge_headers(*dicts):
08d30158 5719 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
76aa9913 5720 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
28787f16 5721
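# Illustrative sketch: keys are title-cased for comparison and later dicts win.
#   >>> merge_headers({'user-agent': 'UA-1', 'Accept': '*/*'}, {'USER-AGENT': 'UA-2'})
#   {'User-Agent': 'UA-2', 'Accept': '*/*'}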
5722
b1f94422 5723def cached_method(f):
5724 """Cache a method"""
5725 signature = inspect.signature(f)
5726
5727 @functools.wraps(f)
5728 def wrapper(self, *args, **kwargs):
5729 bound_args = signature.bind(self, *args, **kwargs)
5730 bound_args.apply_defaults()
5731 key = tuple(bound_args.arguments.values())
5732
5733 if not hasattr(self, '__cached_method__cache'):
5734 self.__cached_method__cache = {}
5735 cache = self.__cached_method__cache.setdefault(f.__name__, {})
5736 if key not in cache:
5737 cache[key] = f(self, *args, **kwargs)
5738 return cache[key]
5739 return wrapper
5740
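# Illustrative sketch (the class is hypothetical): the first call per distinct
# argument tuple runs the method; later calls hit the per-instance cache.
#   class Fetcher:
#       @cached_method
#       def fetch(self, url):
#           print('fetching', url)
#           return len(url)
#
#   f = Fetcher()
#   f.fetch('https://example.com')  # prints 'fetching ...' and computes
#   f.fetch('https://example.com')  # returns the cached result, no print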
5741
28787f16 5742class classproperty:
b1f94422 5743 """property access for class methods"""
c487cf00 5744
5745 def __init__(self, func):
5746 functools.update_wrapper(self, func)
5747 self.func = func
28787f16 5748
5749 def __get__(self, _, cls):
c487cf00 5750 return self.func(cls)
19a03940 5751
5752
64fa820c 5753class Namespace(types.SimpleNamespace):
591bb9d3 5754 """Immutable namespace"""
591bb9d3 5755
7896214c 5756 def __iter__(self):
64fa820c 5757 return iter(self.__dict__.values())
7896214c 5758
64fa820c 5759 @property
5760 def items_(self):
5761 return self.__dict__.items()
9b8ee23b 5762
5763
8dc59305 5764MEDIA_EXTENSIONS = Namespace(
5765 common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'),
5766 video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'),
5767 common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'),
5768 audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'),
5769 thumbnails=('jpg', 'png', 'webp'),
5770 storyboards=('mhtml', ),
5771 subtitles=('srt', 'vtt', 'ass', 'lrc'),
5772 manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'),
5773)
5774MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video
5775MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio
5776
5777KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests)
5778
5779
be5c1ae8 5780class RetryManager:
5781 """Usage:
5782 for retry in RetryManager(...):
5783 try:
5784 ...
5785 except SomeException as err:
5786 retry.error = err
5787 continue
5788 """
5789 attempt, _error = 0, None
5790
5791 def __init__(self, _retries, _error_callback, **kwargs):
5792 self.retries = _retries or 0
5793 self.error_callback = functools.partial(_error_callback, **kwargs)
5794
5795 def _should_retry(self):
5796 return self._error is not NO_DEFAULT and self.attempt <= self.retries
5797
5798 @property
5799 def error(self):
5800 if self._error is NO_DEFAULT:
5801 return None
5802 return self._error
5803
5804 @error.setter
5805 def error(self, value):
5806 self._error = value
5807
5808 def __iter__(self):
5809 while self._should_retry():
5810 self.error = NO_DEFAULT
5811 self.attempt += 1
5812 yield self
5813 if self.error:
5814 self.error_callback(self.error, self.attempt, self.retries)
5815
5816 @staticmethod
5817 def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None):
5818 """Utility function for reporting retries"""
5819 if count > retries:
5820 if error:
5821 return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e))
5822 raise e
5823
5824 if not count:
5825 return warn(e)
5826 elif isinstance(e, ExtractorError):
3ce29336 5827 e = remove_end(str_or_none(e.cause) or e.orig_msg, '.')
be5c1ae8 5828 warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...')
5829
5830 delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func
5831 if delay:
5832 info(f'Sleeping {delay:.2f} seconds ...')
5833 time.sleep(delay)
5834
5835
0647d925 5836def make_archive_id(ie, video_id):
5837 ie_key = ie if isinstance(ie, str) else ie.ie_key()
5838 return f'{ie_key.lower()} {video_id}'
5839
5840
a1c5bd82 5841def truncate_string(s, left, right=0):
5842 assert left > 3 and right >= 0
5843 if s is None or len(s) <= left + right:
5844 return s
5845 return f'{s[:left-3]}...{s[-right:] if right else ""}'
5846
5847
5314b521 5848def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None):
5849 assert 'all' in alias_dict, '"all" alias is required'
5850 requested = list(start or [])
5851 for val in options:
5852 discard = val.startswith('-')
5853 if discard:
5854 val = val[1:]
5855
5856 if val in alias_dict:
5857 val = alias_dict[val] if not discard else [
5858 i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]]
5859 # NB: Do not allow regex in aliases for performance
5860 requested = orderedSet_from_options(val, alias_dict, start=requested)
5861 continue
5862
5863 current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex
5864 else [val] if val in alias_dict['all'] else None)
5865 if current is None:
5866 raise ValueError(val)
5867
5868 if discard:
5869 for item in current:
5870 while item in requested:
5871 requested.remove(item)
5872 else:
5873 requested.extend(current)
5874
5875 return orderedSet(requested)
5876
5877
9b8ee23b 5878# Deprecated
5879has_certifi = bool(certifi)
5880has_websockets = bool(websockets)