#!/usr/bin/env python3
# coding: utf-8

from __future__ import unicode_literals

import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.utils
import email.header
import errno
import functools
import gzip
import hashlib
import hmac
import importlib.util
import io
import itertools
import json
import locale
import math
import operator
import os
import platform
import random
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import traceback
import xml.etree.ElementTree
import zlib
import mimetypes

from .compat import (
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_HTTPError,
    compat_basestring,
    compat_chr,
    compat_cookiejar,
    compat_ctypes_WINFUNCTYPE,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_integer_types,
    compat_numeric_types,
    compat_kwargs,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_split,
    compat_shlex_quote,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_urlunparse,
    compat_urllib_parse_quote,
    compat_urllib_parse_quote_plus,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        try:
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        except OSError:
            pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

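# Illustrative usage (added comment, not part of the original module): xpath_with_ns()
# expands namespace prefixes using the supplied mapping, so a hypothetical call such as
#   xpath_with_ns('ns:videos/ns:video', {'ns': 'http://example.com/ns'})
# returns '{http://example.com/ns}videos/{http://example.com/ns}video', the form
# expected by xml.etree.ElementTree's find()/findall().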

def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_html_by_id(id, html):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = r'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
        ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')

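# Illustrative usage (added comment, not part of the original source): for a
# hypothetical document '<div id="a">foo <span>bar</span></div>', the call
#   get_element_text_and_html_by_tag('span', html)
# returns ('bar', '<span>bar</span>'), i.e. the inner text and the whole element;
# nested same-name tags are handled by the closing-tag-counting parser above.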

class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


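# Illustrative usage (added comment, not part of the original source): for a
# hypothetical element string '<a href="/watch?v=abc" class="title">' the call
#   extract_attributes('<a href="/watch?v=abc" class="title">')
# returns {'href': '/watch?v=abc', 'class': 'title'}; HTML entities are decoded
# by the underlying HTMLParser and valueless attributes are mapped to None.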
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return ' '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    if s == '':
        return ''
    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

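# Illustrative behaviour (added comment, not part of the original source): with
# restricted=True, ':' becomes '_-', whitespace becomes '_' and accented characters
# are transliterated via ACCENT_CHARS, so a hypothetical call like
#   sanitize_filename('Héllo: wörld', restricted=True)
# would yield 'Hello_-_world'; timestamps such as '12:34' keep their digits but the
# ':' separators are rewritten to '_' before the per-character pass.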

def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
        if sys.version_info < (2, 7) and not drive_or_unc:
            drive_or_unc, _ = os.path.splitunc(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url


def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
    return url, 'Basic ' + auth_payload.decode('utf-8')


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


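# Illustrative behaviour (added comment, not part of the original source): a URL
# with embedded credentials, e.g. the hypothetical 'https://user:pass@example.com/feed',
# comes back from extract_basic_auth() as ('https://example.com/feed',
# 'Basic ' + base64 of 'user:pass'); sanitized_Request() then moves that value into
# the Authorization header instead of leaving the credentials in the URL itself.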
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        return process_communicate_or_kill(self, *args, **kwargs)


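# Illustrative usage (added comment, not part of the original source): this wrapper
# behaves like subprocess.Popen but hides console windows on Windows and adds a
# kill-on-interrupt helper, e.g. a hypothetical call
#   p = Popen(['ffmpeg', '-version'], stdout=subprocess.PIPE)
#   out, err = p.communicate_or_kill()
# terminates the child process if the wait is interrupted (KeyboardInterrupt included).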
def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret

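# Illustrative behaviour (added comment, not part of the original source):
#   timetuple_from_msec(123456)    -> Time(hours=0, minutes=2, seconds=3, milliseconds=456)
#   formatSeconds(123)             -> '2:03'
#   formatSeconds(3723, msec=True) -> '1:02:03.000'
# i.e. hours are printed only when non-zero and milliseconds only with msec=True.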

def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        try:
            ssl_context.load_verify_locations(cadata=cert)
        except ssl.SSLError:
            pass


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                # Create a new context to discard any certificates that were already loaded
                context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
                context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    if ytdl_is_updateable():
        update_cmd = 'type yt-dlp -U to update'
    else:
        update_cmd = 'see https://github.com/yt-dlp/yt-dlp on how to update'
    msg = 'please report this issue on https://github.com/yt-dlp/yt-dlp .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call yt-dlp with the --verbose flag and include its complete output.'

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super(ExtractorError, self).__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            self.msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super(GeoRestrictedError, self).__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


1bab3437 1508class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
f1a8511f
S
1509 """
1510 See [1] for cookie file format.
1511
1512 1. https://curl.haxx.se/docs/http-cookies.html
1513 """
e7e62441 1514 _HTTPONLY_PREFIX = '#HttpOnly_'
c380cc28
S
1515 _ENTRY_LEN = 7
1516 _HEADER = '''# Netscape HTTP Cookie File
7a5c1cfe 1517# This file is generated by yt-dlp. Do not edit.
c380cc28
S
1518
1519'''
1520 _CookieFileEntry = collections.namedtuple(
1521 'CookieFileEntry',
1522 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
e7e62441 1523
1bab3437 1524 def save(self, filename=None, ignore_discard=False, ignore_expires=False):
c380cc28
S
1525 """
1526 Save cookies to a file.
1527
1528 Most of the code is taken from CPython 3.8 and slightly adapted
1529 to support cookie files with UTF-8 in both python 2 and 3.
1530 """
1531 if filename is None:
1532 if self.filename is not None:
1533 filename = self.filename
1534 else:
1535 raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
1536
1bab3437
S
1537 # Store session cookies with `expires` set to 0 instead of an empty
1538 # string
1539 for cookie in self:
1540 if cookie.expires is None:
1541 cookie.expires = 0
c380cc28
S
1542
1543 with io.open(filename, 'w', encoding='utf-8') as f:
1544 f.write(self._HEADER)
1545 now = time.time()
1546 for cookie in self:
1547 if not ignore_discard and cookie.discard:
1548 continue
1549 if not ignore_expires and cookie.is_expired(now):
1550 continue
1551 if cookie.secure:
1552 secure = 'TRUE'
1553 else:
1554 secure = 'FALSE'
1555 if cookie.domain.startswith('.'):
1556 initial_dot = 'TRUE'
1557 else:
1558 initial_dot = 'FALSE'
1559 if cookie.expires is not None:
1560 expires = compat_str(cookie.expires)
1561 else:
1562 expires = ''
1563 if cookie.value is None:
1564 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1565 # with no name, whereas http.cookiejar regards it as a
1566 # cookie with no value.
1567 name = ''
1568 value = cookie.name
1569 else:
1570 name = cookie.name
1571 value = cookie.value
1572 f.write(
1573 '\t'.join([cookie.domain, initial_dot, cookie.path,
1574 secure, expires, name, value]) + '\n')
1bab3437
S
1575
1576 def load(self, filename=None, ignore_discard=False, ignore_expires=False):
e7e62441 1577 """Load cookies from a file."""
1578 if filename is None:
1579 if self.filename is not None:
1580 filename = self.filename
1581 else:
1582 raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
1583
c380cc28
S
1584 def prepare_line(line):
1585 if line.startswith(self._HTTPONLY_PREFIX):
1586 line = line[len(self._HTTPONLY_PREFIX):]
1587 # comments and empty lines are fine
1588 if line.startswith('#') or not line.strip():
1589 return line
1590 cookie_list = line.split('\t')
1591 if len(cookie_list) != self._ENTRY_LEN:
1592 raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
1593 cookie = self._CookieFileEntry(*cookie_list)
1594 if cookie.expires_at and not cookie.expires_at.isdigit():
1595 raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
1596 return line
1597
e7e62441 1598 cf = io.StringIO()
c380cc28 1599 with io.open(filename, encoding='utf-8') as f:
e7e62441 1600 for line in f:
c380cc28
S
1601 try:
1602 cf.write(prepare_line(line))
1603 except compat_cookiejar.LoadError as e:
1604 write_string(
1605 'WARNING: skipping cookie file entry due to %s: %r\n'
1606 % (e, line), sys.stderr)
1607 continue
e7e62441 1608 cf.seek(0)
1609 self._really_load(cf, filename, ignore_discard, ignore_expires)
1bab3437
S
1610 # Session cookies are denoted by either `expires` field set to
1611 # an empty string or 0. MozillaCookieJar only recognizes the former
1612 # (see [1]). So we need to force the latter to be recognized as session
1613 # cookies on our own.
1614 # Session cookies may be important for cookies-based authentication,
1615 # e.g. usually, when a user does not check the 'Remember me' check box while
1616 # logging in on a site, some important cookies are stored as session
1617 # cookies, so failing to recognize them results in a failed login.
1618 # 1. https://bugs.python.org/issue17164
1619 for cookie in self:
1620 # Treat `expires=0` cookies as session cookies
1621 if cookie.expires == 0:
1622 cookie.expires = None
1623 cookie.discard = True
1624
1625
a6420bf5
S
1626class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1627 def __init__(self, cookiejar=None):
1628 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1629
1630 def http_response(self, request, response):
1631 # Python 2 will choke on the next HTTP request in a row if there are non-ASCII
1632 # characters in the Set-Cookie HTTP header of the last response (see
067aa17e 1633 # https://github.com/ytdl-org/youtube-dl/issues/6769).
a6420bf5
S
1634 # In order to at least prevent crashing, we will percent-encode the Set-Cookie
1635 # header before HTTPCookieProcessor starts processing it.
e28034c5
S
1636 # if sys.version_info < (3, 0) and response.headers:
1637 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1638 # set_cookie = response.headers.get(set_cookie_header)
1639 # if set_cookie:
1640 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1641 # if set_cookie != set_cookie_escaped:
1642 # del response.headers[set_cookie_header]
1643 # response.headers[set_cookie_header] = set_cookie_escaped
a6420bf5
S
1644 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1645
f5fa042c 1646 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
a6420bf5
S
1647 https_response = http_response
1648
1649
fca6dba8 1650class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
201c1459 1651 """YoutubeDL redirect handler
1652
1653 The code is based on HTTPRedirectHandler implementation from CPython [1].
1654
1655 This redirect handler solves two issues:
1656 - ensures redirect URL is always unicode under python 2
1657 - introduces support for experimental HTTP response status code
1658 308 Permanent Redirect [2] used by some sites [3]
1659
1660 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1661 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1662 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1663 """
1664
1665 http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302
1666
1667 def redirect_request(self, req, fp, code, msg, headers, newurl):
1668 """Return a Request or None in response to a redirect.
1669
1670 This is called by the http_error_30x methods when a
1671 redirection response is received. If a redirection should
1672 take place, return a new Request to allow http_error_30x to
1673 perform the redirect. Otherwise, raise HTTPError if no-one
1674 else should try to handle this url. Return None if you can't
1675 but another Handler might.
1676 """
1677 m = req.get_method()
1678 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1679 or code in (301, 302, 303) and m == "POST")):
1680 raise compat_HTTPError(req.full_url, code, msg, headers, fp)
1681 # Strictly (according to RFC 2616), 301 or 302 in response to
1682 # a POST MUST NOT cause a redirection without confirmation
1683 # from the user (of urllib.request, in this case). In practice,
1684 # essentially all clients do redirect in this case, so we do
1685 # the same.
1686
1687 # On python 2 urlh.geturl() may sometimes return the redirect URL
1688 # as a byte string instead of unicode. This workaround forces
1689 # it to always return unicode.
1690 if sys.version_info[0] < 3:
1691 newurl = compat_str(newurl)
1692
1693 # Be conciliant with URIs containing a space. This is mainly
1694 # redundant with the more complete encoding done in http_error_302(),
1695 # but it is kept for compatibility with other callers.
1696 newurl = newurl.replace(' ', '%20')
1697
1698 CONTENT_HEADERS = ("content-length", "content-type")
1699 # NB: don't use dict comprehension for python 2.6 compatibility
1700 newheaders = dict((k, v) for k, v in req.headers.items()
1701 if k.lower() not in CONTENT_HEADERS)
1702 return compat_urllib_request.Request(
1703 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1704 unverifiable=True)
fca6dba8
S
1705
1706
46f59e89
S
1707def extract_timezone(date_str):
1708 m = re.search(
f137e4c2 1709 r'''(?x)
1710 ^.{8,}? # >=8 char non-TZ prefix, if present
1711 (?P<tz>Z| # just the UTC Z, or
1712 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1713 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1714 [ ]? # optional space
1715 (?P<sign>\+|-) # +/-
1716 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1717 $)
1718 ''', date_str)
46f59e89
S
1719 if not m:
1720 timezone = datetime.timedelta()
1721 else:
1722 date_str = date_str[:-len(m.group('tz'))]
1723 if not m.group('sign'):
1724 timezone = datetime.timedelta()
1725 else:
1726 sign = 1 if m.group('sign') == '+' else -1
1727 timezone = datetime.timedelta(
1728 hours=sign * int(m.group('hours')),
1729 minutes=sign * int(m.group('minutes')))
1730 return timezone, date_str
1731
1732
08b38d54 1733def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1734 """ Return a UNIX timestamp from the given date """
1735
1736 if date_str is None:
1737 return None
1738
52c3a6e4
S
1739 date_str = re.sub(r'\.[0-9]+', '', date_str)
1740
08b38d54 1741 if timezone is None:
46f59e89
S
1742 timezone, date_str = extract_timezone(date_str)
1743
52c3a6e4
S
1744 try:
1745 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1746 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1747 return calendar.timegm(dt.timetuple())
1748 except ValueError:
1749 pass
912b38b4
PH
1750
1751
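# Illustrative usage sketch for parse_iso8601(), based on the logic above; the
# sample date strings are made up for illustration. Two spellings of the same
# instant (UTC "Z" vs. a +0100 offset) should yield the same UNIX timestamp.
assert parse_iso8601('2014-03-23T22:04:26Z') == parse_iso8601('2014-03-23T23:04:26+0100')
assert parse_iso8601(None) is None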
46f59e89
S
1752def date_formats(day_first=True):
1753 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1754
1755
42bdd9d0 1756def unified_strdate(date_str, day_first=True):
bf50b038 1757 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1758
1759 if date_str is None:
1760 return None
bf50b038 1761 upload_date = None
5f6a1245 1762 # Replace commas
026fcc04 1763 date_str = date_str.replace(',', ' ')
42bdd9d0 1764 # Remove AM/PM + timezone
9bb8e0a3 1765 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1766 _, date_str = extract_timezone(date_str)
42bdd9d0 1767
46f59e89 1768 for expression in date_formats(day_first):
bf50b038
JMF
1769 try:
1770 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
5de90176 1771 except ValueError:
bf50b038 1772 pass
42393ce2
PH
1773 if upload_date is None:
1774 timetuple = email.utils.parsedate_tz(date_str)
1775 if timetuple:
c6b9cf05
S
1776 try:
1777 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1778 except ValueError:
1779 pass
6a750402
JMF
1780 if upload_date is not None:
1781 return compat_str(upload_date)
bf50b038 1782
5f6a1245 1783
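# Illustrative usage sketch: unified_strdate() normalizes many date spellings
# into YYYYMMDD; the sample inputs are made up. Note how day_first controls
# ambiguous numeric dates.
assert unified_strdate('December 21, 2010') == '20101221'
assert unified_strdate('8/7/2009') == '20090708'  # day first by default
assert unified_strdate('8/7/2009', day_first=False) == '20090807'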
46f59e89
S
1784def unified_timestamp(date_str, day_first=True):
1785 if date_str is None:
1786 return None
1787
2ae2ffda 1788 date_str = re.sub(r'[,|]', '', date_str)
46f59e89 1789
7dc2a74e 1790 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1791 timezone, date_str = extract_timezone(date_str)
1792
1793 # Remove AM/PM + timezone
1794 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1795
deef3195
S
1796 # Remove unrecognized timezones from ISO 8601 alike timestamps
1797 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1798 if m:
1799 date_str = date_str[:-len(m.group('tz'))]
1800
f226880c
PH
1801 # Python only supports microseconds, so remove nanoseconds
1802 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1803 if m:
1804 date_str = m.group(1)
1805
46f59e89
S
1806 for expression in date_formats(day_first):
1807 try:
7dc2a74e 1808 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89
S
1809 return calendar.timegm(dt.timetuple())
1810 except ValueError:
1811 pass
1812 timetuple = email.utils.parsedate_tz(date_str)
1813 if timetuple:
7dc2a74e 1814 return calendar.timegm(timetuple) + pm_delta * 3600
46f59e89
S
1815
1816
28e614de 1817def determine_ext(url, default_ext='unknown_video'):
85750f89 1818 if url is None or '.' not in url:
f4776371 1819 return default_ext
9cb9a5df 1820 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1821 if re.match(r'^[A-Za-z0-9]+$', guess):
1822 return guess
a7aaa398
S
1823 # Try to extract the ext from URLs like http://example.com/foo/bar.mp4/?download
1824 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1825 return guess.rstrip('/')
73e79f2a 1826 else:
cbdbb766 1827 return default_ext
73e79f2a 1828
5f6a1245 1829
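# Illustrative usage sketch: determine_ext() guesses the extension from the URL
# path and falls back to default_ext; the URLs below are made-up examples.
assert determine_ext('http://example.com/foo/bar.mp4/?download') == 'mp4'
assert determine_ext('foo.txt') == 'txt'
assert determine_ext('no-extension-here') == 'unknown_video'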
824fa511
S
1830def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1831 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
d4051a8e 1832
5f6a1245 1833
9e62f283 1834def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
37254abc
JMF
1835 """
1836 Return a datetime object from a string in the format YYYYMMDD or
9e62f283 1837 (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1838
1839 format: string date format used to parse date_str into a datetime object
1840 precision: round the time portion of a datetime object.
1841 auto|microsecond|second|minute|hour|day.
1842 auto: round to the unit provided in date_str (if applicable).
1843 """
1844 auto_precision = False
1845 if precision == 'auto':
1846 auto_precision = True
1847 precision = 'microsecond'
1848 today = datetime_round(datetime.datetime.now(), precision)
f8795e10 1849 if date_str in ('now', 'today'):
37254abc 1850 return today
f8795e10
PH
1851 if date_str == 'yesterday':
1852 return today - datetime.timedelta(days=1)
9e62f283 1853 match = re.match(
1854 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
1855 date_str)
37254abc 1856 if match is not None:
9e62f283 1857 start_time = datetime_from_str(match.group('start'), precision, format)
1858 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
37254abc 1859 unit = match.group('unit')
9e62f283 1860 if unit == 'month' or unit == 'year':
1861 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
37254abc 1862 unit = 'day'
9e62f283 1863 else:
1864 if unit == 'week':
1865 unit = 'day'
1866 time *= 7
1867 delta = datetime.timedelta(**{unit + 's': time})
1868 new_date = start_time + delta
1869 if auto_precision:
1870 return datetime_round(new_date, unit)
1871 return new_date
1872
1873 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1874
1875
1876def date_from_str(date_str, format='%Y%m%d'):
1877 """
1878 Return a datetime object from a string in the format YYYYMMDD or
1879 (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1880
1881 format: string date format used to parse date_str into a datetime object
1882 """
1883 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1884
1885
1886def datetime_add_months(dt, months):
1887 """Increment/Decrement a datetime object by months."""
1888 month = dt.month + months - 1
1889 year = dt.year + month // 12
1890 month = month % 12 + 1
1891 day = min(dt.day, calendar.monthrange(year, month)[1])
1892 return dt.replace(year, month, day)
1893
1894
1895def datetime_round(dt, precision='day'):
1896 """
1897 Round a datetime object's time to a specific precision
1898 """
1899 if precision == 'microsecond':
1900 return dt
1901
1902 unit_seconds = {
1903 'day': 86400,
1904 'hour': 3600,
1905 'minute': 60,
1906 'second': 1,
1907 }
1908 roundto = lambda x, n: ((x + n / 2) // n) * n
1909 timestamp = calendar.timegm(dt.timetuple())
1910 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
5f6a1245
JW
1911
1912
e63fc1be 1913def hyphenate_date(date_str):
1914 """
1915 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1916 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1917 if match is not None:
1918 return '-'.join(match.groups())
1919 else:
1920 return date_str
1921
5f6a1245 1922
bd558525
JMF
1923class DateRange(object):
1924 """Represents a time interval between two dates"""
5f6a1245 1925
bd558525
JMF
1926 def __init__(self, start=None, end=None):
1927 """start and end must be strings in the format accepted by date"""
1928 if start is not None:
1929 self.start = date_from_str(start)
1930 else:
1931 self.start = datetime.datetime.min.date()
1932 if end is not None:
1933 self.end = date_from_str(end)
1934 else:
1935 self.end = datetime.datetime.max.date()
37254abc 1936 if self.start > self.end:
bd558525 1937 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
5f6a1245 1938
bd558525
JMF
1939 @classmethod
1940 def day(cls, day):
1941 """Returns a range that only contains the given day"""
5f6a1245
JW
1942 return cls(day, day)
1943
bd558525
JMF
1944 def __contains__(self, date):
1945 """Check if the date is in the range"""
37254abc
JMF
1946 if not isinstance(date, datetime.date):
1947 date = date_from_str(date)
1948 return self.start <= date <= self.end
5f6a1245 1949
bd558525 1950 def __str__(self):
5f6a1245 1951 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
1952
1953
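# Illustrative usage sketch: DateRange accepts the same date strings as
# date_from_str(), and membership testing also accepts plain strings; the
# dates below are arbitrary examples.
assert '20200115' in DateRange('20200101', '20200201')
assert '20191231' not in DateRange('20200101', '20200201')
assert '20200101' in DateRange.day('20200101')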
1954def platform_name():
1955 """ Returns the platform name as a compat_str """
1956 res = platform.platform()
1957 if isinstance(res, bytes):
1958 res = res.decode(preferredencoding())
1959
1960 assert isinstance(res, compat_str)
1961 return res
c257baff
PH
1962
1963
49fa4d9a
N
1964def get_windows_version():
1965 ''' Get Windows version. None if it's not running on Windows '''
1966 if compat_os_name == 'nt':
1967 return version_tuple(platform.win32_ver()[1])
1968 else:
1969 return None
1970
1971
b58ddb32
PH
1972def _windows_write_string(s, out):
1973 """ Returns True if the string was written using special methods,
1974 False if it has yet to be written out."""
1975 # Adapted from http://stackoverflow.com/a/3259271/35070
1976
b58ddb32
PH
1977 import ctypes.wintypes
1978
1979 WIN_OUTPUT_IDS = {
1980 1: -11,
1981 2: -12,
1982 }
1983
a383a98a
PH
1984 try:
1985 fileno = out.fileno()
1986 except AttributeError:
1987 # If the output stream doesn't have a fileno, it's virtual
1988 return False
aa42e873
PH
1989 except io.UnsupportedOperation:
1990 # Some strange Windows pseudo files?
1991 return False
b58ddb32
PH
1992 if fileno not in WIN_OUTPUT_IDS:
1993 return False
1994
d7cd9a9e 1995 GetStdHandle = compat_ctypes_WINFUNCTYPE(
b58ddb32 1996 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
d7cd9a9e 1997 ('GetStdHandle', ctypes.windll.kernel32))
b58ddb32
PH
1998 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1999
d7cd9a9e 2000 WriteConsoleW = compat_ctypes_WINFUNCTYPE(
b58ddb32
PH
2001 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
2002 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
d7cd9a9e 2003 ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
b58ddb32
PH
2004 written = ctypes.wintypes.DWORD(0)
2005
d7cd9a9e 2006 GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
b58ddb32
PH
2007 FILE_TYPE_CHAR = 0x0002
2008 FILE_TYPE_REMOTE = 0x8000
d7cd9a9e 2009 GetConsoleMode = compat_ctypes_WINFUNCTYPE(
b58ddb32
PH
2010 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
2011 ctypes.POINTER(ctypes.wintypes.DWORD))(
d7cd9a9e 2012 ('GetConsoleMode', ctypes.windll.kernel32))
b58ddb32
PH
2013 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
2014
2015 def not_a_console(handle):
2016 if handle == INVALID_HANDLE_VALUE or handle is None:
2017 return True
3089bc74
S
2018 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
2019 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
b58ddb32
PH
2020
2021 if not_a_console(h):
2022 return False
2023
d1b9c912
PH
2024 def next_nonbmp_pos(s):
2025 try:
2026 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
2027 except StopIteration:
2028 return len(s)
2029
2030 while s:
2031 count = min(next_nonbmp_pos(s), 1024)
2032
b58ddb32 2033 ret = WriteConsoleW(
d1b9c912 2034 h, s, count if count else 2, ctypes.byref(written), None)
b58ddb32
PH
2035 if ret == 0:
2036 raise OSError('Failed to write string')
d1b9c912
PH
2037 if not count: # We just wrote a non-BMP character
2038 assert written.value == 2
2039 s = s[1:]
2040 else:
2041 assert written.value > 0
2042 s = s[written.value:]
b58ddb32
PH
2043 return True
2044
2045
734f90bb 2046def write_string(s, out=None, encoding=None):
7459e3a2
PH
2047 if out is None:
2048 out = sys.stderr
8bf48f23 2049 assert type(s) == compat_str
7459e3a2 2050
b58ddb32
PH
2051 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
2052 if _windows_write_string(s, out):
2053 return
2054
3089bc74
S
2055 if ('b' in getattr(out, 'mode', '')
2056 or sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
104aa738
PH
2057 byt = s.encode(encoding or preferredencoding(), 'ignore')
2058 out.write(byt)
2059 elif hasattr(out, 'buffer'):
2060 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2061 byt = s.encode(enc, 'ignore')
2062 out.buffer.write(byt)
2063 else:
8bf48f23 2064 out.write(s)
7459e3a2
PH
2065 out.flush()
2066
2067
48ea9cea
PH
2068def bytes_to_intlist(bs):
2069 if not bs:
2070 return []
2071 if isinstance(bs[0], int): # Python 3
2072 return list(bs)
2073 else:
2074 return [ord(c) for c in bs]
2075
c257baff 2076
cba892fa 2077def intlist_to_bytes(xs):
2078 if not xs:
2079 return b''
edaa23f8 2080 return compat_struct_pack('%dB' % len(xs), *xs)
c38b1e77
PH
2081
2082
c1c9a79c
PH
2083# Cross-platform file locking
2084if sys.platform == 'win32':
2085 import ctypes.wintypes
2086 import msvcrt
2087
2088 class OVERLAPPED(ctypes.Structure):
2089 _fields_ = [
2090 ('Internal', ctypes.wintypes.LPVOID),
2091 ('InternalHigh', ctypes.wintypes.LPVOID),
2092 ('Offset', ctypes.wintypes.DWORD),
2093 ('OffsetHigh', ctypes.wintypes.DWORD),
2094 ('hEvent', ctypes.wintypes.HANDLE),
2095 ]
2096
2097 kernel32 = ctypes.windll.kernel32
2098 LockFileEx = kernel32.LockFileEx
2099 LockFileEx.argtypes = [
2100 ctypes.wintypes.HANDLE, # hFile
2101 ctypes.wintypes.DWORD, # dwFlags
2102 ctypes.wintypes.DWORD, # dwReserved
2103 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2104 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2105 ctypes.POINTER(OVERLAPPED) # Overlapped
2106 ]
2107 LockFileEx.restype = ctypes.wintypes.BOOL
2108 UnlockFileEx = kernel32.UnlockFileEx
2109 UnlockFileEx.argtypes = [
2110 ctypes.wintypes.HANDLE, # hFile
2111 ctypes.wintypes.DWORD, # dwReserved
2112 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2113 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2114 ctypes.POINTER(OVERLAPPED) # Overlapped
2115 ]
2116 UnlockFileEx.restype = ctypes.wintypes.BOOL
2117 whole_low = 0xffffffff
2118 whole_high = 0x7fffffff
2119
2120 def _lock_file(f, exclusive):
2121 overlapped = OVERLAPPED()
2122 overlapped.Offset = 0
2123 overlapped.OffsetHigh = 0
2124 overlapped.hEvent = 0
2125 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2126 handle = msvcrt.get_osfhandle(f.fileno())
2127 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
2128 whole_low, whole_high, f._lock_file_overlapped_p):
2129 raise OSError('Locking file failed: %r' % ctypes.FormatError())
2130
2131 def _unlock_file(f):
2132 assert f._lock_file_overlapped_p
2133 handle = msvcrt.get_osfhandle(f.fileno())
2134 if not UnlockFileEx(handle, 0,
2135 whole_low, whole_high, f._lock_file_overlapped_p):
2136 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2137
2138else:
399a76e6
YCH
2139 # Some platforms, such as Jython, are missing fcntl
2140 try:
2141 import fcntl
c1c9a79c 2142
399a76e6
YCH
2143 def _lock_file(f, exclusive):
2144 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
c1c9a79c 2145
399a76e6
YCH
2146 def _unlock_file(f):
2147 fcntl.flock(f, fcntl.LOCK_UN)
2148 except ImportError:
2149 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
2150
2151 def _lock_file(f, exclusive):
2152 raise IOError(UNSUPPORTED_MSG)
2153
2154 def _unlock_file(f):
2155 raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
2156
2157
2158class locked_file(object):
2159 def __init__(self, filename, mode, encoding=None):
2160 assert mode in ['r', 'a', 'w']
2161 self.f = io.open(filename, mode, encoding=encoding)
2162 self.mode = mode
2163
2164 def __enter__(self):
2165 exclusive = self.mode != 'r'
2166 try:
2167 _lock_file(self.f, exclusive)
2168 except IOError:
2169 self.f.close()
2170 raise
2171 return self
2172
2173 def __exit__(self, etype, value, traceback):
2174 try:
2175 _unlock_file(self.f)
2176 finally:
2177 self.f.close()
2178
2179 def __iter__(self):
2180 return iter(self.f)
2181
2182 def write(self, *args):
2183 return self.f.write(*args)
2184
2185 def read(self, *args):
2186 return self.f.read(*args)
4eb7f1d1
JMF
2187
2188
4644ac55
S
2189def get_filesystem_encoding():
2190 encoding = sys.getfilesystemencoding()
2191 return encoding if encoding is not None else 'utf-8'
2192
2193
4eb7f1d1 2194def shell_quote(args):
a6a173c2 2195 quoted_args = []
4644ac55 2196 encoding = get_filesystem_encoding()
a6a173c2
JMF
2197 for a in args:
2198 if isinstance(a, bytes):
2199 # We may get a filename encoded with 'encodeFilename'
2200 a = a.decode(encoding)
aefce8e6 2201 quoted_args.append(compat_shlex_quote(a))
28e614de 2202 return ' '.join(quoted_args)
9d4660ca
PH
2203
2204
2205def smuggle_url(url, data):
2206 """ Pass additional data in a URL for internal use. """
2207
81953d1a
RA
2208 url, idata = unsmuggle_url(url, {})
2209 data.update(idata)
15707c7e 2210 sdata = compat_urllib_parse_urlencode(
28e614de
PH
2211 {'__youtubedl_smuggle': json.dumps(data)})
2212 return url + '#' + sdata
9d4660ca
PH
2213
2214
79f82953 2215def unsmuggle_url(smug_url, default=None):
83e865a3 2216 if '#__youtubedl_smuggle' not in smug_url:
79f82953 2217 return smug_url, default
28e614de
PH
2218 url, _, sdata = smug_url.rpartition('#')
2219 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
2220 data = json.loads(jsond)
2221 return url, data
02dbf93f
PH
2222
2223
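# Illustrative usage sketch: smuggle_url()/unsmuggle_url() round-trip extra data
# through the URL fragment; the URL and payload below are arbitrary examples.
_smugged = smuggle_url('https://example.com/video', {'referer': 'https://example.com/'})
assert unsmuggle_url(_smugged) == ('https://example.com/video', {'referer': 'https://example.com/'})
assert unsmuggle_url('https://example.com/video', {}) == ('https://example.com/video', {})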
e0fd9573 2224def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2225 """ Formats numbers with decimal sufixes like K, M, etc """
2226 num, factor = float_or_none(num), float(factor)
2227 if num is None:
2228 return None
2229 exponent = 0 if num == 0 else int(math.log(num, factor))
abbeeebc 2230 suffix = ['', *'kMGTPEZY'][exponent]
2231 if factor == 1024:
2232 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
e0fd9573 2233 converted = num / (factor ** exponent)
abbeeebc 2234 return fmt % (converted, suffix)
e0fd9573 2235
2236
02dbf93f 2237def format_bytes(bytes):
f02d24d8 2238 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
f53c966a 2239
1c088fa8 2240
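# Illustrative usage sketch: format_decimal_suffix() picks the suffix from the
# exponent of `factor`, and format_bytes() is its factor=1024 variant; the
# numbers below are arbitrary examples.
assert format_decimal_suffix(123456, '%.2f%s') == '123.46k'
assert format_bytes(3000000) == '2.86MiB'
assert format_bytes(None) == 'N/A'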
fb47597b
S
2241def lookup_unit_table(unit_table, s):
2242 units_re = '|'.join(re.escape(u) for u in unit_table)
2243 m = re.match(
782b1b5b 2244 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
fb47597b
S
2245 if not m:
2246 return None
2247 num_str = m.group('num').replace(',', '.')
2248 mult = unit_table[m.group('unit')]
2249 return int(float(num_str) * mult)
2250
2251
be64b5b0
PH
2252def parse_filesize(s):
2253 if s is None:
2254 return None
2255
dfb1b146 2256 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
2257 # but we support those too
2258 _UNIT_TABLE = {
2259 'B': 1,
2260 'b': 1,
70852b47 2261 'bytes': 1,
be64b5b0
PH
2262 'KiB': 1024,
2263 'KB': 1000,
2264 'kB': 1024,
2265 'Kb': 1000,
13585d76 2266 'kb': 1000,
70852b47
YCH
2267 'kilobytes': 1000,
2268 'kibibytes': 1024,
be64b5b0
PH
2269 'MiB': 1024 ** 2,
2270 'MB': 1000 ** 2,
2271 'mB': 1024 ** 2,
2272 'Mb': 1000 ** 2,
13585d76 2273 'mb': 1000 ** 2,
70852b47
YCH
2274 'megabytes': 1000 ** 2,
2275 'mebibytes': 1024 ** 2,
be64b5b0
PH
2276 'GiB': 1024 ** 3,
2277 'GB': 1000 ** 3,
2278 'gB': 1024 ** 3,
2279 'Gb': 1000 ** 3,
13585d76 2280 'gb': 1000 ** 3,
70852b47
YCH
2281 'gigabytes': 1000 ** 3,
2282 'gibibytes': 1024 ** 3,
be64b5b0
PH
2283 'TiB': 1024 ** 4,
2284 'TB': 1000 ** 4,
2285 'tB': 1024 ** 4,
2286 'Tb': 1000 ** 4,
13585d76 2287 'tb': 1000 ** 4,
70852b47
YCH
2288 'terabytes': 1000 ** 4,
2289 'tebibytes': 1024 ** 4,
be64b5b0
PH
2290 'PiB': 1024 ** 5,
2291 'PB': 1000 ** 5,
2292 'pB': 1024 ** 5,
2293 'Pb': 1000 ** 5,
13585d76 2294 'pb': 1000 ** 5,
70852b47
YCH
2295 'petabytes': 1000 ** 5,
2296 'pebibytes': 1024 ** 5,
be64b5b0
PH
2297 'EiB': 1024 ** 6,
2298 'EB': 1000 ** 6,
2299 'eB': 1024 ** 6,
2300 'Eb': 1000 ** 6,
13585d76 2301 'eb': 1000 ** 6,
70852b47
YCH
2302 'exabytes': 1000 ** 6,
2303 'exbibytes': 1024 ** 6,
be64b5b0
PH
2304 'ZiB': 1024 ** 7,
2305 'ZB': 1000 ** 7,
2306 'zB': 1024 ** 7,
2307 'Zb': 1000 ** 7,
13585d76 2308 'zb': 1000 ** 7,
70852b47
YCH
2309 'zettabytes': 1000 ** 7,
2310 'zebibytes': 1024 ** 7,
be64b5b0
PH
2311 'YiB': 1024 ** 8,
2312 'YB': 1000 ** 8,
2313 'yB': 1024 ** 8,
2314 'Yb': 1000 ** 8,
13585d76 2315 'yb': 1000 ** 8,
70852b47
YCH
2316 'yottabytes': 1000 ** 8,
2317 'yobibytes': 1024 ** 8,
be64b5b0
PH
2318 }
2319
fb47597b
S
2320 return lookup_unit_table(_UNIT_TABLE, s)
2321
2322
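# Illustrative usage sketch: parse_filesize() understands both SI units
# (KB/MB/GB = powers of 1000) and binary units (KiB/MiB/GiB = powers of 1024);
# the sample strings are made up.
assert parse_filesize('2 MiB') == 2097152
assert parse_filesize('5 GB') == 5000000000
assert parse_filesize(None) is None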
2323def parse_count(s):
2324 if s is None:
be64b5b0
PH
2325 return None
2326
352d5da8 2327 s = re.sub(r'^[^\d]+\s', '', s).strip()
fb47597b
S
2328
2329 if re.match(r'^[\d,.]+$', s):
2330 return str_to_int(s)
2331
2332 _UNIT_TABLE = {
2333 'k': 1000,
2334 'K': 1000,
2335 'm': 1000 ** 2,
2336 'M': 1000 ** 2,
2337 'kk': 1000 ** 2,
2338 'KK': 1000 ** 2,
352d5da8 2339 'b': 1000 ** 3,
2340 'B': 1000 ** 3,
fb47597b 2341 }
be64b5b0 2342
352d5da8 2343 ret = lookup_unit_table(_UNIT_TABLE, s)
2344 if ret is not None:
2345 return ret
2346
2347 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2348 if mobj:
2349 return str_to_int(mobj.group(1))
be64b5b0 2350
2f7ae819 2351
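# Illustrative usage sketch: parse_count() turns human-readable counters
# (view counts etc.) into integers; the sample strings are made up.
assert parse_count('1,000 views') == 1000
assert parse_count('1.1M') == 1100000
assert parse_count(None) is None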
b871d7e9
S
2352def parse_resolution(s):
2353 if s is None:
2354 return {}
2355
17ec8bcf 2356 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
b871d7e9
S
2357 if mobj:
2358 return {
2359 'width': int(mobj.group('w')),
2360 'height': int(mobj.group('h')),
2361 }
2362
17ec8bcf 2363 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
b871d7e9
S
2364 if mobj:
2365 return {'height': int(mobj.group(1))}
2366
2367 mobj = re.search(r'\b([48])[kK]\b', s)
2368 if mobj:
2369 return {'height': int(mobj.group(1)) * 540}
2370
2371 return {}
2372
2373
0dc41787
S
2374def parse_bitrate(s):
2375 if not isinstance(s, compat_str):
2376 return
2377 mobj = re.search(r'\b(\d+)\s*kbps', s)
2378 if mobj:
2379 return int(mobj.group(1))
2380
2381
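# Illustrative usage sketch: parse_resolution() and parse_bitrate() pull
# dimensions and bitrates out of free-form strings; sample inputs are made up.
assert parse_resolution('1920x1080') == {'width': 1920, 'height': 1080}
assert parse_resolution('720p') == {'height': 720}
assert parse_bitrate('128 kbps') == 128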
a942d6cb 2382def month_by_name(name, lang='en'):
caefb1de
PH
2383 """ Return the number of a month by (locale-independently) English name """
2384
f6717dec 2385 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 2386
caefb1de 2387 try:
f6717dec 2388 return month_names.index(name) + 1
7105440c
YCH
2389 except ValueError:
2390 return None
2391
2392
2393def month_by_abbreviation(abbrev):
2394 """ Return the number of a month by (locale-independently) English
2395 abbreviations """
2396
2397 try:
2398 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
2399 except ValueError:
2400 return None
18258362
JMF
2401
2402
5aafe895 2403def fix_xml_ampersands(xml_str):
18258362 2404 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
2405 return re.sub(
2406 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 2407 '&amp;',
5aafe895 2408 xml_str)
e3946f98
PH
2409
2410
2411def setproctitle(title):
8bf48f23 2412 assert isinstance(title, compat_str)
c1c05c67
YCH
2413
2414 # ctypes in Jython is not complete
2415 # http://bugs.jython.org/issue2148
2416 if sys.platform.startswith('java'):
2417 return
2418
e3946f98 2419 try:
611c1dd9 2420 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
2421 except OSError:
2422 return
2f49bcd6
RC
2423 except TypeError:
2424 # LoadLibrary in Windows Python 2.7.13 only expects
2425 # a bytestring, but since unicode_literals turns
2426 # every string into a unicode string, it fails.
2427 return
6eefe533
PH
2428 title_bytes = title.encode('utf-8')
2429 buf = ctypes.create_string_buffer(len(title_bytes))
2430 buf.value = title_bytes
e3946f98 2431 try:
6eefe533 2432 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
2433 except AttributeError:
2434 return # Strange libc, just skip this
d7dda168
PH
2435
2436
2437def remove_start(s, start):
46bc9b7d 2438 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
2439
2440
2b9faf55 2441def remove_end(s, end):
46bc9b7d 2442 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
2443
2444
31b2051e
S
2445def remove_quotes(s):
2446 if s is None or len(s) < 2:
2447 return s
2448 for quote in ('"', "'", ):
2449 if s[0] == quote and s[-1] == quote:
2450 return s[1:-1]
2451 return s
2452
2453
b6e0c7d2
U
2454def get_domain(url):
2455 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2456 return domain.group('domain') if domain else None
2457
2458
29eb5174 2459def url_basename(url):
9b8aaeed 2460 path = compat_urlparse.urlparse(url).path
28e614de 2461 return path.strip('/').split('/')[-1]
aa94a6d3
PH
2462
2463
02dc0a36
S
2464def base_url(url):
2465 return re.match(r'https?://[^?#&]+/', url).group()
2466
2467
e34c3361 2468def urljoin(base, path):
4b5de77b
S
2469 if isinstance(path, bytes):
2470 path = path.decode('utf-8')
e34c3361
S
2471 if not isinstance(path, compat_str) or not path:
2472 return None
fad4ceb5 2473 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
e34c3361 2474 return path
4b5de77b
S
2475 if isinstance(base, bytes):
2476 base = base.decode('utf-8')
2477 if not isinstance(base, compat_str) or not re.match(
2478 r'^(?:https?:)?//', base):
e34c3361
S
2479 return None
2480 return compat_urlparse.urljoin(base, path)
2481
2482
aa94a6d3
PH
2483class HEADRequest(compat_urllib_request.Request):
2484 def get_method(self):
611c1dd9 2485 return 'HEAD'
7217e148
PH
2486
2487
95cf60e8
S
2488class PUTRequest(compat_urllib_request.Request):
2489 def get_method(self):
2490 return 'PUT'
2491
2492
9732d77e 2493def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
9e907ebd 2494 if get_attr and v is not None:
2495 v = getattr(v, get_attr, None)
1812afb7
S
2496 try:
2497 return int(v) * invscale // scale
31c49255 2498 except (ValueError, TypeError, OverflowError):
af98f8ff 2499 return default
9732d77e 2500
9572013d 2501
40a90862
JMF
2502def str_or_none(v, default=None):
2503 return default if v is None else compat_str(v)
2504
9732d77e
PH
2505
2506def str_to_int(int_str):
48d4681e 2507 """ A more relaxed version of int_or_none """
42db58ec 2508 if isinstance(int_str, compat_integer_types):
348c6bf1 2509 return int_str
42db58ec
S
2510 elif isinstance(int_str, compat_str):
2511 int_str = re.sub(r'[,\.\+]', '', int_str)
2512 return int_or_none(int_str)
608d11f5
PH
2513
2514
9732d77e 2515def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
2516 if v is None:
2517 return default
2518 try:
2519 return float(v) * invscale / scale
5e1271c5 2520 except (ValueError, TypeError):
caf80631 2521 return default
43f775e4
PH
2522
2523
c7e327c4
S
2524def bool_or_none(v, default=None):
2525 return v if isinstance(v, bool) else default
2526
2527
53cd37ba
S
2528def strip_or_none(v, default=None):
2529 return v.strip() if isinstance(v, compat_str) else default
b72b4431
S
2530
2531
af03000a
S
2532def url_or_none(url):
2533 if not url or not isinstance(url, compat_str):
2534 return None
2535 url = url.strip()
29f7c58a 2536 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
2537
2538
e29663c6 2539def strftime_or_none(timestamp, date_format, default=None):
2540 datetime_object = None
2541 try:
2542 if isinstance(timestamp, compat_numeric_types): # unix timestamp
2543 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2544 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2545 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2546 return datetime_object.strftime(date_format)
2547 except (ValueError, TypeError, AttributeError):
2548 return default
2549
2550
608d11f5 2551def parse_duration(s):
8f9312c3 2552 if not isinstance(s, compat_basestring):
608d11f5 2553 return None
ca7b3246 2554 s = s.strip()
38d79fd1 2555 if not s:
2556 return None
ca7b3246 2557
acaff495 2558 days, hours, mins, secs, ms = [None] * 5
8bd1c00b 2559 m = re.match(r'''(?x)
2560 (?P<before_secs>
2561 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2562 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2563 (?P<ms>[.:][0-9]+)?Z?$
2564 ''', s)
acaff495 2565 if m:
8bd1c00b 2566 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
acaff495 2567 else:
2568 m = re.match(
056653bb
S
2569 r'''(?ix)(?:P?
2570 (?:
2571 [0-9]+\s*y(?:ears?)?\s*
2572 )?
2573 (?:
2574 [0-9]+\s*m(?:onths?)?\s*
2575 )?
2576 (?:
2577 [0-9]+\s*w(?:eeks?)?\s*
2578 )?
8f4b58d7 2579 (?:
acaff495 2580 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
8f4b58d7 2581 )?
056653bb 2582 T)?
acaff495 2583 (?:
2584 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
2585 )?
2586 (?:
2587 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
2588 )?
2589 (?:
2590 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2591 )?Z?$''', s)
acaff495 2592 if m:
2593 days, hours, mins, secs, ms = m.groups()
2594 else:
15846398 2595 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2596 if m:
2597 hours, mins = m.groups()
2598 else:
2599 return None
2600
2601 duration = 0
2602 if secs:
2603 duration += float(secs)
2604 if mins:
2605 duration += float(mins) * 60
2606 if hours:
2607 duration += float(hours) * 60 * 60
2608 if days:
2609 duration += float(days) * 24 * 60 * 60
2610 if ms:
8bd1c00b 2611 duration += float(ms.replace(':', '.'))
acaff495 2612 return duration
91d7d0b3
JMF
2613
2614
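# Illustrative usage sketch: parse_duration() accepts clock-style, ISO 8601-style
# and free-text durations and returns seconds; the sample strings are made up.
assert parse_duration('9:12:43') == 33163
assert parse_duration('PT1H30M') == 5400
assert parse_duration('2.5 min') == 150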
e65e4c88 2615def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 2616 name, real_ext = os.path.splitext(filename)
e65e4c88
S
2617 return (
2618 '{0}.{1}{2}'.format(name, ext, real_ext)
2619 if not expected_real_ext or real_ext[1:] == expected_real_ext
2620 else '{0}.{1}'.format(filename, ext))
d70ad093
PH
2621
2622
b3ed15b7
S
2623def replace_extension(filename, ext, expected_real_ext=None):
2624 name, real_ext = os.path.splitext(filename)
2625 return '{0}.{1}'.format(
2626 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2627 ext)
2628
2629
d70ad093
PH
2630def check_executable(exe, args=[]):
2631 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2632 args can be a list of arguments for a short output (like -version) """
2633 try:
d3c93ec2 2634 Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
d70ad093
PH
2635 except OSError:
2636 return False
2637 return exe
b7ab0590
PH
2638
2639
9af98e17 2640def _get_exe_version_output(exe, args):
95807118 2641 try:
b64d04c1 2642 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
7a5c1cfe 2643 # SIGTTOU if yt-dlp is run in the background.
067aa17e 2644 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
d3c93ec2 2645 out, _ = Popen(
2646 [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2647 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
95807118
PH
2648 except OSError:
2649 return False
cae97f65
PH
2650 if isinstance(out, bytes): # Python 2.x
2651 out = out.decode('ascii', 'ignore')
9af98e17 2652 return out
cae97f65
PH
2653
2654
2655def detect_exe_version(output, version_re=None, unrecognized='present'):
2656 assert isinstance(output, compat_str)
2657 if version_re is None:
2658 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2659 m = re.search(version_re, output)
95807118
PH
2660 if m:
2661 return m.group(1)
2662 else:
2663 return unrecognized
2664
2665
9af98e17 2666def get_exe_version(exe, args=['--version'],
2667 version_re=None, unrecognized='present'):
2668 """ Returns the version of the specified executable,
2669 or False if the executable is not present """
2670 out = _get_exe_version_output(exe, args)
2671 return detect_exe_version(out, version_re, unrecognized) if out else False
2672
2673
cb89cfc1 2674class LazyList(collections.abc.Sequence):
483336e7 2675 ''' Lazy immutable list from an iterable
2676 Note that slices of a LazyList are lists and not LazyList'''
2677
8e5fecc8 2678 class IndexError(IndexError):
2679 pass
2680
282f5709 2681 def __init__(self, iterable, *, reverse=False, _cache=None):
483336e7 2682 self.__iterable = iter(iterable)
282f5709 2683 self.__cache = [] if _cache is None else _cache
2684 self.__reversed = reverse
483336e7 2685
2686 def __iter__(self):
28419ca2 2687 if self.__reversed:
2688 # We need to consume the entire iterable to iterate in reverse
981052c9 2689 yield from self.exhaust()
28419ca2 2690 return
2691 yield from self.__cache
483336e7 2692 for item in self.__iterable:
2693 self.__cache.append(item)
2694 yield item
2695
981052c9 2696 def __exhaust(self):
483336e7 2697 self.__cache.extend(self.__iterable)
9f1a1c36 2698 # Discard the emptied iterable to make it pickle-able
2699 self.__iterable = []
28419ca2 2700 return self.__cache
2701
981052c9 2702 def exhaust(self):
2703 ''' Evaluate the entire iterable '''
2704 return self.__exhaust()[::-1 if self.__reversed else 1]
2705
28419ca2 2706 @staticmethod
981052c9 2707 def __reverse_index(x):
e0f2b4b4 2708 return None if x is None else -(x + 1)
483336e7 2709
2710 def __getitem__(self, idx):
2711 if isinstance(idx, slice):
28419ca2 2712 if self.__reversed:
e0f2b4b4 2713 idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
2714 start, stop, step = idx.start, idx.stop, idx.step or 1
483336e7 2715 elif isinstance(idx, int):
28419ca2 2716 if self.__reversed:
981052c9 2717 idx = self.__reverse_index(idx)
e0f2b4b4 2718 start, stop, step = idx, idx, 0
483336e7 2719 else:
2720 raise TypeError('indices must be integers or slices')
e0f2b4b4 2721 if ((start or 0) < 0 or (stop or 0) < 0
2722 or (start is None and step < 0)
2723 or (stop is None and step > 0)):
483336e7 2724 # We need to consume the entire iterable to be able to slice from the end
2725 # Obviously, never use this with infinite iterables
8e5fecc8 2726 self.__exhaust()
2727 try:
2728 return self.__cache[idx]
2729 except IndexError as e:
2730 raise self.IndexError(e) from e
e0f2b4b4 2731 n = max(start or 0, stop or 0) - len(self.__cache) + 1
28419ca2 2732 if n > 0:
2733 self.__cache.extend(itertools.islice(self.__iterable, n))
8e5fecc8 2734 try:
2735 return self.__cache[idx]
2736 except IndexError as e:
2737 raise self.IndexError(e) from e
483336e7 2738
2739 def __bool__(self):
2740 try:
28419ca2 2741 self[-1] if self.__reversed else self[0]
8e5fecc8 2742 except self.IndexError:
483336e7 2743 return False
2744 return True
2745
2746 def __len__(self):
8e5fecc8 2747 self.__exhaust()
483336e7 2748 return len(self.__cache)
2749
282f5709 2750 def __reversed__(self):
2751 return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)
2752
2753 def __copy__(self):
2754 return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)
2755
28419ca2 2756 def __repr__(self):
2757 # repr and str should mimic a list. So we exhaust the iterable
2758 return repr(self.exhaust())
2759
2760 def __str__(self):
2761 return repr(self.exhaust())
2762
483336e7 2763
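# Illustrative usage sketch: LazyList caches items from the underlying iterable
# on demand, so indexing into an infinite iterator is safe as long as no
# negative or unbounded access is made. itertools is already imported above.
_lazy = LazyList(itertools.count())
assert _lazy[:5] == [0, 1, 2, 3, 4]  # slices are plain lists, not LazyList
assert _lazy[7] == 7                 # only consumes the iterator up to index 7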
7be9ccff 2764class PagedList:
c07a39ae 2765
2766 class IndexError(IndexError):
2767 pass
2768
dd26ced1
PH
2769 def __len__(self):
2770 # This is only useful for tests
2771 return len(self.getslice())
2772
7be9ccff 2773 def __init__(self, pagefunc, pagesize, use_cache=True):
2774 self._pagefunc = pagefunc
2775 self._pagesize = pagesize
2776 self._use_cache = use_cache
2777 self._cache = {}
2778
2779 def getpage(self, pagenum):
d8cf8d97 2780 page_results = self._cache.get(pagenum)
2781 if page_results is None:
2782 page_results = list(self._pagefunc(pagenum))
7be9ccff 2783 if self._use_cache:
2784 self._cache[pagenum] = page_results
2785 return page_results
2786
2787 def getslice(self, start=0, end=None):
2788 return list(self._getslice(start, end))
2789
2790 def _getslice(self, start, end):
55575225 2791 raise NotImplementedError('This method must be implemented by subclasses')
2792
2793 def __getitem__(self, idx):
7be9ccff 2794 # NOTE: cache must be enabled if this is used
55575225 2795 if not isinstance(idx, int) or idx < 0:
2796 raise TypeError('indices must be non-negative integers')
2797 entries = self.getslice(idx, idx + 1)
d8cf8d97 2798 if not entries:
c07a39ae 2799 raise self.IndexError()
d8cf8d97 2800 return entries[0]
55575225 2801
9c44d242
PH
2802
2803class OnDemandPagedList(PagedList):
7be9ccff 2804 def _getslice(self, start, end):
b7ab0590
PH
2805 for pagenum in itertools.count(start // self._pagesize):
2806 firstid = pagenum * self._pagesize
2807 nextfirstid = pagenum * self._pagesize + self._pagesize
2808 if start >= nextfirstid:
2809 continue
2810
b7ab0590
PH
2811 startv = (
2812 start % self._pagesize
2813 if firstid <= start < nextfirstid
2814 else 0)
b7ab0590
PH
2815 endv = (
2816 ((end - 1) % self._pagesize) + 1
2817 if (end is not None and firstid <= end <= nextfirstid)
2818 else None)
2819
7be9ccff 2820 page_results = self.getpage(pagenum)
b7ab0590
PH
2821 if startv != 0 or endv is not None:
2822 page_results = page_results[startv:endv]
7be9ccff 2823 yield from page_results
b7ab0590
PH
2824
2825 # A little optimization - if the current page is not "full", i.e. does
2826 # not contain page_size videos, then we can assume that this page
2827 # is the last one - there are no more ids on further pages -
2828 # i.e. no need to query again.
2829 if len(page_results) + startv < self._pagesize:
2830 break
2831
2832 # If we got the whole page, but the next page is not interesting,
2833 # break out early as well
2834 if end == nextfirstid:
2835 break
81c2f20b
PH
2836
2837
9c44d242
PH
2838class InAdvancePagedList(PagedList):
2839 def __init__(self, pagefunc, pagecount, pagesize):
9c44d242 2840 self._pagecount = pagecount
7be9ccff 2841 PagedList.__init__(self, pagefunc, pagesize, True)
9c44d242 2842
7be9ccff 2843 def _getslice(self, start, end):
9c44d242
PH
2844 start_page = start // self._pagesize
2845 end_page = (
2846 self._pagecount if end is None else (end // self._pagesize + 1))
2847 skip_elems = start - start_page * self._pagesize
2848 only_more = None if end is None else end - start
2849 for pagenum in range(start_page, end_page):
7be9ccff 2850 page_results = self.getpage(pagenum)
9c44d242 2851 if skip_elems:
7be9ccff 2852 page_results = page_results[skip_elems:]
9c44d242
PH
2853 skip_elems = None
2854 if only_more is not None:
7be9ccff 2855 if len(page_results) < only_more:
2856 only_more -= len(page_results)
9c44d242 2857 else:
7be9ccff 2858 yield from page_results[:only_more]
9c44d242 2859 break
7be9ccff 2860 yield from page_results
9c44d242
PH
2861
2862
81c2f20b 2863def uppercase_escape(s):
676eb3f2 2864 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 2865 return re.sub(
a612753d 2866 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
2867 lambda m: unicode_escape(m.group(0))[0],
2868 s)
0fe2ff78
YCH
2869
2870
2871def lowercase_escape(s):
2872 unicode_escape = codecs.getdecoder('unicode_escape')
2873 return re.sub(
2874 r'\\u[0-9a-fA-F]{4}',
2875 lambda m: unicode_escape(m.group(0))[0],
2876 s)
b53466e1 2877
d05cfe06
S
2878
2879def escape_rfc3986(s):
2880 """Escape non-ASCII characters as suggested by RFC 3986"""
8f9312c3 2881 if sys.version_info < (3, 0) and isinstance(s, compat_str):
d05cfe06 2882 s = s.encode('utf-8')
ecc0c5ee 2883 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2884
2885
2886def escape_url(url):
2887 """Escape URL as suggested by RFC 3986"""
2888 url_parsed = compat_urllib_parse_urlparse(url)
2889 return url_parsed._replace(
efbed08d 2890 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
2891 path=escape_rfc3986(url_parsed.path),
2892 params=escape_rfc3986(url_parsed.params),
2893 query=escape_rfc3986(url_parsed.query),
2894 fragment=escape_rfc3986(url_parsed.fragment)
2895 ).geturl()
2896
62e609ab 2897
4dfbf869 2898def parse_qs(url):
2899 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2900
2901
62e609ab
PH
2902def read_batch_urls(batch_fd):
2903 def fixup(url):
2904 if not isinstance(url, compat_str):
2905 url = url.decode('utf-8', 'replace')
8c04f0be 2906 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2907 for bom in BOM_UTF8:
2908 if url.startswith(bom):
2909 url = url[len(bom):]
2910 url = url.lstrip()
2911 if not url or url.startswith(('#', ';', ']')):
62e609ab 2912 return False
8c04f0be 2913 # "#" cannot be stripped out since it is part of the URI
2914 # However, it can be safely stripped out if following a whitespace
2915 return re.split(r'\s#', url, 1)[0].rstrip()
62e609ab
PH
2916
2917 with contextlib.closing(batch_fd) as fd:
2918 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2919
2920
2921def urlencode_postdata(*args, **kargs):
15707c7e 2922 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
2923
2924
38f9ef31 2925def update_url_query(url, query):
cacd9966
YCH
2926 if not query:
2927 return url
38f9ef31 2928 parsed_url = compat_urlparse.urlparse(url)
2929 qs = compat_parse_qs(parsed_url.query)
2930 qs.update(query)
2931 return compat_urlparse.urlunparse(parsed_url._replace(
15707c7e 2932 query=compat_urllib_parse_urlencode(qs, True)))
16392824 2933
8e60dc75 2934
ed0291d1
S
2935def update_Request(req, url=None, data=None, headers={}, query={}):
2936 req_headers = req.headers.copy()
2937 req_headers.update(headers)
2938 req_data = data or req.data
2939 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
2940 req_get_method = req.get_method()
2941 if req_get_method == 'HEAD':
2942 req_type = HEADRequest
2943 elif req_get_method == 'PUT':
2944 req_type = PUTRequest
2945 else:
2946 req_type = compat_urllib_request.Request
ed0291d1
S
2947 new_req = req_type(
2948 req_url, data=req_data, headers=req_headers,
2949 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2950 if hasattr(req, 'timeout'):
2951 new_req.timeout = req.timeout
2952 return new_req
2953
2954
10c87c15 2955def _multipart_encode_impl(data, boundary):
0c265486
YCH
2956 content_type = 'multipart/form-data; boundary=%s' % boundary
2957
2958 out = b''
2959 for k, v in data.items():
2960 out += b'--' + boundary.encode('ascii') + b'\r\n'
2961 if isinstance(k, compat_str):
2962 k = k.encode('utf-8')
2963 if isinstance(v, compat_str):
2964 v = v.encode('utf-8')
2965 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2966 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 2967 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
2968 if boundary.encode('ascii') in content:
2969 raise ValueError('Boundary overlaps with data')
2970 out += content
2971
2972 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2973
2974 return out, content_type
2975
2976
2977def multipart_encode(data, boundary=None):
2978 '''
2979 Encode a dict to RFC 7578-compliant form-data
2980
2981 data:
2982 A dict where keys and values can be either Unicode or bytes-like
2983 objects.
2984 boundary:
2985 If specified, it must be a Unicode object and is used as the boundary. Otherwise
2986 a random boundary is generated.
2987
2988 Reference: https://tools.ietf.org/html/rfc7578
2989 '''
2990 has_specified_boundary = boundary is not None
2991
2992 while True:
2993 if boundary is None:
2994 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2995
2996 try:
10c87c15 2997 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
2998 break
2999 except ValueError:
3000 if has_specified_boundary:
3001 raise
3002 boundary = None
3003
3004 return out, content_type
3005
3006
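# Illustrative usage sketch: multipart_encode() returns the encoded body and the
# matching Content-Type header; the fixed boundary 'X' below is a toy value used
# only to keep the expected output readable.
_body, _ctype = multipart_encode({'field': 'value'}, boundary='X')
assert _ctype == 'multipart/form-data; boundary=X'
assert _body == b'--X\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--X--\r\n'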
86296ad2 3007def dict_get(d, key_or_keys, default=None, skip_false_values=True):
cbecc9b9
S
3008 if isinstance(key_or_keys, (list, tuple)):
3009 for key in key_or_keys:
86296ad2
S
3010 if key not in d or d[key] is None or skip_false_values and not d[key]:
3011 continue
3012 return d[key]
cbecc9b9
S
3013 return default
3014 return d.get(key_or_keys, default)
3015
3016
329ca3be 3017def try_get(src, getter, expected_type=None):
6606817a 3018 for get in variadic(getter):
a32a9a7e
S
3019 try:
3020 v = get(src)
3021 except (AttributeError, KeyError, TypeError, IndexError):
3022 pass
3023 else:
3024 if expected_type is None or isinstance(v, expected_type):
3025 return v
329ca3be
S
3026
3027
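# Illustrative usage sketch: dict_get() skips None/falsy values when given
# several keys, and try_get() swallows the usual lookup errors while optionally
# type-checking the result; the sample data below is made up.
assert dict_get({'a': None, 'b': 2}, ('a', 'b')) == 2
assert try_get({'a': [1, 2]}, lambda x: x['a'][1], int) == 2
assert try_get(None, lambda x: x['a']) is None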
6cc62232
S
3028def merge_dicts(*dicts):
3029 merged = {}
3030 for a_dict in dicts:
3031 for k, v in a_dict.items():
3032 if v is None:
3033 continue
3089bc74
S
3034 if (k not in merged
3035 or (isinstance(v, compat_str) and v
3036 and isinstance(merged[k], compat_str)
3037 and not merged[k])):
6cc62232
S
3038 merged[k] = v
3039 return merged
3040
3041
8e60dc75
S
3042def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3043 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3044
16392824 3045
a1a530b0
PH
3046US_RATINGS = {
3047 'G': 0,
3048 'PG': 10,
3049 'PG-13': 13,
3050 'R': 16,
3051 'NC': 18,
3052}
fac55558
PH
3053
3054
a8795327 3055TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
3056 'TV-Y': 0,
3057 'TV-Y7': 7,
3058 'TV-G': 0,
3059 'TV-PG': 0,
3060 'TV-14': 14,
3061 'TV-MA': 17,
a8795327
S
3062}
3063
3064
146c80e2 3065def parse_age_limit(s):
a8795327
S
3066 if type(s) == int:
3067 return s if 0 <= s <= 21 else None
3068 if not isinstance(s, compat_basestring):
d838b1bd 3069 return None
146c80e2 3070 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
3071 if m:
3072 return int(m.group('age'))
5c5fae6d 3073 s = s.upper()
a8795327
S
3074 if s in US_RATINGS:
3075 return US_RATINGS[s]
5a16c9d9 3076 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 3077 if m:
5a16c9d9 3078 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 3079 return None
146c80e2
S
3080
3081
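# Illustrative usage sketch: parse_age_limit() accepts plain integers, "NN+"
# strings, US movie ratings and TV parental guidelines; inputs are made up.
assert parse_age_limit(18) == 18
assert parse_age_limit('PG-13') == 13
assert parse_age_limit('TV-MA') == 17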
fac55558 3082def strip_jsonp(code):
609a61e3 3083 return re.sub(
5552c9eb 3084 r'''(?sx)^
e9c671d5 3085 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
3086 (?:\s*&&\s*(?P=func_name))?
3087 \s*\(\s*(?P<callback_data>.*)\);?
3088 \s*?(?://[^\n]*)*$''',
3089 r'\g<callback_data>', code)
478c2c61
PH
3090
3091
5c610515 3092def js_to_json(code, vars={}):
3093 # vars is a dict of var, val pairs to substitute
c843e685 3094 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
4195096e
S
3095 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
3096 INTEGER_TABLE = (
3097 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
3098 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
3099 )
3100
e05f6939 3101 def fix_kv(m):
e7b6d122
PH
3102 v = m.group(0)
3103 if v in ('true', 'false', 'null'):
3104 return v
421ddcb8
C
3105 elif v in ('undefined', 'void 0'):
3106 return 'null'
8bdd16b4 3107 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
bd1e4844 3108 return ""
3109
3110 if v[0] in ("'", '"'):
3111 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 3112 '"': '\\"',
bd1e4844 3113 "\\'": "'",
3114 '\\\n': '',
3115 '\\x': '\\u00',
3116 }.get(m.group(0), m.group(0)), v[1:-1])
8bdd16b4 3117 else:
3118 for regex, base in INTEGER_TABLE:
3119 im = re.match(regex, v)
3120 if im:
3121 i = int(im.group(1), base)
3122 return '"%d":' % i if v.endswith(':') else '%d' % i
89ac4a19 3123
5c610515 3124 if v in vars:
3125 return vars[v]
3126
e7b6d122 3127 return '"%s"' % v
e05f6939 3128
bd1e4844 3129 return re.sub(r'''(?sx)
3130 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3131 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4195096e 3132 {comment}|,(?={skip}[\]}}])|
421ddcb8 3133 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
4195096e 3134 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
8bdd16b4 3135 [0-9]+(?={skip}:)|
3136 !+
4195096e 3137 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
e05f6939
PH
3138
3139
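# Illustrative usage sketch: js_to_json() rewrites a JavaScript object literal
# (bare keys, single quotes, hex numbers, trailing commas) into valid JSON; the
# literal below is a made-up example. json is already imported above.
assert json.loads(js_to_json("{abc: true, 'def': 0x10}")) == {'abc': True, 'def': 16}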
478c2c61
PH
3140def qualities(quality_ids):
3141 """ Get a numeric quality value out of a list of possible values """
3142 def q(qid):
3143 try:
3144 return quality_ids.index(qid)
3145 except ValueError:
3146 return -1
3147 return q
3148
acd69589 3149
ebed8b37 3150POSTPROCESS_WHEN = {'pre_process', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
1e43a6f7 3151
3152
de6000d9 3153DEFAULT_OUTTMPL = {
3154 'default': '%(title)s [%(id)s].%(ext)s',
72755351 3155 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
de6000d9 3156}
3157OUTTMPL_TYPES = {
72755351 3158 'chapter': None,
de6000d9 3159 'subtitle': None,
3160 'thumbnail': None,
3161 'description': 'description',
3162 'annotation': 'annotations.xml',
3163 'infojson': 'info.json',
08438d2c 3164 'link': None,
3b603dbd 3165 'pl_video': None,
5112f26a 3166 'pl_thumbnail': None,
de6000d9 3167 'pl_description': 'description',
3168 'pl_infojson': 'info.json',
3169}
0a871f68 3170
143db31d 3171# As of [1] format syntax is:
3172# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3173# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
901130bb 3174STR_FORMAT_RE_TMPL = r'''(?x)
3175 (?<!%)(?P<prefix>(?:%%)*)
143db31d 3176 %
524e2e4f 3177 (?P<has_key>\((?P<key>{0})\))?
752cda38 3178 (?P<format>
524e2e4f 3179 (?P<conversion>[#0\-+ ]+)?
3180 (?P<min_width>\d+)?
3181 (?P<precision>\.\d+)?
3182 (?P<len_mod>[hlL])? # unused in python
901130bb 3183 {1} # conversion type
752cda38 3184 )
143db31d 3185'''
3186
7d1eb38a 3187
901130bb 3188STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3189
7d1eb38a 3190
a020a0dc
PH
3191def limit_length(s, length):
3192 """ Add ellipses to overly long strings """
3193 if s is None:
3194 return None
3195 ELLIPSES = '...'
3196 if len(s) > length:
3197 return s[:length - len(ELLIPSES)] + ELLIPSES
3198 return s
48844745
PH
3199
3200
3201def version_tuple(v):
5f9b8394 3202 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
3203
3204
3205def is_outdated_version(version, limit, assume_new=True):
3206 if not version:
3207 return not assume_new
3208 try:
3209 return version_tuple(version) < version_tuple(limit)
3210 except ValueError:
3211 return not assume_new
732ea2f0
PH
3212
3213
3214def ytdl_is_updateable():
7a5c1cfe 3215 """ Returns if yt-dlp can be updated with -U """
735d865e 3216
5d535b4a 3217 from .update import is_non_updateable
732ea2f0 3218
5d535b4a 3219 return not is_non_updateable()
7d4111ed
PH
3220
3221
3222def args_to_str(args):
3223 # Get a short string representation for a subprocess command
702ccf2d 3224 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
3225
3226
9b9c5355 3227def error_to_compat_str(err):
fdae2358
S
3228 err_str = str(err)
3229 # On python 2 error byte string must be decoded with proper
3230 # encoding rather than ascii
3231 if sys.version_info[0] < 3:
3232 err_str = err_str.decode(preferredencoding())
3233 return err_str
3234
3235
c460bdd5 3236def mimetype2ext(mt):
eb9ee194
S
3237 if mt is None:
3238 return None
3239
9359f3d4
F
3240 mt, _, params = mt.partition(';')
3241 mt = mt.strip()
3242
3243 FULL_MAP = {
765ac263 3244 'audio/mp4': 'm4a',
6c33d24b
YCH
3245 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3246 # it's the most popular one
3247 'audio/mpeg': 'mp3',
ba39289d 3248 'audio/x-wav': 'wav',
9359f3d4
F
3249 'audio/wav': 'wav',
3250 'audio/wave': 'wav',
3251 }
3252
3253 ext = FULL_MAP.get(mt)
765ac263
JMF
3254 if ext is not None:
3255 return ext
3256
9359f3d4 3257 SUBTYPE_MAP = {
f6861ec9 3258 '3gpp': '3gp',
cafcf657 3259 'smptett+xml': 'tt',
cafcf657 3260 'ttaf+xml': 'dfxp',
a0d8d704 3261 'ttml+xml': 'ttml',
f6861ec9 3262 'x-flv': 'flv',
a0d8d704 3263 'x-mp4-fragmented': 'mp4',
d4f05d47 3264 'x-ms-sami': 'sami',
a0d8d704 3265 'x-ms-wmv': 'wmv',
b4173f15
RA
3266 'mpegurl': 'm3u8',
3267 'x-mpegurl': 'm3u8',
3268 'vnd.apple.mpegurl': 'm3u8',
3269 'dash+xml': 'mpd',
b4173f15 3270 'f4m+xml': 'f4m',
f164b971 3271 'hds+xml': 'f4m',
e910fe2f 3272 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 3273 'quicktime': 'mov',
98ce1a3f 3274 'mp2t': 'ts',
39e7107d 3275 'x-wav': 'wav',
9359f3d4
F
3276 'filmstrip+json': 'fs',
3277 'svg+xml': 'svg',
3278 }
3279
3280 _, _, subtype = mt.rpartition('/')
3281 ext = SUBTYPE_MAP.get(subtype.lower())
3282 if ext is not None:
3283 return ext
3284
3285 SUFFIX_MAP = {
3286 'json': 'json',
3287 'xml': 'xml',
3288 'zip': 'zip',
3289 'gzip': 'gz',
3290 }
3291
3292 _, _, suffix = subtype.partition('+')
3293 ext = SUFFIX_MAP.get(suffix)
3294 if ext is not None:
3295 return ext
3296
3297 return subtype.replace('+', '.')
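
# Illustrative expectations (editor's sketch): the three lookup stages above
# resolve common MIME types roughly as follows.
def _mimetype2ext_example():
    assert mimetype2ext('audio/mp4') == 'm4a'                  # FULL_MAP
    assert mimetype2ext('application/x-mpegurl') == 'm3u8'     # SUBTYPE_MAP
    assert mimetype2ext('application/ld+json') == 'json'       # SUFFIX_MAP
    assert mimetype2ext('video/mp4; codecs="avc1"') == 'mp4'   # parameters are stripped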
c460bdd5
PH
3298
3299
2814f12b
THD
3300def ext2mimetype(ext_or_url):
3301 if not ext_or_url:
3302 return None
3303 if '.' not in ext_or_url:
3304 ext_or_url = f'file.{ext_or_url}'
3305 return mimetypes.guess_type(ext_or_url)[0]
3306
3307
4f3c5e06 3308def parse_codecs(codecs_str):
3309 # http://tools.ietf.org/html/rfc6381
3310 if not codecs_str:
3311 return {}
a0566bbf 3312 split_codecs = list(filter(None, map(
dbf5416a 3313 str.strip, codecs_str.strip().strip(',').split(','))))
4afa3ec4 3314 vcodec, acodec, tcodec, hdr = None, None, None, None
a0566bbf 3315 for full_codec in split_codecs:
9bd979ca 3316 parts = full_codec.split('.')
3317 codec = parts[0].replace('0', '')
3318 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3319 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
4f3c5e06 3320 if not vcodec:
b69fd25c 3321 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
176f1866 3322 if codec in ('dvh1', 'dvhe'):
3323 hdr = 'DV'
9bd979ca 3324 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3325 hdr = 'HDR10'
3326 elif full_codec.replace('0', '').startswith('vp9.2'):
176f1866 3327 hdr = 'HDR10'
b69fd25c 3328 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
4f3c5e06 3329 if not acodec:
3330 acodec = full_codec
4afa3ec4
F
3331 elif codec in ('stpp', 'wvtt',):
3332 if not tcodec:
3333 tcodec = full_codec
4f3c5e06 3334 else:
60f5c9fb 3335 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
4afa3ec4 3336 if vcodec or acodec or tcodec:
4f3c5e06 3337 return {
3338 'vcodec': vcodec or 'none',
3339 'acodec': acodec or 'none',
176f1866 3340 'dynamic_range': hdr,
4afa3ec4 3341 **({'tcodec': tcodec} if tcodec is not None else {}),
4f3c5e06 3342 }
b69fd25c 3343 elif len(split_codecs) == 2:
3344 return {
3345 'vcodec': split_codecs[0],
3346 'acodec': split_codecs[1],
3347 }
4f3c5e06 3348 return {}
3349
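# Illustrative parse (editor's sketch): a typical DASH/HLS CODECS attribute
# splits into a video and an audio codec, with Dolby Vision reported as 'DV'.
def _parse_codecs_example():
    info = parse_codecs('avc1.64001f, mp4a.40.2')
    assert info['vcodec'] == 'avc1.64001f' and info['acodec'] == 'mp4a.40.2'
    assert parse_codecs('dvh1.05.06')['dynamic_range'] == 'DV'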
3350
2ccd1b10 3351def urlhandle_detect_ext(url_handle):
79298173 3352 getheader = url_handle.headers.get
2ccd1b10 3353
b55ee18f
PH
3354 cd = getheader('Content-Disposition')
3355 if cd:
3356 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3357 if m:
3358 e = determine_ext(m.group('filename'), default_ext=None)
3359 if e:
3360 return e
3361
c460bdd5 3362 return mimetype2ext(getheader('Content-Type'))
05900629
PH
3363
3364
1e399778
YCH
3365def encode_data_uri(data, mime_type):
3366 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3367
3368
05900629 3369def age_restricted(content_limit, age_limit):
6ec6cb4e 3370 """ Returns True iff the content should be blocked """
05900629
PH
3371
3372 if age_limit is None: # No limit set
3373 return False
3374 if content_limit is None:
3375 return False # Content available for everyone
3376 return age_limit < content_limit
61ca9a80
PH
3377
3378
3379def is_html(first_bytes):
3380 """ Detect whether a file contains HTML by examining its first bytes. """
3381
3382 BOMS = [
3383 (b'\xef\xbb\xbf', 'utf-8'),
3384 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3385 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3386 (b'\xff\xfe', 'utf-16-le'),
3387 (b'\xfe\xff', 'utf-16-be'),
3388 ]
3389 for bom, enc in BOMS:
3390 if first_bytes.startswith(bom):
3391 s = first_bytes[len(bom):].decode(enc, 'replace')
3392 break
3393 else:
3394 s = first_bytes.decode('utf-8', 'replace')
3395
3396 return re.match(r'^\s*<', s)
a055469f
PH
3397
3398
3399def determine_protocol(info_dict):
3400 protocol = info_dict.get('protocol')
3401 if protocol is not None:
3402 return protocol
3403
7de837a5 3404 url = sanitize_url(info_dict['url'])
a055469f
PH
3405 if url.startswith('rtmp'):
3406 return 'rtmp'
3407 elif url.startswith('mms'):
3408 return 'mms'
3409 elif url.startswith('rtsp'):
3410 return 'rtsp'
3411
3412 ext = determine_ext(url)
3413 if ext == 'm3u8':
3414 return 'm3u8'
3415 elif ext == 'f4m':
3416 return 'f4m'
3417
3418 return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
3419
3420
c5e3f849 3421def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3422 """ Render a list of rows, each as a list of values.
3423 Text after a \t will be right aligned """
ec11a9f4 3424 def width(string):
c5e3f849 3425 return len(remove_terminal_sequences(string).replace('\t', ''))
76d321f6 3426
3427 def get_max_lens(table):
ec11a9f4 3428 return [max(width(str(v)) for v in col) for col in zip(*table)]
76d321f6 3429
3430 def filter_using_list(row, filterArray):
3431 return [col for (take, col) in zip(filterArray, row) if take]
3432
c5e3f849 3433 if hide_empty:
76d321f6 3434 max_lens = get_max_lens(data)
3435 header_row = filter_using_list(header_row, max_lens)
3436 data = [filter_using_list(row, max_lens) for row in data]
3437
cfb56d1a 3438 table = [header_row] + data
76d321f6 3439 max_lens = get_max_lens(table)
c5e3f849 3440 extra_gap += 1
76d321f6 3441 if delim:
c5e3f849 3442 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3443 table[1][-1] = table[1][-1][:-extra_gap] # Remove extra_gap from end of delimiter
ec11a9f4 3444 for row in table:
3445 for pos, text in enumerate(map(str, row)):
c5e3f849 3446 if '\t' in text:
3447 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3448 else:
3449 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3450 ret = '\n'.join(''.join(row).rstrip() for row in table)
ec11a9f4 3451 return ret
347de493
PH
3452
3453
8f18aca8 3454def _match_one(filter_part, dct, incomplete):
77b87f05 3455 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3456 STRING_OPERATORS = {
3457 '*=': operator.contains,
3458 '^=': lambda attr, value: attr.startswith(value),
3459 '$=': lambda attr, value: attr.endswith(value),
3460 '~=': lambda attr, value: re.search(value, attr),
3461 }
347de493 3462 COMPARISON_OPERATORS = {
a047eeb6 3463 **STRING_OPERATORS,
3464 '<=': operator.le, # "<=" must be defined above "<"
347de493 3465 '<': operator.lt,
347de493 3466 '>=': operator.ge,
a047eeb6 3467 '>': operator.gt,
347de493 3468 '=': operator.eq,
347de493 3469 }
a047eeb6 3470
347de493
PH
3471 operator_rex = re.compile(r'''(?x)\s*
3472 (?P<key>[a-z_]+)
77b87f05 3473 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3474 (?:
a047eeb6 3475 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3476 (?P<strval>.+?)
347de493
PH
3477 )
3478 \s*$
3479 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3480 m = operator_rex.search(filter_part)
3481 if m:
18f96d12 3482 m = m.groupdict()
3483 unnegated_op = COMPARISON_OPERATORS[m['op']]
3484 if m['negation']:
77b87f05
MT
3485 op = lambda attr, value: not unnegated_op(attr, value)
3486 else:
3487 op = unnegated_op
18f96d12 3488 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3489 if m['quote']:
3490 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3491 actual_value = dct.get(m['key'])
3492 numeric_comparison = None
3493 if isinstance(actual_value, compat_numeric_types):
e5a088dc
S
3494 # If the original field is a string and the matching comparison value is
3495 # a number, we should respect the origin of the original field
3496 # and process comparison value as a string (see
18f96d12 3497 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3498 try:
18f96d12 3499 numeric_comparison = int(comparison_value)
347de493 3500 except ValueError:
18f96d12 3501 numeric_comparison = parse_filesize(comparison_value)
3502 if numeric_comparison is None:
3503 numeric_comparison = parse_filesize(f'{comparison_value}B')
3504 if numeric_comparison is None:
3505 numeric_comparison = parse_duration(comparison_value)
3506 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3507 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3508 if actual_value is None:
18f96d12 3509 return incomplete or m['none_inclusive']
3510 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3511
3512 UNARY_OPERATORS = {
1cc47c66
S
3513 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3514 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493
PH
3515 }
3516 operator_rex = re.compile(r'''(?x)\s*
3517 (?P<op>%s)\s*(?P<key>[a-z_]+)
3518 \s*$
3519 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3520 m = operator_rex.search(filter_part)
3521 if m:
3522 op = UNARY_OPERATORS[m.group('op')]
3523 actual_value = dct.get(m.group('key'))
8f18aca8 3524 if incomplete and actual_value is None:
3525 return True
347de493
PH
3526 return op(actual_value)
3527
3528 raise ValueError('Invalid filter part %r' % filter_part)
3529
3530
8f18aca8 3531def match_str(filter_str, dct, incomplete=False):
3532 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
3533 When incomplete, all conditions passes on missing fields
3534 """
347de493 3535 return all(
8f18aca8 3536 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
a047eeb6 3537 for filter_part in re.split(r'(?<!\\)&', filter_str))
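
# Illustrative filters (editor's sketch; the field names are just dict keys
# here): the mini-language above supports numeric comparisons, string
# operators and '&'-joined conditions.
def _match_str_example():
    info = {'duration': 250, 'title': 'Best cat video', 'like_count': None}
    assert match_str('duration > 200 & title *= cat', info)
    assert not match_str('like_count > 100', info)
    assert match_str('like_count > 100', info, incomplete=True)  # missing fields pass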
347de493
PH
3538
3539
3540def match_filter_func(filter_str):
8f18aca8 3541 def _match_func(info_dict, *args, **kwargs):
3542 if match_str(filter_str, info_dict, *args, **kwargs):
347de493
PH
3543 return None
3544 else:
3545 video_title = info_dict.get('title', info_dict.get('id', 'video'))
3546 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
3547 return _match_func
91410c9b
PH
3548
3549
bf6427d2
YCH
3550def parse_dfxp_time_expr(time_expr):
3551 if not time_expr:
d631d5f9 3552 return
bf6427d2
YCH
3553
3554 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3555 if mobj:
3556 return float(mobj.group('time_offset'))
3557
db2fe38b 3558 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 3559 if mobj:
db2fe38b 3560 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
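
# Illustrative conversions (editor's sketch): plain seconds and HH:MM:SS(.fff)
# clock values are both accepted and returned as float seconds.
def _parse_dfxp_time_expr_example():
    assert parse_dfxp_time_expr('5.2s') == 5.2
    assert parse_dfxp_time_expr('00:01:30.5') == 90.5
    assert parse_dfxp_time_expr('') is None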
bf6427d2
YCH
3561
3562
c1c924ab 3563def srt_subtitles_timecode(seconds):
aa7785f8 3564 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3565
3566
3567def ass_subtitles_timecode(seconds):
3568 time = timetuple_from_msec(seconds * 1000)
3569 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
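
# Illustrative formatting (editor's sketch, assuming timetuple_from_msec as
# defined earlier in this module): the same duration in SRT (comma,
# milliseconds) and ASS (dot, centiseconds) notation.
def _subtitles_timecode_example():
    assert srt_subtitles_timecode(3661.5) == '01:01:01,500'
    assert ass_subtitles_timecode(3661.5) == '1:01:01.50'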
bf6427d2
YCH
3570
3571
3572def dfxp2srt(dfxp_data):
3869028f
YCH
3573 '''
3574 @param dfxp_data A bytes-like object containing DFXP data
3575 @returns A unicode object containing converted SRT data
3576 '''
5b995f71 3577 LEGACY_NAMESPACES = (
3869028f
YCH
3578 (b'http://www.w3.org/ns/ttml', [
3579 b'http://www.w3.org/2004/11/ttaf1',
3580 b'http://www.w3.org/2006/04/ttaf1',
3581 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 3582 ]),
3869028f
YCH
3583 (b'http://www.w3.org/ns/ttml#styling', [
3584 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
3585 ]),
3586 )
3587
3588 SUPPORTED_STYLING = [
3589 'color',
3590 'fontFamily',
3591 'fontSize',
3592 'fontStyle',
3593 'fontWeight',
3594 'textDecoration'
3595 ]
3596
4e335771 3597 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 3598 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 3599 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 3600 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 3601 })
bf6427d2 3602
5b995f71
RA
3603 styles = {}
3604 default_style = {}
3605
87de7069 3606 class TTMLPElementParser(object):
5b995f71
RA
3607 _out = ''
3608 _unclosed_elements = []
3609 _applied_styles = []
bf6427d2 3610
2b14cb56 3611 def start(self, tag, attrib):
5b995f71
RA
3612 if tag in (_x('ttml:br'), 'br'):
3613 self._out += '\n'
3614 else:
3615 unclosed_elements = []
3616 style = {}
3617 element_style_id = attrib.get('style')
3618 if default_style:
3619 style.update(default_style)
3620 if element_style_id:
3621 style.update(styles.get(element_style_id, {}))
3622 for prop in SUPPORTED_STYLING:
3623 prop_val = attrib.get(_x('tts:' + prop))
3624 if prop_val:
3625 style[prop] = prop_val
3626 if style:
3627 font = ''
3628 for k, v in sorted(style.items()):
3629 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3630 continue
3631 if k == 'color':
3632 font += ' color="%s"' % v
3633 elif k == 'fontSize':
3634 font += ' size="%s"' % v
3635 elif k == 'fontFamily':
3636 font += ' face="%s"' % v
3637 elif k == 'fontWeight' and v == 'bold':
3638 self._out += '<b>'
3639 unclosed_elements.append('b')
3640 elif k == 'fontStyle' and v == 'italic':
3641 self._out += '<i>'
3642 unclosed_elements.append('i')
3643 elif k == 'textDecoration' and v == 'underline':
3644 self._out += '<u>'
3645 unclosed_elements.append('u')
3646 if font:
3647 self._out += '<font' + font + '>'
3648 unclosed_elements.append('font')
3649 applied_style = {}
3650 if self._applied_styles:
3651 applied_style.update(self._applied_styles[-1])
3652 applied_style.update(style)
3653 self._applied_styles.append(applied_style)
3654 self._unclosed_elements.append(unclosed_elements)
bf6427d2 3655
2b14cb56 3656 def end(self, tag):
5b995f71
RA
3657 if tag not in (_x('ttml:br'), 'br'):
3658 unclosed_elements = self._unclosed_elements.pop()
3659 for element in reversed(unclosed_elements):
3660 self._out += '</%s>' % element
3661 if unclosed_elements and self._applied_styles:
3662 self._applied_styles.pop()
bf6427d2 3663
2b14cb56 3664 def data(self, data):
5b995f71 3665 self._out += data
2b14cb56 3666
3667 def close(self):
5b995f71 3668 return self._out.strip()
2b14cb56 3669
3670 def parse_node(node):
3671 target = TTMLPElementParser()
3672 parser = xml.etree.ElementTree.XMLParser(target=target)
3673 parser.feed(xml.etree.ElementTree.tostring(node))
3674 return parser.close()
bf6427d2 3675
5b995f71
RA
3676 for k, v in LEGACY_NAMESPACES:
3677 for ns in v:
3678 dfxp_data = dfxp_data.replace(ns, k)
3679
3869028f 3680 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 3681 out = []
5b995f71 3682 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
3683
3684 if not paras:
3685 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 3686
5b995f71
RA
3687 repeat = False
3688 while True:
3689 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
3690 style_id = style.get('id') or style.get(_x('xml:id'))
3691 if not style_id:
3692 continue
5b995f71
RA
3693 parent_style_id = style.get('style')
3694 if parent_style_id:
3695 if parent_style_id not in styles:
3696 repeat = True
3697 continue
3698 styles[style_id] = styles[parent_style_id].copy()
3699 for prop in SUPPORTED_STYLING:
3700 prop_val = style.get(_x('tts:' + prop))
3701 if prop_val:
3702 styles.setdefault(style_id, {})[prop] = prop_val
3703 if repeat:
3704 repeat = False
3705 else:
3706 break
3707
3708 for p in ('body', 'div'):
3709 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3710 if ele is None:
3711 continue
3712 style = styles.get(ele.get('style'))
3713 if not style:
3714 continue
3715 default_style.update(style)
3716
bf6427d2 3717 for para, index in zip(paras, itertools.count(1)):
d631d5f9 3718 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 3719 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
3720 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3721 if begin_time is None:
3722 continue
7dff0363 3723 if not end_time:
d631d5f9
YCH
3724 if not dur:
3725 continue
3726 end_time = begin_time + dur
bf6427d2
YCH
3727 out.append('%d\n%s --> %s\n%s\n\n' % (
3728 index,
c1c924ab
YCH
3729 srt_subtitles_timecode(begin_time),
3730 srt_subtitles_timecode(end_time),
bf6427d2
YCH
3731 parse_node(para)))
3732
3733 return ''.join(out)
3734
3735
66e289ba
S
3736def cli_option(params, command_option, param):
3737 param = params.get(param)
98e698f1
RA
3738 if param:
3739 param = compat_str(param)
66e289ba
S
3740 return [command_option, param] if param is not None else []
3741
3742
3743def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3744 param = params.get(param)
5b232f46
S
3745 if param is None:
3746 return []
66e289ba
S
3747 assert isinstance(param, bool)
3748 if separator:
3749 return [command_option + separator + (true_value if param else false_value)]
3750 return [command_option, true_value if param else false_value]
3751
3752
3753def cli_valueless_option(params, command_option, param, expected_value=True):
3754 param = params.get(param)
3755 return [command_option] if param == expected_value else []
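
# Illustrative usage (editor's sketch; the option names are hypothetical):
# turning a params dict into external command-line arguments.
def _cli_option_example():
    params = {'proxy': 'socks5://127.0.0.1:1080', 'continuedl': True, 'quiet': True}
    assert cli_option(params, '--proxy', 'proxy') == ['--proxy', 'socks5://127.0.0.1:1080']
    assert cli_bool_option(params, '--continue', 'continuedl', separator='=') == ['--continue=true']
    assert cli_valueless_option(params, '--silent', 'quiet') == ['--silent']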
3756
3757
e92caff5 3758def cli_configuration_args(argdict, keys, default=[], use_compat=True):
eab9b2bc 3759 if isinstance(argdict, (list, tuple)): # for backward compatibility
e92caff5 3760 if use_compat:
5b1ecbb3 3761 return argdict
3762 else:
3763 argdict = None
eab9b2bc 3764 if argdict is None:
5b1ecbb3 3765 return default
eab9b2bc 3766 assert isinstance(argdict, dict)
3767
e92caff5 3768 assert isinstance(keys, (list, tuple))
3769 for key_list in keys:
e92caff5 3770 arg_list = list(filter(
3771 lambda x: x is not None,
6606817a 3772 [argdict.get(key.lower()) for key in variadic(key_list)]))
e92caff5 3773 if arg_list:
3774 return [arg for args in arg_list for arg in args]
3775 return default
66e289ba 3776
6251555f 3777
330690a2 3778def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3779 main_key, exe = main_key.lower(), exe.lower()
3780 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3781 keys = [f'{root_key}{k}' for k in (keys or [''])]
3782 if root_key in keys:
3783 if main_key != exe:
3784 keys.append((main_key, exe))
3785 keys.append('default')
3786 else:
3787 use_compat = False
3788 return cli_configuration_args(argdict, keys, default, use_compat)
3789
66e289ba 3790
39672624
YCH
3791class ISO639Utils(object):
3792 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3793 _lang_map = {
3794 'aa': 'aar',
3795 'ab': 'abk',
3796 'ae': 'ave',
3797 'af': 'afr',
3798 'ak': 'aka',
3799 'am': 'amh',
3800 'an': 'arg',
3801 'ar': 'ara',
3802 'as': 'asm',
3803 'av': 'ava',
3804 'ay': 'aym',
3805 'az': 'aze',
3806 'ba': 'bak',
3807 'be': 'bel',
3808 'bg': 'bul',
3809 'bh': 'bih',
3810 'bi': 'bis',
3811 'bm': 'bam',
3812 'bn': 'ben',
3813 'bo': 'bod',
3814 'br': 'bre',
3815 'bs': 'bos',
3816 'ca': 'cat',
3817 'ce': 'che',
3818 'ch': 'cha',
3819 'co': 'cos',
3820 'cr': 'cre',
3821 'cs': 'ces',
3822 'cu': 'chu',
3823 'cv': 'chv',
3824 'cy': 'cym',
3825 'da': 'dan',
3826 'de': 'deu',
3827 'dv': 'div',
3828 'dz': 'dzo',
3829 'ee': 'ewe',
3830 'el': 'ell',
3831 'en': 'eng',
3832 'eo': 'epo',
3833 'es': 'spa',
3834 'et': 'est',
3835 'eu': 'eus',
3836 'fa': 'fas',
3837 'ff': 'ful',
3838 'fi': 'fin',
3839 'fj': 'fij',
3840 'fo': 'fao',
3841 'fr': 'fra',
3842 'fy': 'fry',
3843 'ga': 'gle',
3844 'gd': 'gla',
3845 'gl': 'glg',
3846 'gn': 'grn',
3847 'gu': 'guj',
3848 'gv': 'glv',
3849 'ha': 'hau',
3850 'he': 'heb',
b7acc835 3851 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
3852 'hi': 'hin',
3853 'ho': 'hmo',
3854 'hr': 'hrv',
3855 'ht': 'hat',
3856 'hu': 'hun',
3857 'hy': 'hye',
3858 'hz': 'her',
3859 'ia': 'ina',
3860 'id': 'ind',
b7acc835 3861 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
3862 'ie': 'ile',
3863 'ig': 'ibo',
3864 'ii': 'iii',
3865 'ik': 'ipk',
3866 'io': 'ido',
3867 'is': 'isl',
3868 'it': 'ita',
3869 'iu': 'iku',
3870 'ja': 'jpn',
3871 'jv': 'jav',
3872 'ka': 'kat',
3873 'kg': 'kon',
3874 'ki': 'kik',
3875 'kj': 'kua',
3876 'kk': 'kaz',
3877 'kl': 'kal',
3878 'km': 'khm',
3879 'kn': 'kan',
3880 'ko': 'kor',
3881 'kr': 'kau',
3882 'ks': 'kas',
3883 'ku': 'kur',
3884 'kv': 'kom',
3885 'kw': 'cor',
3886 'ky': 'kir',
3887 'la': 'lat',
3888 'lb': 'ltz',
3889 'lg': 'lug',
3890 'li': 'lim',
3891 'ln': 'lin',
3892 'lo': 'lao',
3893 'lt': 'lit',
3894 'lu': 'lub',
3895 'lv': 'lav',
3896 'mg': 'mlg',
3897 'mh': 'mah',
3898 'mi': 'mri',
3899 'mk': 'mkd',
3900 'ml': 'mal',
3901 'mn': 'mon',
3902 'mr': 'mar',
3903 'ms': 'msa',
3904 'mt': 'mlt',
3905 'my': 'mya',
3906 'na': 'nau',
3907 'nb': 'nob',
3908 'nd': 'nde',
3909 'ne': 'nep',
3910 'ng': 'ndo',
3911 'nl': 'nld',
3912 'nn': 'nno',
3913 'no': 'nor',
3914 'nr': 'nbl',
3915 'nv': 'nav',
3916 'ny': 'nya',
3917 'oc': 'oci',
3918 'oj': 'oji',
3919 'om': 'orm',
3920 'or': 'ori',
3921 'os': 'oss',
3922 'pa': 'pan',
3923 'pi': 'pli',
3924 'pl': 'pol',
3925 'ps': 'pus',
3926 'pt': 'por',
3927 'qu': 'que',
3928 'rm': 'roh',
3929 'rn': 'run',
3930 'ro': 'ron',
3931 'ru': 'rus',
3932 'rw': 'kin',
3933 'sa': 'san',
3934 'sc': 'srd',
3935 'sd': 'snd',
3936 'se': 'sme',
3937 'sg': 'sag',
3938 'si': 'sin',
3939 'sk': 'slk',
3940 'sl': 'slv',
3941 'sm': 'smo',
3942 'sn': 'sna',
3943 'so': 'som',
3944 'sq': 'sqi',
3945 'sr': 'srp',
3946 'ss': 'ssw',
3947 'st': 'sot',
3948 'su': 'sun',
3949 'sv': 'swe',
3950 'sw': 'swa',
3951 'ta': 'tam',
3952 'te': 'tel',
3953 'tg': 'tgk',
3954 'th': 'tha',
3955 'ti': 'tir',
3956 'tk': 'tuk',
3957 'tl': 'tgl',
3958 'tn': 'tsn',
3959 'to': 'ton',
3960 'tr': 'tur',
3961 'ts': 'tso',
3962 'tt': 'tat',
3963 'tw': 'twi',
3964 'ty': 'tah',
3965 'ug': 'uig',
3966 'uk': 'ukr',
3967 'ur': 'urd',
3968 'uz': 'uzb',
3969 've': 'ven',
3970 'vi': 'vie',
3971 'vo': 'vol',
3972 'wa': 'wln',
3973 'wo': 'wol',
3974 'xh': 'xho',
3975 'yi': 'yid',
e9a50fba 3976 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
3977 'yo': 'yor',
3978 'za': 'zha',
3979 'zh': 'zho',
3980 'zu': 'zul',
3981 }
3982
3983 @classmethod
3984 def short2long(cls, code):
3985 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3986 return cls._lang_map.get(code[:2])
3987
3988 @classmethod
3989 def long2short(cls, code):
3990 """Convert language code from ISO 639-2/T to ISO 639-1"""
3991 for short_name, long_name in cls._lang_map.items():
3992 if long_name == code:
3993 return short_name
3994
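# Illustrative conversions (editor's sketch) between the two ISO 639 forms:
def _iso639_example():
    assert ISO639Utils.short2long('en') == 'eng'
    assert ISO639Utils.long2short('deu') == 'de'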
3995
4eb10f66
YCH
3996class ISO3166Utils(object):
3997 # From http://data.okfn.org/data/core/country-list
3998 _country_map = {
3999 'AF': 'Afghanistan',
4000 'AX': 'Åland Islands',
4001 'AL': 'Albania',
4002 'DZ': 'Algeria',
4003 'AS': 'American Samoa',
4004 'AD': 'Andorra',
4005 'AO': 'Angola',
4006 'AI': 'Anguilla',
4007 'AQ': 'Antarctica',
4008 'AG': 'Antigua and Barbuda',
4009 'AR': 'Argentina',
4010 'AM': 'Armenia',
4011 'AW': 'Aruba',
4012 'AU': 'Australia',
4013 'AT': 'Austria',
4014 'AZ': 'Azerbaijan',
4015 'BS': 'Bahamas',
4016 'BH': 'Bahrain',
4017 'BD': 'Bangladesh',
4018 'BB': 'Barbados',
4019 'BY': 'Belarus',
4020 'BE': 'Belgium',
4021 'BZ': 'Belize',
4022 'BJ': 'Benin',
4023 'BM': 'Bermuda',
4024 'BT': 'Bhutan',
4025 'BO': 'Bolivia, Plurinational State of',
4026 'BQ': 'Bonaire, Sint Eustatius and Saba',
4027 'BA': 'Bosnia and Herzegovina',
4028 'BW': 'Botswana',
4029 'BV': 'Bouvet Island',
4030 'BR': 'Brazil',
4031 'IO': 'British Indian Ocean Territory',
4032 'BN': 'Brunei Darussalam',
4033 'BG': 'Bulgaria',
4034 'BF': 'Burkina Faso',
4035 'BI': 'Burundi',
4036 'KH': 'Cambodia',
4037 'CM': 'Cameroon',
4038 'CA': 'Canada',
4039 'CV': 'Cape Verde',
4040 'KY': 'Cayman Islands',
4041 'CF': 'Central African Republic',
4042 'TD': 'Chad',
4043 'CL': 'Chile',
4044 'CN': 'China',
4045 'CX': 'Christmas Island',
4046 'CC': 'Cocos (Keeling) Islands',
4047 'CO': 'Colombia',
4048 'KM': 'Comoros',
4049 'CG': 'Congo',
4050 'CD': 'Congo, the Democratic Republic of the',
4051 'CK': 'Cook Islands',
4052 'CR': 'Costa Rica',
4053 'CI': 'Côte d\'Ivoire',
4054 'HR': 'Croatia',
4055 'CU': 'Cuba',
4056 'CW': 'Curaçao',
4057 'CY': 'Cyprus',
4058 'CZ': 'Czech Republic',
4059 'DK': 'Denmark',
4060 'DJ': 'Djibouti',
4061 'DM': 'Dominica',
4062 'DO': 'Dominican Republic',
4063 'EC': 'Ecuador',
4064 'EG': 'Egypt',
4065 'SV': 'El Salvador',
4066 'GQ': 'Equatorial Guinea',
4067 'ER': 'Eritrea',
4068 'EE': 'Estonia',
4069 'ET': 'Ethiopia',
4070 'FK': 'Falkland Islands (Malvinas)',
4071 'FO': 'Faroe Islands',
4072 'FJ': 'Fiji',
4073 'FI': 'Finland',
4074 'FR': 'France',
4075 'GF': 'French Guiana',
4076 'PF': 'French Polynesia',
4077 'TF': 'French Southern Territories',
4078 'GA': 'Gabon',
4079 'GM': 'Gambia',
4080 'GE': 'Georgia',
4081 'DE': 'Germany',
4082 'GH': 'Ghana',
4083 'GI': 'Gibraltar',
4084 'GR': 'Greece',
4085 'GL': 'Greenland',
4086 'GD': 'Grenada',
4087 'GP': 'Guadeloupe',
4088 'GU': 'Guam',
4089 'GT': 'Guatemala',
4090 'GG': 'Guernsey',
4091 'GN': 'Guinea',
4092 'GW': 'Guinea-Bissau',
4093 'GY': 'Guyana',
4094 'HT': 'Haiti',
4095 'HM': 'Heard Island and McDonald Islands',
4096 'VA': 'Holy See (Vatican City State)',
4097 'HN': 'Honduras',
4098 'HK': 'Hong Kong',
4099 'HU': 'Hungary',
4100 'IS': 'Iceland',
4101 'IN': 'India',
4102 'ID': 'Indonesia',
4103 'IR': 'Iran, Islamic Republic of',
4104 'IQ': 'Iraq',
4105 'IE': 'Ireland',
4106 'IM': 'Isle of Man',
4107 'IL': 'Israel',
4108 'IT': 'Italy',
4109 'JM': 'Jamaica',
4110 'JP': 'Japan',
4111 'JE': 'Jersey',
4112 'JO': 'Jordan',
4113 'KZ': 'Kazakhstan',
4114 'KE': 'Kenya',
4115 'KI': 'Kiribati',
4116 'KP': 'Korea, Democratic People\'s Republic of',
4117 'KR': 'Korea, Republic of',
4118 'KW': 'Kuwait',
4119 'KG': 'Kyrgyzstan',
4120 'LA': 'Lao People\'s Democratic Republic',
4121 'LV': 'Latvia',
4122 'LB': 'Lebanon',
4123 'LS': 'Lesotho',
4124 'LR': 'Liberia',
4125 'LY': 'Libya',
4126 'LI': 'Liechtenstein',
4127 'LT': 'Lithuania',
4128 'LU': 'Luxembourg',
4129 'MO': 'Macao',
4130 'MK': 'Macedonia, the Former Yugoslav Republic of',
4131 'MG': 'Madagascar',
4132 'MW': 'Malawi',
4133 'MY': 'Malaysia',
4134 'MV': 'Maldives',
4135 'ML': 'Mali',
4136 'MT': 'Malta',
4137 'MH': 'Marshall Islands',
4138 'MQ': 'Martinique',
4139 'MR': 'Mauritania',
4140 'MU': 'Mauritius',
4141 'YT': 'Mayotte',
4142 'MX': 'Mexico',
4143 'FM': 'Micronesia, Federated States of',
4144 'MD': 'Moldova, Republic of',
4145 'MC': 'Monaco',
4146 'MN': 'Mongolia',
4147 'ME': 'Montenegro',
4148 'MS': 'Montserrat',
4149 'MA': 'Morocco',
4150 'MZ': 'Mozambique',
4151 'MM': 'Myanmar',
4152 'NA': 'Namibia',
4153 'NR': 'Nauru',
4154 'NP': 'Nepal',
4155 'NL': 'Netherlands',
4156 'NC': 'New Caledonia',
4157 'NZ': 'New Zealand',
4158 'NI': 'Nicaragua',
4159 'NE': 'Niger',
4160 'NG': 'Nigeria',
4161 'NU': 'Niue',
4162 'NF': 'Norfolk Island',
4163 'MP': 'Northern Mariana Islands',
4164 'NO': 'Norway',
4165 'OM': 'Oman',
4166 'PK': 'Pakistan',
4167 'PW': 'Palau',
4168 'PS': 'Palestine, State of',
4169 'PA': 'Panama',
4170 'PG': 'Papua New Guinea',
4171 'PY': 'Paraguay',
4172 'PE': 'Peru',
4173 'PH': 'Philippines',
4174 'PN': 'Pitcairn',
4175 'PL': 'Poland',
4176 'PT': 'Portugal',
4177 'PR': 'Puerto Rico',
4178 'QA': 'Qatar',
4179 'RE': 'Réunion',
4180 'RO': 'Romania',
4181 'RU': 'Russian Federation',
4182 'RW': 'Rwanda',
4183 'BL': 'Saint Barthélemy',
4184 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4185 'KN': 'Saint Kitts and Nevis',
4186 'LC': 'Saint Lucia',
4187 'MF': 'Saint Martin (French part)',
4188 'PM': 'Saint Pierre and Miquelon',
4189 'VC': 'Saint Vincent and the Grenadines',
4190 'WS': 'Samoa',
4191 'SM': 'San Marino',
4192 'ST': 'Sao Tome and Principe',
4193 'SA': 'Saudi Arabia',
4194 'SN': 'Senegal',
4195 'RS': 'Serbia',
4196 'SC': 'Seychelles',
4197 'SL': 'Sierra Leone',
4198 'SG': 'Singapore',
4199 'SX': 'Sint Maarten (Dutch part)',
4200 'SK': 'Slovakia',
4201 'SI': 'Slovenia',
4202 'SB': 'Solomon Islands',
4203 'SO': 'Somalia',
4204 'ZA': 'South Africa',
4205 'GS': 'South Georgia and the South Sandwich Islands',
4206 'SS': 'South Sudan',
4207 'ES': 'Spain',
4208 'LK': 'Sri Lanka',
4209 'SD': 'Sudan',
4210 'SR': 'Suriname',
4211 'SJ': 'Svalbard and Jan Mayen',
4212 'SZ': 'Swaziland',
4213 'SE': 'Sweden',
4214 'CH': 'Switzerland',
4215 'SY': 'Syrian Arab Republic',
4216 'TW': 'Taiwan, Province of China',
4217 'TJ': 'Tajikistan',
4218 'TZ': 'Tanzania, United Republic of',
4219 'TH': 'Thailand',
4220 'TL': 'Timor-Leste',
4221 'TG': 'Togo',
4222 'TK': 'Tokelau',
4223 'TO': 'Tonga',
4224 'TT': 'Trinidad and Tobago',
4225 'TN': 'Tunisia',
4226 'TR': 'Turkey',
4227 'TM': 'Turkmenistan',
4228 'TC': 'Turks and Caicos Islands',
4229 'TV': 'Tuvalu',
4230 'UG': 'Uganda',
4231 'UA': 'Ukraine',
4232 'AE': 'United Arab Emirates',
4233 'GB': 'United Kingdom',
4234 'US': 'United States',
4235 'UM': 'United States Minor Outlying Islands',
4236 'UY': 'Uruguay',
4237 'UZ': 'Uzbekistan',
4238 'VU': 'Vanuatu',
4239 'VE': 'Venezuela, Bolivarian Republic of',
4240 'VN': 'Viet Nam',
4241 'VG': 'Virgin Islands, British',
4242 'VI': 'Virgin Islands, U.S.',
4243 'WF': 'Wallis and Futuna',
4244 'EH': 'Western Sahara',
4245 'YE': 'Yemen',
4246 'ZM': 'Zambia',
4247 'ZW': 'Zimbabwe',
4248 }
4249
4250 @classmethod
4251 def short2full(cls, code):
4252 """Convert an ISO 3166-2 country code to the corresponding full name"""
4253 return cls._country_map.get(code.upper())
4254
4255
773f291d
S
4256class GeoUtils(object):
4257 # Major IPv4 address blocks per country
4258 _country_ip_map = {
53896ca5 4259 'AD': '46.172.224.0/19',
773f291d
S
4260 'AE': '94.200.0.0/13',
4261 'AF': '149.54.0.0/17',
4262 'AG': '209.59.64.0/18',
4263 'AI': '204.14.248.0/21',
4264 'AL': '46.99.0.0/16',
4265 'AM': '46.70.0.0/15',
4266 'AO': '105.168.0.0/13',
53896ca5
S
4267 'AP': '182.50.184.0/21',
4268 'AQ': '23.154.160.0/24',
773f291d
S
4269 'AR': '181.0.0.0/12',
4270 'AS': '202.70.112.0/20',
53896ca5 4271 'AT': '77.116.0.0/14',
773f291d
S
4272 'AU': '1.128.0.0/11',
4273 'AW': '181.41.0.0/18',
53896ca5
S
4274 'AX': '185.217.4.0/22',
4275 'AZ': '5.197.0.0/16',
773f291d
S
4276 'BA': '31.176.128.0/17',
4277 'BB': '65.48.128.0/17',
4278 'BD': '114.130.0.0/16',
4279 'BE': '57.0.0.0/8',
53896ca5 4280 'BF': '102.178.0.0/15',
773f291d
S
4281 'BG': '95.42.0.0/15',
4282 'BH': '37.131.0.0/17',
4283 'BI': '154.117.192.0/18',
4284 'BJ': '137.255.0.0/16',
53896ca5 4285 'BL': '185.212.72.0/23',
773f291d
S
4286 'BM': '196.12.64.0/18',
4287 'BN': '156.31.0.0/16',
4288 'BO': '161.56.0.0/16',
4289 'BQ': '161.0.80.0/20',
53896ca5 4290 'BR': '191.128.0.0/12',
773f291d
S
4291 'BS': '24.51.64.0/18',
4292 'BT': '119.2.96.0/19',
4293 'BW': '168.167.0.0/16',
4294 'BY': '178.120.0.0/13',
4295 'BZ': '179.42.192.0/18',
4296 'CA': '99.224.0.0/11',
4297 'CD': '41.243.0.0/16',
53896ca5
S
4298 'CF': '197.242.176.0/21',
4299 'CG': '160.113.0.0/16',
773f291d 4300 'CH': '85.0.0.0/13',
53896ca5 4301 'CI': '102.136.0.0/14',
773f291d
S
4302 'CK': '202.65.32.0/19',
4303 'CL': '152.172.0.0/14',
53896ca5 4304 'CM': '102.244.0.0/14',
773f291d
S
4305 'CN': '36.128.0.0/10',
4306 'CO': '181.240.0.0/12',
4307 'CR': '201.192.0.0/12',
4308 'CU': '152.206.0.0/15',
4309 'CV': '165.90.96.0/19',
4310 'CW': '190.88.128.0/17',
53896ca5 4311 'CY': '31.153.0.0/16',
773f291d
S
4312 'CZ': '88.100.0.0/14',
4313 'DE': '53.0.0.0/8',
4314 'DJ': '197.241.0.0/17',
4315 'DK': '87.48.0.0/12',
4316 'DM': '192.243.48.0/20',
4317 'DO': '152.166.0.0/15',
4318 'DZ': '41.96.0.0/12',
4319 'EC': '186.68.0.0/15',
4320 'EE': '90.190.0.0/15',
4321 'EG': '156.160.0.0/11',
4322 'ER': '196.200.96.0/20',
4323 'ES': '88.0.0.0/11',
4324 'ET': '196.188.0.0/14',
4325 'EU': '2.16.0.0/13',
4326 'FI': '91.152.0.0/13',
4327 'FJ': '144.120.0.0/16',
53896ca5 4328 'FK': '80.73.208.0/21',
773f291d
S
4329 'FM': '119.252.112.0/20',
4330 'FO': '88.85.32.0/19',
4331 'FR': '90.0.0.0/9',
4332 'GA': '41.158.0.0/15',
4333 'GB': '25.0.0.0/8',
4334 'GD': '74.122.88.0/21',
4335 'GE': '31.146.0.0/16',
4336 'GF': '161.22.64.0/18',
4337 'GG': '62.68.160.0/19',
53896ca5
S
4338 'GH': '154.160.0.0/12',
4339 'GI': '95.164.0.0/16',
773f291d
S
4340 'GL': '88.83.0.0/19',
4341 'GM': '160.182.0.0/15',
4342 'GN': '197.149.192.0/18',
4343 'GP': '104.250.0.0/19',
4344 'GQ': '105.235.224.0/20',
4345 'GR': '94.64.0.0/13',
4346 'GT': '168.234.0.0/16',
4347 'GU': '168.123.0.0/16',
4348 'GW': '197.214.80.0/20',
4349 'GY': '181.41.64.0/18',
4350 'HK': '113.252.0.0/14',
4351 'HN': '181.210.0.0/16',
4352 'HR': '93.136.0.0/13',
4353 'HT': '148.102.128.0/17',
4354 'HU': '84.0.0.0/14',
4355 'ID': '39.192.0.0/10',
4356 'IE': '87.32.0.0/12',
4357 'IL': '79.176.0.0/13',
4358 'IM': '5.62.80.0/20',
4359 'IN': '117.192.0.0/10',
4360 'IO': '203.83.48.0/21',
4361 'IQ': '37.236.0.0/14',
4362 'IR': '2.176.0.0/12',
4363 'IS': '82.221.0.0/16',
4364 'IT': '79.0.0.0/10',
4365 'JE': '87.244.64.0/18',
4366 'JM': '72.27.0.0/17',
4367 'JO': '176.29.0.0/16',
53896ca5 4368 'JP': '133.0.0.0/8',
773f291d
S
4369 'KE': '105.48.0.0/12',
4370 'KG': '158.181.128.0/17',
4371 'KH': '36.37.128.0/17',
4372 'KI': '103.25.140.0/22',
4373 'KM': '197.255.224.0/20',
53896ca5 4374 'KN': '198.167.192.0/19',
773f291d
S
4375 'KP': '175.45.176.0/22',
4376 'KR': '175.192.0.0/10',
4377 'KW': '37.36.0.0/14',
4378 'KY': '64.96.0.0/15',
4379 'KZ': '2.72.0.0/13',
4380 'LA': '115.84.64.0/18',
4381 'LB': '178.135.0.0/16',
53896ca5 4382 'LC': '24.92.144.0/20',
773f291d
S
4383 'LI': '82.117.0.0/19',
4384 'LK': '112.134.0.0/15',
53896ca5 4385 'LR': '102.183.0.0/16',
773f291d
S
4386 'LS': '129.232.0.0/17',
4387 'LT': '78.56.0.0/13',
4388 'LU': '188.42.0.0/16',
4389 'LV': '46.109.0.0/16',
4390 'LY': '41.252.0.0/14',
4391 'MA': '105.128.0.0/11',
4392 'MC': '88.209.64.0/18',
4393 'MD': '37.246.0.0/16',
4394 'ME': '178.175.0.0/17',
4395 'MF': '74.112.232.0/21',
4396 'MG': '154.126.0.0/17',
4397 'MH': '117.103.88.0/21',
4398 'MK': '77.28.0.0/15',
4399 'ML': '154.118.128.0/18',
4400 'MM': '37.111.0.0/17',
4401 'MN': '49.0.128.0/17',
4402 'MO': '60.246.0.0/16',
4403 'MP': '202.88.64.0/20',
4404 'MQ': '109.203.224.0/19',
4405 'MR': '41.188.64.0/18',
4406 'MS': '208.90.112.0/22',
4407 'MT': '46.11.0.0/16',
4408 'MU': '105.16.0.0/12',
4409 'MV': '27.114.128.0/18',
53896ca5 4410 'MW': '102.70.0.0/15',
773f291d
S
4411 'MX': '187.192.0.0/11',
4412 'MY': '175.136.0.0/13',
4413 'MZ': '197.218.0.0/15',
4414 'NA': '41.182.0.0/16',
4415 'NC': '101.101.0.0/18',
4416 'NE': '197.214.0.0/18',
4417 'NF': '203.17.240.0/22',
4418 'NG': '105.112.0.0/12',
4419 'NI': '186.76.0.0/15',
4420 'NL': '145.96.0.0/11',
4421 'NO': '84.208.0.0/13',
4422 'NP': '36.252.0.0/15',
4423 'NR': '203.98.224.0/19',
4424 'NU': '49.156.48.0/22',
4425 'NZ': '49.224.0.0/14',
4426 'OM': '5.36.0.0/15',
4427 'PA': '186.72.0.0/15',
4428 'PE': '186.160.0.0/14',
4429 'PF': '123.50.64.0/18',
4430 'PG': '124.240.192.0/19',
4431 'PH': '49.144.0.0/13',
4432 'PK': '39.32.0.0/11',
4433 'PL': '83.0.0.0/11',
4434 'PM': '70.36.0.0/20',
4435 'PR': '66.50.0.0/16',
4436 'PS': '188.161.0.0/16',
4437 'PT': '85.240.0.0/13',
4438 'PW': '202.124.224.0/20',
4439 'PY': '181.120.0.0/14',
4440 'QA': '37.210.0.0/15',
53896ca5 4441 'RE': '102.35.0.0/16',
773f291d 4442 'RO': '79.112.0.0/13',
53896ca5 4443 'RS': '93.86.0.0/15',
773f291d 4444 'RU': '5.136.0.0/13',
53896ca5 4445 'RW': '41.186.0.0/16',
773f291d
S
4446 'SA': '188.48.0.0/13',
4447 'SB': '202.1.160.0/19',
4448 'SC': '154.192.0.0/11',
53896ca5 4449 'SD': '102.120.0.0/13',
773f291d 4450 'SE': '78.64.0.0/12',
53896ca5 4451 'SG': '8.128.0.0/10',
773f291d
S
4452 'SI': '188.196.0.0/14',
4453 'SK': '78.98.0.0/15',
53896ca5 4454 'SL': '102.143.0.0/17',
773f291d
S
4455 'SM': '89.186.32.0/19',
4456 'SN': '41.82.0.0/15',
53896ca5 4457 'SO': '154.115.192.0/18',
773f291d
S
4458 'SR': '186.179.128.0/17',
4459 'SS': '105.235.208.0/21',
4460 'ST': '197.159.160.0/19',
4461 'SV': '168.243.0.0/16',
4462 'SX': '190.102.0.0/20',
4463 'SY': '5.0.0.0/16',
4464 'SZ': '41.84.224.0/19',
4465 'TC': '65.255.48.0/20',
4466 'TD': '154.68.128.0/19',
4467 'TG': '196.168.0.0/14',
4468 'TH': '171.96.0.0/13',
4469 'TJ': '85.9.128.0/18',
4470 'TK': '27.96.24.0/21',
4471 'TL': '180.189.160.0/20',
4472 'TM': '95.85.96.0/19',
4473 'TN': '197.0.0.0/11',
4474 'TO': '175.176.144.0/21',
4475 'TR': '78.160.0.0/11',
4476 'TT': '186.44.0.0/15',
4477 'TV': '202.2.96.0/19',
4478 'TW': '120.96.0.0/11',
4479 'TZ': '156.156.0.0/14',
53896ca5
S
4480 'UA': '37.52.0.0/14',
4481 'UG': '102.80.0.0/13',
4482 'US': '6.0.0.0/8',
773f291d 4483 'UY': '167.56.0.0/13',
53896ca5 4484 'UZ': '84.54.64.0/18',
773f291d 4485 'VA': '212.77.0.0/19',
53896ca5 4486 'VC': '207.191.240.0/21',
773f291d 4487 'VE': '186.88.0.0/13',
53896ca5 4488 'VG': '66.81.192.0/20',
773f291d
S
4489 'VI': '146.226.0.0/16',
4490 'VN': '14.160.0.0/11',
4491 'VU': '202.80.32.0/20',
4492 'WF': '117.20.32.0/21',
4493 'WS': '202.4.32.0/19',
4494 'YE': '134.35.0.0/16',
4495 'YT': '41.242.116.0/22',
4496 'ZA': '41.0.0.0/11',
53896ca5
S
4497 'ZM': '102.144.0.0/13',
4498 'ZW': '102.177.192.0/18',
773f291d
S
4499 }
4500
4501 @classmethod
5f95927a
S
4502 def random_ipv4(cls, code_or_block):
4503 if len(code_or_block) == 2:
4504 block = cls._country_ip_map.get(code_or_block.upper())
4505 if not block:
4506 return None
4507 else:
4508 block = code_or_block
773f291d
S
4509 addr, preflen = block.split('/')
4510 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4511 addr_max = addr_min | (0xffffffff >> int(preflen))
18a0defa 4512 return compat_str(socket.inet_ntoa(
4248dad9 4513 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
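
# Illustrative usage (editor's sketch; the stdlib 'ipaddress' module is only
# used to validate the result): a spoofed source address for geo bypass can be
# drawn from a country code or from an explicit CIDR block.
def _geoutils_example():
    import ipaddress
    addr = GeoUtils.random_ipv4('DE')
    assert ipaddress.ip_address(addr) in ipaddress.ip_network('53.0.0.0/8')
    assert GeoUtils.random_ipv4('127.0.0.0/24').startswith('127.0.0.')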
773f291d
S
4514
4515
91410c9b 4516class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2461f79d
PH
4517 def __init__(self, proxies=None):
4518 # Set default handlers
4519 for type in ('http', 'https'):
4520 setattr(self, '%s_open' % type,
4521 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4522 meth(r, proxy, type))
38e87f6c 4523 compat_urllib_request.ProxyHandler.__init__(self, proxies)
2461f79d 4524
91410c9b 4525 def proxy_open(self, req, proxy, type):
2461f79d 4526 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
4527 if req_proxy is not None:
4528 proxy = req_proxy
2461f79d
PH
4529 del req.headers['Ytdl-request-proxy']
4530
4531 if proxy == '__noproxy__':
4532 return None # No Proxy
51fb4995 4533 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188 4534 req.add_header('Ytdl-socks-proxy', proxy)
7a5c1cfe 4535 # yt-dlp's http/https handlers do the actual wrapping of the socket with SOCKS
71aff188 4536 return None
91410c9b
PH
4537 return compat_urllib_request.ProxyHandler.proxy_open(
4538 self, req, proxy, type)
5bc880b9
YCH
4539
4540
0a5445dd
YCH
4541# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4542# released into Public Domain
4543# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4544
4545def long_to_bytes(n, blocksize=0):
4546 """long_to_bytes(n:long, blocksize:int) : string
4547 Convert a long integer to a byte string.
4548
4549 If optional blocksize is given and greater than zero, pad the front of the
4550 byte string with binary zeros so that the length is a multiple of
4551 blocksize.
4552 """
4553 # after much testing, this algorithm was deemed to be the fastest
4554 s = b''
4555 n = int(n)
4556 while n > 0:
4557 s = compat_struct_pack('>I', n & 0xffffffff) + s
4558 n = n >> 32
4559 # strip off leading zeros
4560 for i in range(len(s)):
4561 if s[i] != b'\000'[0]:
4562 break
4563 else:
4564 # only happens when n == 0
4565 s = b'\000'
4566 i = 0
4567 s = s[i:]
4568 # add back some pad bytes. this could be done more efficiently w.r.t. the
4569 # de-padding being done above, but sigh...
4570 if blocksize > 0 and len(s) % blocksize:
4571 s = (blocksize - len(s) % blocksize) * b'\000' + s
4572 return s
4573
4574
4575def bytes_to_long(s):
4576 """bytes_to_long(string) : long
4577 Convert a byte string to a long integer.
4578
4579 This is (essentially) the inverse of long_to_bytes().
4580 """
4581 acc = 0
4582 length = len(s)
4583 if length % 4:
4584 extra = (4 - length % 4)
4585 s = b'\000' * extra + s
4586 length = length + extra
4587 for i in range(0, length, 4):
4588 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4589 return acc
4590
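# Illustrative round-trip (editor's sketch): the two helpers above are
# inverses, with optional zero-padding to a block size.
def _long_bytes_example():
    assert long_to_bytes(65537) == b'\x01\x00\x01'
    assert long_to_bytes(65537, blocksize=4) == b'\x00\x01\x00\x01'
    assert bytes_to_long(b'\x01\x00\x01') == 65537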
4591
5bc880b9
YCH
4592def ohdave_rsa_encrypt(data, exponent, modulus):
4593 '''
4594 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4595
4596 Input:
4597 data: data to encrypt, bytes-like object
4598 exponent, modulus: parameter e and N of RSA algorithm, both integer
4599 Output: hex string of encrypted data
4600
4601 Limitation: supports one block encryption only
4602 '''
4603
4604 payload = int(binascii.hexlify(data[::-1]), 16)
4605 encrypted = pow(payload, exponent, modulus)
4606 return '%x' % encrypted
81bdc8fd
YCH
4607
4608
f48409c7
YCH
4609def pkcs1pad(data, length):
4610 """
4611 Padding input data with PKCS#1 scheme
4612
4613 @param {int[]} data input data
4614 @param {int} length target length
4615 @returns {int[]} padded data
4616 """
4617 if len(data) > length - 11:
4618 raise ValueError('Input data too long for PKCS#1 padding')
4619
4620 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4621 return [0, 2] + pseudo_random + [0] + data
4622
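# Illustrative layout (editor's sketch): the result is
# [0x00, 0x02, <random filler bytes>, 0x00, <data>] of the requested length.
def _pkcs1pad_example():
    padded = pkcs1pad([1, 2, 3], 16)
    assert len(padded) == 16
    assert padded[:2] == [0, 2] and padded[-4:] == [0, 1, 2, 3]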
4623
5eb6bdce 4624def encode_base_n(num, n, table=None):
59f898b7 4625 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
59f898b7
YCH
4626 if not table:
4627 table = FULL_TABLE[:n]
4628
5eb6bdce
YCH
4629 if n > len(table):
4630 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4631
4632 if num == 0:
4633 return table[0]
4634
81bdc8fd
YCH
4635 ret = ''
4636 while num:
4637 ret = table[num % n] + ret
4638 num = num // n
4639 return ret
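
# Illustrative encodings (editor's sketch): with the default table this is a
# generic base-N positional encoder.
def _encode_base_n_example():
    assert encode_base_n(255, 16) == 'ff'
    assert encode_base_n(255, 2) == '11111111'
    assert encode_base_n(0, 36) == '0'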
f52354a8
YCH
4640
4641
4642def decode_packed_codes(code):
06b3fe29 4643 mobj = re.search(PACKED_CODES_RE, code)
a0566bbf 4644 obfuscated_code, base, count, symbols = mobj.groups()
f52354a8
YCH
4645 base = int(base)
4646 count = int(count)
4647 symbols = symbols.split('|')
4648 symbol_table = {}
4649
4650 while count:
4651 count -= 1
5eb6bdce 4652 base_n_count = encode_base_n(count, base)
f52354a8
YCH
4653 symbol_table[base_n_count] = symbols[count] or base_n_count
4654
4655 return re.sub(
4656 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
a0566bbf 4657 obfuscated_code)
e154c651 4658
4659
1ced2221
S
4660def caesar(s, alphabet, shift):
4661 if shift == 0:
4662 return s
4663 l = len(alphabet)
4664 return ''.join(
4665 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4666 for c in s)
4667
4668
4669def rot47(s):
4670 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4671
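# Illustrative de-obfuscation (editor's sketch): rot47 shifts over the 94
# printable ASCII characters, so applying it twice restores the input.
def _rot47_example():
    assert caesar('abc', 'abcdefghijklmnopqrstuvwxyz', 1) == 'bcd'
    assert rot47(rot47('yt-dlp')) == 'yt-dlp'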
4672
e154c651 4673def parse_m3u8_attributes(attrib):
4674 info = {}
4675 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4676 if val.startswith('"'):
4677 val = val[1:-1]
4678 info[key] = val
4679 return info
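
# Illustrative parse (editor's sketch) of an EXT-X-STREAM-INF attribute list;
# quoted values may contain commas and are returned without the quotes.
def _parse_m3u8_attributes_example():
    attrs = parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.64001f,mp4a.40.2",RESOLUTION=1280x720')
    assert attrs['BANDWIDTH'] == '1280000'
    assert attrs['CODECS'] == 'avc1.64001f,mp4a.40.2'
    assert attrs['RESOLUTION'] == '1280x720'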
1143535d
YCH
4680
4681
4682def urshift(val, n):
4683 return val >> n if val >= 0 else (val + 0x100000000) >> n
d3f8e038
YCH
4684
4685
4686# Based on png2str() written by @gdkchan and improved by @yokrysty
067aa17e 4687# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
d3f8e038
YCH
4688def decode_png(png_data):
4689 # Reference: https://www.w3.org/TR/PNG/
4690 header = png_data[8:]
4691
4692 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4693 raise IOError('Not a valid PNG file.')
4694
4695 int_map = {1: '>B', 2: '>H', 4: '>I'}
4696 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4697
4698 chunks = []
4699
4700 while header:
4701 length = unpack_integer(header[:4])
4702 header = header[4:]
4703
4704 chunk_type = header[:4]
4705 header = header[4:]
4706
4707 chunk_data = header[:length]
4708 header = header[length:]
4709
4710 header = header[4:] # Skip CRC
4711
4712 chunks.append({
4713 'type': chunk_type,
4714 'length': length,
4715 'data': chunk_data
4716 })
4717
4718 ihdr = chunks[0]['data']
4719
4720 width = unpack_integer(ihdr[:4])
4721 height = unpack_integer(ihdr[4:8])
4722
4723 idat = b''
4724
4725 for chunk in chunks:
4726 if chunk['type'] == b'IDAT':
4727 idat += chunk['data']
4728
4729 if not idat:
4730 raise IOError('Unable to read PNG data.')
4731
4732 decompressed_data = bytearray(zlib.decompress(idat))
4733
4734 stride = width * 3
4735 pixels = []
4736
4737 def _get_pixel(idx):
4738 x = idx % stride
4739 y = idx // stride
4740 return pixels[y][x]
4741
4742 for y in range(height):
4743 basePos = y * (1 + stride)
4744 filter_type = decompressed_data[basePos]
4745
4746 current_row = []
4747
4748 pixels.append(current_row)
4749
4750 for x in range(stride):
4751 color = decompressed_data[1 + basePos + x]
4752 basex = y * stride + x
4753 left = 0
4754 up = 0
4755
4756 if x > 2:
4757 left = _get_pixel(basex - 3)
4758 if y > 0:
4759 up = _get_pixel(basex - stride)
4760
4761 if filter_type == 1: # Sub
4762 color = (color + left) & 0xff
4763 elif filter_type == 2: # Up
4764 color = (color + up) & 0xff
4765 elif filter_type == 3: # Average
4766 color = (color + ((left + up) >> 1)) & 0xff
4767 elif filter_type == 4: # Paeth
4768 a = left
4769 b = up
4770 c = 0
4771
4772 if x > 2 and y > 0:
4773 c = _get_pixel(basex - stride - 3)
4774
4775 p = a + b - c
4776
4777 pa = abs(p - a)
4778 pb = abs(p - b)
4779 pc = abs(p - c)
4780
4781 if pa <= pb and pa <= pc:
4782 color = (color + a) & 0xff
4783 elif pb <= pc:
4784 color = (color + b) & 0xff
4785 else:
4786 color = (color + c) & 0xff
4787
4788 current_row.append(color)
4789
4790 return width, height, pixels
efa97bdc
YCH
4791
4792
4793def write_xattr(path, key, value):
4794 # This mess below finds the best xattr tool for the job
4795 try:
4796 # try the pyxattr module...
4797 import xattr
4798
53a7e3d2
YCH
4799 if hasattr(xattr, 'set'): # pyxattr
4800 # Unicode arguments are not supported in python-pyxattr until
4801 # version 0.5.0
067aa17e 4802 # See https://github.com/ytdl-org/youtube-dl/issues/5498
53a7e3d2
YCH
4803 pyxattr_required_version = '0.5.0'
4804 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
4805 # TODO: fallback to CLI tools
4806 raise XAttrUnavailableError(
4807 'python-pyxattr is detected but is too old. '
7a5c1cfe 4808 'yt-dlp requires %s or above while your version is %s. '
53a7e3d2
YCH
4809 'Falling back to other xattr implementations' % (
4810 pyxattr_required_version, xattr.__version__))
4811
4812 setxattr = xattr.set
4813 else: # xattr
4814 setxattr = xattr.setxattr
efa97bdc
YCH
4815
4816 try:
53a7e3d2 4817 setxattr(path, key, value)
efa97bdc
YCH
4818 except EnvironmentError as e:
4819 raise XAttrMetadataError(e.errno, e.strerror)
4820
4821 except ImportError:
4822 if compat_os_name == 'nt':
4823 # Write xattrs to NTFS Alternate Data Streams:
4824 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4825 assert ':' not in key
4826 assert os.path.exists(path)
4827
4828 ads_fn = path + ':' + key
4829 try:
4830 with open(ads_fn, 'wb') as f:
4831 f.write(value)
4832 except EnvironmentError as e:
4833 raise XAttrMetadataError(e.errno, e.strerror)
4834 else:
4835 user_has_setfattr = check_executable('setfattr', ['--version'])
4836 user_has_xattr = check_executable('xattr', ['-h'])
4837
4838 if user_has_setfattr or user_has_xattr:
4839
4840 value = value.decode('utf-8')
4841 if user_has_setfattr:
4842 executable = 'setfattr'
4843 opts = ['-n', key, '-v', value]
4844 elif user_has_xattr:
4845 executable = 'xattr'
4846 opts = ['-w', key, value]
4847
3089bc74
S
4848 cmd = ([encodeFilename(executable, True)]
4849 + [encodeArgument(o) for o in opts]
4850 + [encodeFilename(path, True)])
efa97bdc
YCH
4851
4852 try:
d3c93ec2 4853 p = Popen(
efa97bdc
YCH
4854 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4855 except EnvironmentError as e:
4856 raise XAttrMetadataError(e.errno, e.strerror)
d3c93ec2 4857 stdout, stderr = p.communicate_or_kill()
efa97bdc
YCH
4858 stderr = stderr.decode('utf-8', 'replace')
4859 if p.returncode != 0:
4860 raise XAttrMetadataError(p.returncode, stderr)
4861
4862 else:
4863 # On Unix, but we can't find pyxattr, setfattr, or xattr.
4864 if sys.platform.startswith('linux'):
4865 raise XAttrUnavailableError(
4866 "Couldn't find a tool to set the xattrs. "
4867 "Install either the python 'pyxattr' or 'xattr' "
4868 "modules, or the GNU 'attr' package "
4869 "(which contains the 'setfattr' tool).")
4870 else:
4871 raise XAttrUnavailableError(
4872 "Couldn't find a tool to set the xattrs. "
4873 "Install either the python 'xattr' module, "
4874 "or the 'xattr' binary.")
0c265486
YCH
4875
4876
4877def random_birthday(year_field, month_field, day_field):
aa374bc7
AS
4878 start_date = datetime.date(1950, 1, 1)
4879 end_date = datetime.date(1995, 12, 31)
4880 offset = random.randint(0, (end_date - start_date).days)
4881 random_date = start_date + datetime.timedelta(offset)
0c265486 4882 return {
aa374bc7
AS
4883 year_field: str(random_date.year),
4884 month_field: str(random_date.month),
4885 day_field: str(random_date.day),
0c265486 4886 }
732044af 4887
c76eb41b 4888
732044af 4889# Templates for internet shortcut files, which are plain text files.
4890DOT_URL_LINK_TEMPLATE = '''
4891[InternetShortcut]
4892URL=%(url)s
4893'''.lstrip()
4894
4895DOT_WEBLOC_LINK_TEMPLATE = '''
4896<?xml version="1.0" encoding="UTF-8"?>
4897<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4898<plist version="1.0">
4899<dict>
4900\t<key>URL</key>
4901\t<string>%(url)s</string>
4902</dict>
4903</plist>
4904'''.lstrip()
4905
4906DOT_DESKTOP_LINK_TEMPLATE = '''
4907[Desktop Entry]
4908Encoding=UTF-8
4909Name=%(filename)s
4910Type=Link
4911URL=%(url)s
4912Icon=text-html
4913'''.lstrip()
4914
08438d2c 4915LINK_TEMPLATES = {
4916 'url': DOT_URL_LINK_TEMPLATE,
4917 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4918 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4919}
4920
732044af 4921
4922def iri_to_uri(iri):
4923 """
4924 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4925
4926 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4927 """
4928
4929 iri_parts = compat_urllib_parse_urlparse(iri)
4930
4931 if '[' in iri_parts.netloc:
4932 raise ValueError('IPv6 URIs are not, yet, supported.')
4933 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4934
4935 # The `safe` argument values used by the following code contain the characters that should not be percent-encoded. Everything but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4936
4937 net_location = ''
4938 if iri_parts.username:
4939 net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
4940 if iri_parts.password is not None:
4941 net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
4942 net_location += '@'
4943
4944 net_location += iri_parts.hostname.encode('idna').decode('utf-8') # Punycode for Unicode hostnames.
4945 # The 'idna' encoding produces ASCII text.
4946 if iri_parts.port is not None and iri_parts.port != 80:
4947 net_location += ':' + str(iri_parts.port)
4948
4949 return compat_urllib_parse_urlunparse(
4950 (iri_parts.scheme,
4951 net_location,
4952
4953 compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
4954
4955 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
4956 compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
4957
4958 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
4959 compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
4960
4961 compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4962
4963 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4964
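# Illustrative conversion (editor's sketch; the hostname is hypothetical):
# non-ASCII characters are percent-encoded as UTF-8 while the rest of the IRI
# is kept intact.
def _iri_to_uri_example():
    assert iri_to_uri('http://example.com/föö?bär=1') == 'http://example.com/f%C3%B6%C3%B6?b%C3%A4r=1'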
4965
4966def to_high_limit_path(path):
4967 if sys.platform in ['win32', 'cygwin']:
4968 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
4969 return r'\\?\ '.rstrip() + os.path.abspath(path)
4970
4971 return path
76d321f6 4972
c76eb41b 4973
b868936c 4974def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
4975 if field is None:
4976 val = obj if obj is not None else default
4977 else:
4978 val = obj.get(field, default)
76d321f6 4979 if func and val not in ignore:
4980 val = func(val)
4981 return template % val if val not in ignore else default
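
# Illustrative usage (editor's sketch; the field names are made up): formatting
# an optional field with a fallback when it is missing or empty.
def _format_field_example():
    info = {'width': 1920, 'height': None}
    assert format_field(info, 'width', '%dpx') == '1920px'
    assert format_field(info, 'height', '%dpx', default='unknown') == 'unknown'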
00dd0cd5 4982
4983
4984def clean_podcast_url(url):
4985 return re.sub(r'''(?x)
4986 (?:
4987 (?:
4988 chtbl\.com/track|
4989 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
4990 play\.podtrac\.com
4991 )/[^/]+|
4992 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4993 flex\.acast\.com|
4994 pd(?:
4995 cn\.co| # https://podcorn.com/analytics-prefix/
4996 st\.fm # https://podsights.com/docs/
4997 )/e
4998 )/''', '', url)
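
# Illustrative cleanup (editor's sketch; the media host is hypothetical): known
# analytics prefixes are stripped, leaving the direct enclosure URL.
def _clean_podcast_url_example():
    assert clean_podcast_url('https://chtbl.com/track/ABC123/media.example.com/episode.mp3') \
        == 'https://media.example.com/episode.mp3'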
ffcb8191
THD
4999
5000
5001_HEX_TABLE = '0123456789abcdef'
5002
5003
5004def random_uuidv4():
5005 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 5006
5007
5008def make_dir(path, to_screen=None):
5009 try:
5010 dn = os.path.dirname(path)
5011 if dn and not os.path.exists(dn):
5012 os.makedirs(dn)
5013 return True
5014 except (OSError, IOError) as err:
5015 if callable(to_screen):
5016 to_screen('unable to create directory ' + error_to_compat_str(err))
5017 return False
f74980cb 5018
5019
5020def get_executable_path():
c552ae88 5021 from zipimport import zipimporter
5022 if hasattr(sys, 'frozen'): # Running from PyInstaller
5023 path = os.path.dirname(sys.executable)
5024 elif isinstance(globals().get('__loader__'), zipimporter): # Running from ZIP
5025 path = os.path.join(os.path.dirname(__file__), '../..')
5026 else:
5027 path = os.path.join(os.path.dirname(__file__), '..')
f74980cb 5028 return os.path.abspath(path)
5029
5030
2f567473 5031def load_plugins(name, suffix, namespace):
3ae5e797 5032 classes = {}
f74980cb 5033 try:
019a94f7
ÁS
5034 plugins_spec = importlib.util.spec_from_file_location(
5035 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5036 plugins = importlib.util.module_from_spec(plugins_spec)
5037 sys.modules[plugins_spec.name] = plugins
5038 plugins_spec.loader.exec_module(plugins)
f74980cb 5039 for name in dir(plugins):
2f567473 5040 if name in namespace:
5041 continue
5042 if not name.endswith(suffix):
f74980cb 5043 continue
5044 klass = getattr(plugins, name)
3ae5e797 5045 classes[name] = namespace[name] = klass
019a94f7 5046 except FileNotFoundError:
f74980cb 5047 pass
f74980cb 5048 return classes
06167fbb 5049
5050
325ebc17 5051def traverse_obj(
352d63fd 5052 obj, *path_list, default=None, expected_type=None, get_all=True,
325ebc17 5053 casesense=True, is_user_input=False, traverse_string=False):
324ad820 5054 ''' Traverse nested list/dict/tuple
8f334380 5055 @param path_list A list of paths which are checked one by one.
5056 Each path is a list of keys where each key is a string,
1797b073 5057 a function, a tuple of strings/None or "...".
2614f646 5058 When a function is given, it takes the key as argument and
5059 returns whether the key matches or not. When a tuple is given,
8f334380 5060 all the keys given in the tuple are traversed, and
5061 "..." traverses all the keys in the object
1797b073 5062 "None" returns the object without traversal
325ebc17 5063 @param default Default value to return
352d63fd 5064 @param expected_type Only accept final value of this type (Can also be any callable)
5065 @param get_all Return all the values obtained from a path or only the first one
324ad820 5066 @param casesense Whether to consider dictionary keys as case sensitive
5067 @param is_user_input Whether the keys are generated from user input. If True,
5068 strings are converted to int/slice if necessary
5069 @param traverse_string Whether to traverse inside strings. If True, any
5070 non-compatible object will also be converted into a string
8f334380 5071 # TODO: Write tests
324ad820 5072 '''
325ebc17 5073 if not casesense:
dbf5416a 5074 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
8f334380 5075 path_list = (map(_lower, variadic(path)) for path in path_list)
5076
5077 def _traverse_obj(obj, path, _current_depth=0):
5078 nonlocal depth
5079 path = tuple(variadic(path))
5080 for i, key in enumerate(path):
1797b073 5081 if None in (key, obj):
5082 return obj
8f334380 5083 if isinstance(key, (list, tuple)):
5084 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5085 key = ...
5086 if key is ...:
5087 obj = (obj.values() if isinstance(obj, dict)
5088 else obj if isinstance(obj, (list, tuple, LazyList))
5089 else str(obj) if traverse_string else [])
5090 _current_depth += 1
5091 depth = max(depth, _current_depth)
5092 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
2614f646 5093 elif callable(key):
5094 if isinstance(obj, (list, tuple, LazyList)):
5095 obj = enumerate(obj)
5096 elif isinstance(obj, dict):
5097 obj = obj.items()
5098 else:
5099 if not traverse_string:
5100 return None
5101 obj = str(obj)
5102 _current_depth += 1
5103 depth = max(depth, _current_depth)
5104 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
575e17a1 5105 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
325ebc17 5106 obj = (obj.get(key) if casesense or (key in obj)
5107 else next((v for k, v in obj.items() if _lower(k) == key), None))
5108 else:
5109 if is_user_input:
5110 key = (int_or_none(key) if ':' not in key
5111 else slice(*map(int_or_none, key.split(':'))))
8f334380 5112 if key == slice(None):
575e17a1 5113 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
325ebc17 5114 if not isinstance(key, (int, slice)):
9fea350f 5115 return None
8f334380 5116 if not isinstance(obj, (list, tuple, LazyList)):
325ebc17 5117 if not traverse_string:
5118 return None
5119 obj = str(obj)
5120 try:
5121 obj = obj[key]
5122 except IndexError:
324ad820 5123 return None
325ebc17 5124 return obj
5125
352d63fd 5126 if isinstance(expected_type, type):
5127 type_test = lambda val: val if isinstance(val, expected_type) else None
5128 elif expected_type is not None:
5129 type_test = expected_type
5130 else:
5131 type_test = lambda val: val
5132
8f334380 5133 for path in path_list:
5134 depth = 0
5135 val = _traverse_obj(obj, path)
325ebc17 5136 if val is not None:
8f334380 5137 if depth:
5138 for _ in range(depth - 1):
6586bca9 5139 val = itertools.chain.from_iterable(v for v in val if v is not None)
352d63fd 5140 val = [v for v in map(type_test, val) if v is not None]
8f334380 5141 if val:
352d63fd 5142 return val if get_all else val[0]
5143 else:
5144 val = type_test(val)
5145 if val is not None:
8f334380 5146 return val
325ebc17 5147 return default
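# Illustrative usage (added example, not part of the original source; results assume
# the implementation above):
#   d = {'a': [{'b': 1}, {'b': 2}], 'c': {'d': 'x'}}
#   traverse_obj(d, ('a', 0, 'b'))                      # -> 1
#   traverse_obj(d, ('a', ..., 'b'))                    # -> [1, 2]
#   traverse_obj(d, ('c', 'missing'), ('c', 'd'))       # -> 'x'  (first path that yields a value)
#   traverse_obj(d, ('a', 0, 'b'), expected_type=str)   # -> None (1 is not a str)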
324ad820 5148
5149
5150def traverse_dict(dictn, keys, casesense=True):
ee8dd27a 5151 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5152 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5153 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
6606817a 5154
5155
4b4b7f74 5156def variadic(x, allowed_types=(str, bytes, dict)):
cb89cfc1 5157 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
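# Illustrative behaviour (added): variadic wraps scalars (and the "allowed" iterable
# types str/bytes/dict) into a 1-tuple while passing other iterables through, e.g.:
#   variadic('spam')      # -> ('spam',)
#   variadic(['a', 'b'])  # -> ['a', 'b']
#   variadic({'a': 1})    # -> ({'a': 1},)   (a dict counts as a single value)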
bd50a52b 5158
5159
49fa4d9a 5160# create a JSON Web Signature (JWS) with the HS256 algorithm
5161# the resulting format is in JWS Compact Serialization
5162# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5163# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5164def jwt_encode_hs256(payload_data, key, headers={}):
5165 header_data = {
5166 'alg': 'HS256',
5167 'typ': 'JWT',
5168 }
5169 if headers:
5170 header_data.update(headers)
5171 header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
5172 payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
5173 h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
5174 signature_b64 = base64.b64encode(h.digest())
5175 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5176 return token
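# Illustrative usage (added; key and payload are hypothetical): the token is returned as
# bytes of the form b'<header>.<payload>.<signature>'. Note that base64.b64encode (padded,
# non-url-safe) is used rather than the unpadded base64url of RFC 7515, so very strict JWT
# consumers may reject tokens containing '=', '+' or '/', e.g.:
#   token = jwt_encode_hs256({'user': 'example'}, 'secret-key')
#   # token.count(b'.') == 2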
819e0531 5177
5178
16b0d7e6 5179# Can be extended in the future to verify the signature, parse the header and return the algorithm used if it's not HS256
5180def jwt_decode_hs256(jwt):
5181 header_b64, payload_b64, signature_b64 = jwt.split('.')
5182 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5183 return payload_data
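# Illustrative round-trip (added; values are hypothetical): only the payload segment is
# base64-decoded and parsed -- the signature is NOT verified here, e.g.:
#   token = jwt_encode_hs256({'user': 'example'}, 'secret-key')
#   jwt_decode_hs256(token.decode('utf-8'))  # -> {'user': 'example'}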
5184
5185
819e0531 5186def supports_terminal_sequences(stream):
5187 if compat_os_name == 'nt':
e3c7d495 5188 from .compat import WINDOWS_VT_MODE # Must be imported locally
5189 if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
819e0531 5190 return False
5191 elif not os.getenv('TERM'):
5192 return False
5193 try:
5194 return stream.isatty()
5195 except BaseException:
5196 return False
5197
5198
ec11a9f4 5199_terminal_sequences_re = re.compile('\033\\[[^m]+m')
5200
5201
5202def remove_terminal_sequences(string):
5203 return _terminal_sequences_re.sub('', string)
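# Illustrative example (added): strips ANSI SGR colour/formatting sequences, e.g.:
#   remove_terminal_sequences('\033[0;31merror\033[0m done')  # -> 'error done'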
5204
5205
5206def number_of_digits(number):
5207 return len('%d' % number)
34921b43 5208
5209
5210def join_nonempty(*values, delim='-', from_dict=None):
5211 if from_dict is not None:
c586f9e8 5212 values = map(from_dict.get, values)
34921b43 5213 return delim.join(map(str, filter(None, values)))
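# Illustrative usage (added): falsy values (None, '', 0) are dropped before joining, e.g.:
#   join_nonempty('mp4', None, '', 'dash')                            # -> 'mp4-dash'
#   join_nonempty('en', 480, delim='.')                               # -> 'en.480'
#   join_nonempty('title', 'id', from_dict={'title': 'T', 'id': 7})   # -> 'T-7'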
06e57990 5214
5215
5216class Config:
5217 own_args = None
5218 filename = None
5219 __initialized = False
5220
5221 def __init__(self, parser, label=None):
5222 self._parser, self.label = parser, label
5223 self._loaded_paths, self.configs = set(), []
5224
5225 def init(self, args=None, filename=None):
5226 assert not self.__initialized
5227 if filename:
5228 location = os.path.realpath(filename)
5229 if location in self._loaded_paths:
5230 return False
5231 self._loaded_paths.add(location)
5232
5233 self.__initialized = True
5234 self.own_args, self.filename = args, filename
5235 for location in self._parser.parse_args(args)[0].config_locations or []:
5236 location = compat_expanduser(location)
5237 if os.path.isdir(location):
5238 location = os.path.join(location, 'yt-dlp.conf')
5239 if not os.path.exists(location):
5240 self._parser.error(f'config location {location} does not exist')
5241 self.append_config(self.read_file(location), location)
5242 return True
5243
5244 def __str__(self):
5245 label = join_nonempty(
5246 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5247 delim=' ')
5248 return join_nonempty(
5249 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5250 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5251 delim='\n')
5252
5253 @staticmethod
5254 def read_file(filename, default=[]):
5255 try:
5256 optionf = open(filename)
5257 except IOError:
5258 return default # silently skip if file is not present
5259 try:
5260 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5261 contents = optionf.read()
5262 if sys.version_info < (3,):
5263 contents = contents.decode(preferredencoding())
5264 res = compat_shlex_split(contents, comments=True)
5265 finally:
5266 optionf.close()
5267 return res
5268
5269 @staticmethod
5270 def hide_login_info(opts):
5271 PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
5272 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5273
5274 def _scrub_eq(o):
5275 m = eqre.match(o)
5276 if m:
5277 return m.group('key') + '=PRIVATE'
5278 else:
5279 return o
5280
5281 opts = list(map(_scrub_eq, opts))
5282 for idx, opt in enumerate(opts):
5283 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5284 opts[idx + 1] = 'PRIVATE'
5285 return opts
5286
5287 def append_config(self, *args, label=None):
5288 config = type(self)(self._parser, label)
5289 config._loaded_paths = self._loaded_paths
5290 if config.init(*args):
5291 self.configs.append(config)
5292
5293 @property
5294 def all_args(self):
5295 for config in reversed(self.configs):
5296 yield from config.all_args
5297 yield from self.own_args or []
5298
5299 def parse_args(self):
5300 return self._parser.parse_args(list(self.all_args))
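# Illustrative sketch (added; the option values are hypothetical): Config wraps an
# optparse-style parser, recursively loads any --config-locations files via append_config,
# and scrubs credentials when the loaded configuration is printed, e.g.:
#   Config.hide_login_info(['-u', 'me@example.com', '--password=hunter2', '-f', 'best'])
#   # -> ['-u', 'PRIVATE', '--password=PRIVATE', '-f', 'best']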