#!/usr/bin/env python3
# coding: utf-8

from __future__ import unicode_literals

import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.utils
import email.header
import errno
import functools
import gzip
import hashlib
import hmac
import importlib.util
import io
import itertools
import json
import locale
import math
import operator
import os
import platform
import random
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import traceback
import xml.etree.ElementTree
import zlib
import mimetypes

from .compat import (
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_HTTPError,
    compat_basestring,
    compat_chr,
    compat_cookiejar,
    compat_ctypes_WINFUNCTYPE,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_integer_types,
    compat_numeric_types,
    compat_kwargs,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_split,
    compat_shlex_quote,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_urlunparse,
    compat_urllib_parse_quote,
    compat_urllib_parse_quote_plus,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'same-origin',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        try:
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        except OSError:
            pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
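# Illustrative note (added comment, not in the original source): xpath_with_ns()
# expands 'prefix:tag' steps into ElementTree's '{uri}tag' form, e.g.
#   xpath_with_ns('media:song/media:url', {'media': 'http://example.com/ns'})
#   -> '{http://example.com/ns}song/{http://example.com/ns}url'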


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_html_by_id(id, html):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = r'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
        ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
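# Illustrative example (added comment, not in the original source):
#   get_element_text_and_html_by_tag('b', '<p><b>bold <i>text</i></b></p>')
#   -> ('bold <i>text</i>', '<b>bold <i>text</i></b>')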


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
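# Illustrative example (added comment): timeconvert parses RFC 2822 dates, e.g.
#   timeconvert('Sun, 06 Nov 1994 08:49:37 GMT') -> 784111777
# and returns None for unparseable input.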


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return ' '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    if s == '':
        return ''
    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
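# Illustrative examples (added comment; default arguments assumed otherwise):
#   sanitize_filename('Foo: Bar?')                  -> 'Foo - Bar'
#   sanitize_filename('Foo: Bar?', restricted=True) -> 'Foo_-_Bar'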


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
        if sys.version_info < (2, 7) and not drive_or_unc:
            drive_or_unc, _ = os.path.splitunc(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
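# Illustrative examples (added comment):
#   sanitize_url('//example.com/video')      -> 'http://example.com/video'
#   sanitize_url('httpss://example.com/foo') -> 'https://example.com/foo'
#   sanitize_url('rmtpe://host/stream')      -> 'rtmpe://host/stream'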


def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
    return url, 'Basic ' + auth_payload.decode('utf-8')
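# Illustrative example (added comment):
#   extract_basic_auth('http://user:pass@example.com/path')
#   -> ('http://example.com/path', 'Basic dXNlcjpwYXNz')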


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
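# Illustrative example (added comment): orderedSet([3, 1, 3, 2, 1]) -> [3, 1, 2]
# (keeps the first occurrence of each element, preserving order)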


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
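# Illustrative examples (added comment):
#   unescapeHTML('&amp;')    -> '&'
#   unescapeHTML('&#x27;')   -> "'"
#   unescapeHTML('&eacute;') -> 'é'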


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        return process_communicate_or_kill(self, *args, **kwargs)
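# Illustrative usage (added comment): Popen is a drop-in subprocess.Popen that
# avoids popping up a console window on Windows and adds communicate_or_kill, e.g.
#   stdout, stderr = Popen(['ffprobe', '-version'], stdout=subprocess.PIPE,
#                          stderr=subprocess.PIPE).communicate_or_kill()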


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
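# Illustrative examples (added comment):
#   formatSeconds(3661)            -> '1:01:01'
#   formatSeconds(90)              -> '1:30'
#   formatSeconds(90.5, msec=True) -> '1:30.500'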


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        try:
            ssl_context.load_verify_locations(cadata=cert)
        except ssl.SSLError:
            pass


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                # Create a new context to discard any certificates that were already loaded
                context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
                context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    if ytdl_is_updateable():
        update_cmd = 'type yt-dlp -U to update'
    else:
        update_cmd = 'see https://github.com/yt-dlp/yt-dlp on how to update'
    msg = 'please report this issue on https://github.com/yt-dlp/yt-dlp .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call yt-dlp with the --verbose flag and include its complete output.'

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super(ExtractorError, self).__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            self.msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super(GeoRestrictedError, self).__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
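# Illustrative note (added comment): the socks_proxy URL is expected in the form
#   scheme://[username:password@]host[:port]
# e.g. 'socks5://127.0.0.1:9050'; the scheme selects SOCKS4/4A/5 above and the
# port defaults to 1080 when omitted.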
1482
1483
be4a824d
PH
1484class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1485 def __init__(self, params, https_conn_class=None, *args, **kwargs):
1486 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1487 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1488 self._params = params
1489
1490 def https_open(self, req):
4f264c02 1491 kwargs = {}
71aff188
YCH
1492 conn_class = self._https_conn_class
1493
4f264c02
JMF
1494 if hasattr(self, '_context'): # python > 2.6
1495 kwargs['context'] = self._context
1496 if hasattr(self, '_check_hostname'): # python 3.x
1497 kwargs['check_hostname'] = self._check_hostname
71aff188
YCH
1498
1499 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1500 if socks_proxy:
1501 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1502 del req.headers['Ytdl-socks-proxy']
1503
be4a824d 1504 return self.do_open(functools.partial(
71aff188 1505 _create_http_connection, self, conn_class, True),
4f264c02 1506 req, **kwargs)
be4a824d
PH
1507
1508
1bab3437 1509class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
f1a8511f
S
1510 """
1511 See [1] for cookie file format.
1512
1513 1. https://curl.haxx.se/docs/http-cookies.html
1514 """
e7e62441 1515 _HTTPONLY_PREFIX = '#HttpOnly_'
c380cc28
S
1516 _ENTRY_LEN = 7
1517 _HEADER = '''# Netscape HTTP Cookie File
7a5c1cfe 1518# This file is generated by yt-dlp. Do not edit.
c380cc28
S
1519
1520'''
1521 _CookieFileEntry = collections.namedtuple(
1522 'CookieFileEntry',
1523 ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))
e7e62441 1524
1bab3437 1525 def save(self, filename=None, ignore_discard=False, ignore_expires=False):
c380cc28
S
1526 """
1527 Save cookies to a file.
1528
1529 Most of the code is taken from CPython 3.8 and slightly adapted
1530 to support cookie files with UTF-8 in both python 2 and 3.
1531 """
1532 if filename is None:
1533 if self.filename is not None:
1534 filename = self.filename
1535 else:
1536 raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
1537
1bab3437
S
1538 # Store session cookies with `expires` set to 0 instead of an empty
1539 # string
1540 for cookie in self:
1541 if cookie.expires is None:
1542 cookie.expires = 0
c380cc28
S
1543
1544 with io.open(filename, 'w', encoding='utf-8') as f:
1545 f.write(self._HEADER)
1546 now = time.time()
1547 for cookie in self:
1548 if not ignore_discard and cookie.discard:
1549 continue
1550 if not ignore_expires and cookie.is_expired(now):
1551 continue
1552 if cookie.secure:
1553 secure = 'TRUE'
1554 else:
1555 secure = 'FALSE'
1556 if cookie.domain.startswith('.'):
1557 initial_dot = 'TRUE'
1558 else:
1559 initial_dot = 'FALSE'
1560 if cookie.expires is not None:
1561 expires = compat_str(cookie.expires)
1562 else:
1563 expires = ''
1564 if cookie.value is None:
1565 # cookies.txt regards 'Set-Cookie: foo' as a cookie
1566 # with no name, whereas http.cookiejar regards it as a
1567 # cookie with no value.
1568 name = ''
1569 value = cookie.name
1570 else:
1571 name = cookie.name
1572 value = cookie.value
1573 f.write(
1574 '\t'.join([cookie.domain, initial_dot, cookie.path,
1575 secure, expires, name, value]) + '\n')
1bab3437
S
1576
1577 def load(self, filename=None, ignore_discard=False, ignore_expires=False):
e7e62441 1578 """Load cookies from a file."""
1579 if filename is None:
1580 if self.filename is not None:
1581 filename = self.filename
1582 else:
1583 raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)
1584
c380cc28
S
1585 def prepare_line(line):
1586 if line.startswith(self._HTTPONLY_PREFIX):
1587 line = line[len(self._HTTPONLY_PREFIX):]
1588 # comments and empty lines are fine
1589 if line.startswith('#') or not line.strip():
1590 return line
1591 cookie_list = line.split('\t')
1592 if len(cookie_list) != self._ENTRY_LEN:
1593 raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
1594 cookie = self._CookieFileEntry(*cookie_list)
1595 if cookie.expires_at and not cookie.expires_at.isdigit():
1596 raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
1597 return line
1598
e7e62441 1599 cf = io.StringIO()
c380cc28 1600 with io.open(filename, encoding='utf-8') as f:
e7e62441 1601 for line in f:
c380cc28
S
1602 try:
1603 cf.write(prepare_line(line))
1604 except compat_cookiejar.LoadError as e:
1605 write_string(
1606 'WARNING: skipping cookie file entry due to %s: %r\n'
1607 % (e, line), sys.stderr)
1608 continue
e7e62441 1609 cf.seek(0)
1610 self._really_load(cf, filename, ignore_discard, ignore_expires)
1bab3437
S
 1611 # Session cookies are denoted by the `expires` field being set to
 1612 # either an empty string or 0. MozillaCookieJar only recognizes the former
 1613 # (see [1]), so we need to force the latter to be recognized as session
 1614 # cookies on our own.
 1615 # Session cookies may be important for cookie-based authentication:
 1616 # e.g. when a user does not check the 'Remember me' check box while
 1617 # logging in on a site, some important cookies are stored as session
 1618 # cookies, so failing to recognize them results in a failed login.
1619 # 1. https://bugs.python.org/issue17164
1620 for cookie in self:
1621 # Treat `expires=0` cookies as session cookies
1622 if cookie.expires == 0:
1623 cookie.expires = None
1624 cookie.discard = True
1625
1626
a6420bf5
S
1627class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1628 def __init__(self, cookiejar=None):
1629 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1630
1631 def http_response(self, request, response):
 1632 # Python 2 will choke on the next HTTP request if there are non-ASCII
 1633 # characters in the Set-Cookie HTTP header of the last response (see
067aa17e 1634 # https://github.com/ytdl-org/youtube-dl/issues/6769).
a6420bf5
S
 1635 # In order to at least prevent crashing, we percent-encode the Set-Cookie
 1636 # header before HTTPCookieProcessor starts processing it.
e28034c5
S
1637 # if sys.version_info < (3, 0) and response.headers:
1638 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1639 # set_cookie = response.headers.get(set_cookie_header)
1640 # if set_cookie:
1641 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1642 # if set_cookie != set_cookie_escaped:
1643 # del response.headers[set_cookie_header]
1644 # response.headers[set_cookie_header] = set_cookie_escaped
a6420bf5
S
1645 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1646
f5fa042c 1647 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
a6420bf5
S
1648 https_response = http_response
1649
1650
fca6dba8 1651class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
201c1459 1652 """YoutubeDL redirect handler
1653
1654 The code is based on HTTPRedirectHandler implementation from CPython [1].
1655
1656 This redirect handler solves two issues:
1657 - ensures redirect URL is always unicode under python 2
1658 - introduces support for experimental HTTP response status code
1659 308 Permanent Redirect [2] used by some sites [3]
1660
1661 1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
1662 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
1663 3. https://github.com/ytdl-org/youtube-dl/issues/28768
1664 """
1665
1666 http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302
1667
1668 def redirect_request(self, req, fp, code, msg, headers, newurl):
1669 """Return a Request or None in response to a redirect.
1670
1671 This is called by the http_error_30x methods when a
1672 redirection response is received. If a redirection should
1673 take place, return a new Request to allow http_error_30x to
1674 perform the redirect. Otherwise, raise HTTPError if no-one
1675 else should try to handle this url. Return None if you can't
1676 but another Handler might.
1677 """
1678 m = req.get_method()
1679 if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
1680 or code in (301, 302, 303) and m == "POST")):
1681 raise compat_HTTPError(req.full_url, code, msg, headers, fp)
1682 # Strictly (according to RFC 2616), 301 or 302 in response to
1683 # a POST MUST NOT cause a redirection without confirmation
1684 # from the user (of urllib.request, in this case). In practice,
1685 # essentially all clients do redirect in this case, so we do
1686 # the same.
1687
 1688 # On python 2 urlh.geturl() may sometimes return the redirect URL
 1689 # as a byte string instead of unicode. This workaround forces
 1690 # it to always return unicode.
1691 if sys.version_info[0] < 3:
1692 newurl = compat_str(newurl)
1693
1694 # Be conciliant with URIs containing a space. This is mainly
1695 # redundant with the more complete encoding done in http_error_302(),
1696 # but it is kept for compatibility with other callers.
1697 newurl = newurl.replace(' ', '%20')
1698
1699 CONTENT_HEADERS = ("content-length", "content-type")
1700 # NB: don't use dict comprehension for python 2.6 compatibility
1701 newheaders = dict((k, v) for k, v in req.headers.items()
1702 if k.lower() not in CONTENT_HEADERS)
1703 return compat_urllib_request.Request(
1704 newurl, headers=newheaders, origin_req_host=req.origin_req_host,
1705 unverifiable=True)
fca6dba8
S
1706
1707
46f59e89
S
1708def extract_timezone(date_str):
1709 m = re.search(
f137e4c2 1710 r'''(?x)
1711 ^.{8,}? # >=8 char non-TZ prefix, if present
1712 (?P<tz>Z| # just the UTC Z, or
1713 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1714 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1715 [ ]? # optional space
1716 (?P<sign>\+|-) # +/-
1717 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1718 $)
1719 ''', date_str)
46f59e89
S
1720 if not m:
1721 timezone = datetime.timedelta()
1722 else:
1723 date_str = date_str[:-len(m.group('tz'))]
1724 if not m.group('sign'):
1725 timezone = datetime.timedelta()
1726 else:
1727 sign = 1 if m.group('sign') == '+' else -1
1728 timezone = datetime.timedelta(
1729 hours=sign * int(m.group('hours')),
1730 minutes=sign * int(m.group('minutes')))
1731 return timezone, date_str
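# Illustrative usage (example value assumed, worked out from the regex above):
#   extract_timezone('2021-06-01T12:00:00+02:00')
#   -> (datetime.timedelta(hours=2), '2021-06-01T12:00:00')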
1732
1733
08b38d54 1734def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1735 """ Return a UNIX timestamp from the given date """
1736
1737 if date_str is None:
1738 return None
1739
52c3a6e4
S
1740 date_str = re.sub(r'\.[0-9]+', '', date_str)
1741
08b38d54 1742 if timezone is None:
46f59e89
S
1743 timezone, date_str = extract_timezone(date_str)
1744
52c3a6e4
S
1745 try:
1746 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1747 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1748 return calendar.timegm(dt.timetuple())
1749 except ValueError:
1750 pass
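# Illustrative usage (example value assumed; timestamp worked out by hand):
#   parse_iso8601('2021-06-01T12:00:00+02:00') -> 1622541600
#   (12:00 at +02:00 is 10:00 UTC on 2021-06-01)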
912b38b4
PH
1751
1752
46f59e89
S
1753def date_formats(day_first=True):
1754 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1755
1756
42bdd9d0 1757def unified_strdate(date_str, day_first=True):
bf50b038 1758 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1759
1760 if date_str is None:
1761 return None
bf50b038 1762 upload_date = None
5f6a1245 1763 # Replace commas
026fcc04 1764 date_str = date_str.replace(',', ' ')
42bdd9d0 1765 # Remove AM/PM + timezone
9bb8e0a3 1766 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1767 _, date_str = extract_timezone(date_str)
42bdd9d0 1768
46f59e89 1769 for expression in date_formats(day_first):
bf50b038
JMF
1770 try:
1771 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
5de90176 1772 except ValueError:
bf50b038 1773 pass
42393ce2
PH
1774 if upload_date is None:
1775 timetuple = email.utils.parsedate_tz(date_str)
1776 if timetuple:
c6b9cf05
S
1777 try:
1778 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1779 except ValueError:
1780 pass
6a750402
JMF
1781 if upload_date is not None:
1782 return compat_str(upload_date)
bf50b038 1783
5f6a1245 1784
46f59e89
S
1785def unified_timestamp(date_str, day_first=True):
1786 if date_str is None:
1787 return None
1788
2ae2ffda 1789 date_str = re.sub(r'[,|]', '', date_str)
46f59e89 1790
7dc2a74e 1791 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1792 timezone, date_str = extract_timezone(date_str)
1793
1794 # Remove AM/PM + timezone
1795 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1796
deef3195
S
 1797 # Remove unrecognized timezones from ISO 8601-like timestamps
1798 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1799 if m:
1800 date_str = date_str[:-len(m.group('tz'))]
1801
f226880c
PH
1802 # Python only supports microseconds, so remove nanoseconds
1803 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1804 if m:
1805 date_str = m.group(1)
1806
46f59e89
S
1807 for expression in date_formats(day_first):
1808 try:
7dc2a74e 1809 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89
S
1810 return calendar.timegm(dt.timetuple())
1811 except ValueError:
1812 pass
1813 timetuple = email.utils.parsedate_tz(date_str)
1814 if timetuple:
7dc2a74e 1815 return calendar.timegm(timetuple) + pm_delta * 3600
46f59e89
S
1816
1817
28e614de 1818def determine_ext(url, default_ext='unknown_video'):
85750f89 1819 if url is None or '.' not in url:
f4776371 1820 return default_ext
9cb9a5df 1821 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1822 if re.match(r'^[A-Za-z0-9]+$', guess):
1823 return guess
a7aaa398
S
 1824 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1825 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1826 return guess.rstrip('/')
73e79f2a 1827 else:
cbdbb766 1828 return default_ext
73e79f2a 1829
5f6a1245 1830
824fa511
S
1831def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1832 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
d4051a8e 1833
5f6a1245 1834
9e62f283 1835def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
37254abc
JMF
1836 """
1837 Return a datetime object from a string in the format YYYYMMDD or
9e62f283 1838 (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1839
1840 format: string date format used to return datetime object from
1841 precision: round the time portion of a datetime object.
1842 auto|microsecond|second|minute|hour|day.
1843 auto: round to the unit provided in date_str (if applicable).
1844 """
1845 auto_precision = False
1846 if precision == 'auto':
1847 auto_precision = True
1848 precision = 'microsecond'
396a76f7 1849 today = datetime_round(datetime.datetime.utcnow(), precision)
f8795e10 1850 if date_str in ('now', 'today'):
37254abc 1851 return today
f8795e10
PH
1852 if date_str == 'yesterday':
1853 return today - datetime.timedelta(days=1)
9e62f283 1854 match = re.match(
1855 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
1856 date_str)
37254abc 1857 if match is not None:
9e62f283 1858 start_time = datetime_from_str(match.group('start'), precision, format)
1859 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
37254abc 1860 unit = match.group('unit')
9e62f283 1861 if unit == 'month' or unit == 'year':
1862 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
37254abc 1863 unit = 'day'
9e62f283 1864 else:
1865 if unit == 'week':
1866 unit = 'day'
1867 time *= 7
1868 delta = datetime.timedelta(**{unit + 's': time})
1869 new_date = start_time + delta
1870 if auto_precision:
1871 return datetime_round(new_date, unit)
1872 return new_date
1873
1874 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
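# Illustrative usage (example values assumed):
#   datetime_from_str('now-1week') -> UTC now minus 7 days, rounded to day precision
#   datetime_from_str('20210601')  -> datetime.datetime(2021, 6, 1, 0, 0)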
1875
1876
1877def date_from_str(date_str, format='%Y%m%d'):
1878 """
1879 Return a datetime object from a string in the format YYYYMMDD or
1880 (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1881
1882 format: string date format used to return datetime object from
1883 """
1884 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1885
1886
1887def datetime_add_months(dt, months):
1888 """Increment/Decrement a datetime object by months."""
1889 month = dt.month + months - 1
1890 year = dt.year + month // 12
1891 month = month % 12 + 1
1892 day = min(dt.day, calendar.monthrange(year, month)[1])
1893 return dt.replace(year, month, day)
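# Illustrative usage (example values assumed): the day is clamped to the target month, e.g.
#   datetime_add_months(datetime.datetime(2021, 1, 31), 1) -> datetime.datetime(2021, 2, 28, 0, 0)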
1894
1895
1896def datetime_round(dt, precision='day'):
1897 """
1898 Round a datetime object's time to a specific precision
1899 """
1900 if precision == 'microsecond':
1901 return dt
1902
1903 unit_seconds = {
1904 'day': 86400,
1905 'hour': 3600,
1906 'minute': 60,
1907 'second': 1,
1908 }
1909 roundto = lambda x, n: ((x + n / 2) // n) * n
1910 timestamp = calendar.timegm(dt.timetuple())
1911 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
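# Illustrative usage (example value assumed): rounding is to the nearest unit, e.g.
#   datetime_round(datetime.datetime(2021, 6, 1, 13, 0), 'day') -> datetime.datetime(2021, 6, 2, 0, 0)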
5f6a1245
JW
1912
1913
e63fc1be 1914def hyphenate_date(date_str):
1915 """
1916 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1917 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1918 if match is not None:
1919 return '-'.join(match.groups())
1920 else:
1921 return date_str
1922
5f6a1245 1923
bd558525
JMF
1924class DateRange(object):
1925 """Represents a time interval between two dates"""
5f6a1245 1926
bd558525
JMF
1927 def __init__(self, start=None, end=None):
1928 """start and end must be strings in the format accepted by date"""
1929 if start is not None:
1930 self.start = date_from_str(start)
1931 else:
1932 self.start = datetime.datetime.min.date()
1933 if end is not None:
1934 self.end = date_from_str(end)
1935 else:
1936 self.end = datetime.datetime.max.date()
37254abc 1937 if self.start > self.end:
bd558525 1938 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
5f6a1245 1939
bd558525
JMF
1940 @classmethod
1941 def day(cls, day):
1942 """Returns a range that only contains the given day"""
5f6a1245
JW
1943 return cls(day, day)
1944
bd558525
JMF
1945 def __contains__(self, date):
1946 """Check if the date is in the range"""
37254abc
JMF
1947 if not isinstance(date, datetime.date):
1948 date = date_from_str(date)
1949 return self.start <= date <= self.end
5f6a1245 1950
bd558525 1951 def __str__(self):
5f6a1245 1952 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
1953
1954
1955def platform_name():
1956 """ Returns the platform name as a compat_str """
1957 res = platform.platform()
1958 if isinstance(res, bytes):
1959 res = res.decode(preferredencoding())
1960
1961 assert isinstance(res, compat_str)
1962 return res
c257baff
PH
1963
1964
49fa4d9a
N
1965def get_windows_version():
1966 ''' Get Windows version. None if it's not running on Windows '''
1967 if compat_os_name == 'nt':
1968 return version_tuple(platform.win32_ver()[1])
1969 else:
1970 return None
1971
1972
b58ddb32
PH
1973def _windows_write_string(s, out):
1974 """ Returns True if the string was written using special methods,
1975 False if it has yet to be written out."""
1976 # Adapted from http://stackoverflow.com/a/3259271/35070
1977
b58ddb32
PH
1978 import ctypes.wintypes
1979
1980 WIN_OUTPUT_IDS = {
1981 1: -11,
1982 2: -12,
1983 }
1984
a383a98a
PH
1985 try:
1986 fileno = out.fileno()
1987 except AttributeError:
1988 # If the output stream doesn't have a fileno, it's virtual
1989 return False
aa42e873
PH
1990 except io.UnsupportedOperation:
1991 # Some strange Windows pseudo files?
1992 return False
b58ddb32
PH
1993 if fileno not in WIN_OUTPUT_IDS:
1994 return False
1995
d7cd9a9e 1996 GetStdHandle = compat_ctypes_WINFUNCTYPE(
b58ddb32 1997 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
d7cd9a9e 1998 ('GetStdHandle', ctypes.windll.kernel32))
b58ddb32
PH
1999 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
2000
d7cd9a9e 2001 WriteConsoleW = compat_ctypes_WINFUNCTYPE(
b58ddb32
PH
2002 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
2003 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
d7cd9a9e 2004 ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
b58ddb32
PH
2005 written = ctypes.wintypes.DWORD(0)
2006
d7cd9a9e 2007 GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
b58ddb32
PH
2008 FILE_TYPE_CHAR = 0x0002
2009 FILE_TYPE_REMOTE = 0x8000
d7cd9a9e 2010 GetConsoleMode = compat_ctypes_WINFUNCTYPE(
b58ddb32
PH
2011 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
2012 ctypes.POINTER(ctypes.wintypes.DWORD))(
d7cd9a9e 2013 ('GetConsoleMode', ctypes.windll.kernel32))
b58ddb32
PH
2014 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
2015
2016 def not_a_console(handle):
2017 if handle == INVALID_HANDLE_VALUE or handle is None:
2018 return True
3089bc74
S
2019 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
2020 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
b58ddb32
PH
2021
2022 if not_a_console(h):
2023 return False
2024
d1b9c912
PH
2025 def next_nonbmp_pos(s):
2026 try:
2027 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
2028 except StopIteration:
2029 return len(s)
2030
2031 while s:
2032 count = min(next_nonbmp_pos(s), 1024)
2033
b58ddb32 2034 ret = WriteConsoleW(
d1b9c912 2035 h, s, count if count else 2, ctypes.byref(written), None)
b58ddb32
PH
2036 if ret == 0:
2037 raise OSError('Failed to write string')
d1b9c912
PH
2038 if not count: # We just wrote a non-BMP character
2039 assert written.value == 2
2040 s = s[1:]
2041 else:
2042 assert written.value > 0
2043 s = s[written.value:]
b58ddb32
PH
2044 return True
2045
2046
734f90bb 2047def write_string(s, out=None, encoding=None):
7459e3a2
PH
2048 if out is None:
2049 out = sys.stderr
8bf48f23 2050 assert type(s) == compat_str
7459e3a2 2051
b58ddb32
PH
2052 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
2053 if _windows_write_string(s, out):
2054 return
2055
3089bc74
S
2056 if ('b' in getattr(out, 'mode', '')
2057 or sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
104aa738
PH
2058 byt = s.encode(encoding or preferredencoding(), 'ignore')
2059 out.write(byt)
2060 elif hasattr(out, 'buffer'):
2061 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2062 byt = s.encode(enc, 'ignore')
2063 out.buffer.write(byt)
2064 else:
8bf48f23 2065 out.write(s)
7459e3a2
PH
2066 out.flush()
2067
2068
48ea9cea
PH
2069def bytes_to_intlist(bs):
2070 if not bs:
2071 return []
2072 if isinstance(bs[0], int): # Python 3
2073 return list(bs)
2074 else:
2075 return [ord(c) for c in bs]
2076
c257baff 2077
cba892fa 2078def intlist_to_bytes(xs):
2079 if not xs:
2080 return b''
edaa23f8 2081 return compat_struct_pack('%dB' % len(xs), *xs)
c38b1e77
PH
2082
2083
c1c9a79c
PH
2084# Cross-platform file locking
2085if sys.platform == 'win32':
2086 import ctypes.wintypes
2087 import msvcrt
2088
2089 class OVERLAPPED(ctypes.Structure):
2090 _fields_ = [
2091 ('Internal', ctypes.wintypes.LPVOID),
2092 ('InternalHigh', ctypes.wintypes.LPVOID),
2093 ('Offset', ctypes.wintypes.DWORD),
2094 ('OffsetHigh', ctypes.wintypes.DWORD),
2095 ('hEvent', ctypes.wintypes.HANDLE),
2096 ]
2097
2098 kernel32 = ctypes.windll.kernel32
2099 LockFileEx = kernel32.LockFileEx
2100 LockFileEx.argtypes = [
2101 ctypes.wintypes.HANDLE, # hFile
2102 ctypes.wintypes.DWORD, # dwFlags
2103 ctypes.wintypes.DWORD, # dwReserved
2104 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2105 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2106 ctypes.POINTER(OVERLAPPED) # Overlapped
2107 ]
2108 LockFileEx.restype = ctypes.wintypes.BOOL
2109 UnlockFileEx = kernel32.UnlockFileEx
2110 UnlockFileEx.argtypes = [
2111 ctypes.wintypes.HANDLE, # hFile
2112 ctypes.wintypes.DWORD, # dwReserved
2113 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2114 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2115 ctypes.POINTER(OVERLAPPED) # Overlapped
2116 ]
2117 UnlockFileEx.restype = ctypes.wintypes.BOOL
2118 whole_low = 0xffffffff
2119 whole_high = 0x7fffffff
2120
2121 def _lock_file(f, exclusive):
2122 overlapped = OVERLAPPED()
2123 overlapped.Offset = 0
2124 overlapped.OffsetHigh = 0
2125 overlapped.hEvent = 0
2126 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2127 handle = msvcrt.get_osfhandle(f.fileno())
2128 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
2129 whole_low, whole_high, f._lock_file_overlapped_p):
2130 raise OSError('Locking file failed: %r' % ctypes.FormatError())
2131
2132 def _unlock_file(f):
2133 assert f._lock_file_overlapped_p
2134 handle = msvcrt.get_osfhandle(f.fileno())
2135 if not UnlockFileEx(handle, 0,
2136 whole_low, whole_high, f._lock_file_overlapped_p):
2137 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2138
2139else:
399a76e6
YCH
 2140 # Some platforms, such as Jython, are missing fcntl
2141 try:
2142 import fcntl
c1c9a79c 2143
399a76e6
YCH
2144 def _lock_file(f, exclusive):
2145 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
c1c9a79c 2146
399a76e6
YCH
2147 def _unlock_file(f):
2148 fcntl.flock(f, fcntl.LOCK_UN)
2149 except ImportError:
2150 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
2151
2152 def _lock_file(f, exclusive):
2153 raise IOError(UNSUPPORTED_MSG)
2154
2155 def _unlock_file(f):
2156 raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
2157
2158
2159class locked_file(object):
2160 def __init__(self, filename, mode, encoding=None):
2161 assert mode in ['r', 'a', 'w']
2162 self.f = io.open(filename, mode, encoding=encoding)
2163 self.mode = mode
2164
2165 def __enter__(self):
2166 exclusive = self.mode != 'r'
2167 try:
2168 _lock_file(self.f, exclusive)
2169 except IOError:
2170 self.f.close()
2171 raise
2172 return self
2173
2174 def __exit__(self, etype, value, traceback):
2175 try:
2176 _unlock_file(self.f)
2177 finally:
2178 self.f.close()
2179
2180 def __iter__(self):
2181 return iter(self.f)
2182
2183 def write(self, *args):
2184 return self.f.write(*args)
2185
2186 def read(self, *args):
2187 return self.f.read(*args)
4eb7f1d1
JMF
2188
2189
4644ac55
S
2190def get_filesystem_encoding():
2191 encoding = sys.getfilesystemencoding()
2192 return encoding if encoding is not None else 'utf-8'
2193
2194
4eb7f1d1 2195def shell_quote(args):
a6a173c2 2196 quoted_args = []
4644ac55 2197 encoding = get_filesystem_encoding()
a6a173c2
JMF
2198 for a in args:
2199 if isinstance(a, bytes):
2200 # We may get a filename encoded with 'encodeFilename'
2201 a = a.decode(encoding)
aefce8e6 2202 quoted_args.append(compat_shlex_quote(a))
28e614de 2203 return ' '.join(quoted_args)
9d4660ca
PH
2204
2205
2206def smuggle_url(url, data):
2207 """ Pass additional data in a URL for internal use. """
2208
81953d1a
RA
2209 url, idata = unsmuggle_url(url, {})
2210 data.update(idata)
15707c7e 2211 sdata = compat_urllib_parse_urlencode(
28e614de
PH
2212 {'__youtubedl_smuggle': json.dumps(data)})
2213 return url + '#' + sdata
9d4660ca
PH
2214
2215
79f82953 2216def unsmuggle_url(smug_url, default=None):
83e865a3 2217 if '#__youtubedl_smuggle' not in smug_url:
79f82953 2218 return smug_url, default
28e614de
PH
2219 url, _, sdata = smug_url.rpartition('#')
2220 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
2221 data = json.loads(jsond)
2222 return url, data
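# Illustrative round trip (example URL and payload assumed):
#   url = smuggle_url('http://example.com/video', {'referer': 'http://example.com/'})
#   unsmuggle_url(url) -> ('http://example.com/video', {'referer': 'http://example.com/'})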
02dbf93f
PH
2223
2224
e0fd9573 2225def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2226 """ Formats numbers with decimal sufixes like K, M, etc """
2227 num, factor = float_or_none(num), float(factor)
2228 if num is None:
2229 return None
2230 exponent = 0 if num == 0 else int(math.log(num, factor))
abbeeebc 2231 suffix = ['', *'kMGTPEZY'][exponent]
2232 if factor == 1024:
2233 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
e0fd9573 2234 converted = num / (factor ** exponent)
abbeeebc 2235 return fmt % (converted, suffix)
e0fd9573 2236
2237
02dbf93f 2238def format_bytes(bytes):
f02d24d8 2239 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
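# Illustrative usage (example values, worked out from the code above):
#   format_decimal_suffix(123456) -> '123k'
#   format_bytes(123456)          -> '120.56KiB'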
f53c966a 2240
1c088fa8 2241
fb47597b
S
2242def lookup_unit_table(unit_table, s):
2243 units_re = '|'.join(re.escape(u) for u in unit_table)
2244 m = re.match(
782b1b5b 2245 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
fb47597b
S
2246 if not m:
2247 return None
2248 num_str = m.group('num').replace(',', '.')
2249 mult = unit_table[m.group('unit')]
2250 return int(float(num_str) * mult)
2251
2252
be64b5b0
PH
2253def parse_filesize(s):
2254 if s is None:
2255 return None
2256
dfb1b146 2257 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
2258 # but we support those too
2259 _UNIT_TABLE = {
2260 'B': 1,
2261 'b': 1,
70852b47 2262 'bytes': 1,
be64b5b0
PH
2263 'KiB': 1024,
2264 'KB': 1000,
2265 'kB': 1024,
2266 'Kb': 1000,
13585d76 2267 'kb': 1000,
70852b47
YCH
2268 'kilobytes': 1000,
2269 'kibibytes': 1024,
be64b5b0
PH
2270 'MiB': 1024 ** 2,
2271 'MB': 1000 ** 2,
2272 'mB': 1024 ** 2,
2273 'Mb': 1000 ** 2,
13585d76 2274 'mb': 1000 ** 2,
70852b47
YCH
2275 'megabytes': 1000 ** 2,
2276 'mebibytes': 1024 ** 2,
be64b5b0
PH
2277 'GiB': 1024 ** 3,
2278 'GB': 1000 ** 3,
2279 'gB': 1024 ** 3,
2280 'Gb': 1000 ** 3,
13585d76 2281 'gb': 1000 ** 3,
70852b47
YCH
2282 'gigabytes': 1000 ** 3,
2283 'gibibytes': 1024 ** 3,
be64b5b0
PH
2284 'TiB': 1024 ** 4,
2285 'TB': 1000 ** 4,
2286 'tB': 1024 ** 4,
2287 'Tb': 1000 ** 4,
13585d76 2288 'tb': 1000 ** 4,
70852b47
YCH
2289 'terabytes': 1000 ** 4,
2290 'tebibytes': 1024 ** 4,
be64b5b0
PH
2291 'PiB': 1024 ** 5,
2292 'PB': 1000 ** 5,
2293 'pB': 1024 ** 5,
2294 'Pb': 1000 ** 5,
13585d76 2295 'pb': 1000 ** 5,
70852b47
YCH
2296 'petabytes': 1000 ** 5,
2297 'pebibytes': 1024 ** 5,
be64b5b0
PH
2298 'EiB': 1024 ** 6,
2299 'EB': 1000 ** 6,
2300 'eB': 1024 ** 6,
2301 'Eb': 1000 ** 6,
13585d76 2302 'eb': 1000 ** 6,
70852b47
YCH
2303 'exabytes': 1000 ** 6,
2304 'exbibytes': 1024 ** 6,
be64b5b0
PH
2305 'ZiB': 1024 ** 7,
2306 'ZB': 1000 ** 7,
2307 'zB': 1024 ** 7,
2308 'Zb': 1000 ** 7,
13585d76 2309 'zb': 1000 ** 7,
70852b47
YCH
2310 'zettabytes': 1000 ** 7,
2311 'zebibytes': 1024 ** 7,
be64b5b0
PH
2312 'YiB': 1024 ** 8,
2313 'YB': 1000 ** 8,
2314 'yB': 1024 ** 8,
2315 'Yb': 1000 ** 8,
13585d76 2316 'yb': 1000 ** 8,
70852b47
YCH
2317 'yottabytes': 1000 ** 8,
2318 'yobibytes': 1024 ** 8,
be64b5b0
PH
2319 }
2320
fb47597b
S
2321 return lookup_unit_table(_UNIT_TABLE, s)
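# Illustrative usage (example values assumed):
#   parse_filesize('1.5 GiB') -> 1610612736
#   parse_filesize('500 MB')  -> 500000000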
2322
2323
2324def parse_count(s):
2325 if s is None:
be64b5b0
PH
2326 return None
2327
352d5da8 2328 s = re.sub(r'^[^\d]+\s', '', s).strip()
fb47597b
S
2329
2330 if re.match(r'^[\d,.]+$', s):
2331 return str_to_int(s)
2332
2333 _UNIT_TABLE = {
2334 'k': 1000,
2335 'K': 1000,
2336 'm': 1000 ** 2,
2337 'M': 1000 ** 2,
2338 'kk': 1000 ** 2,
2339 'KK': 1000 ** 2,
352d5da8 2340 'b': 1000 ** 3,
2341 'B': 1000 ** 3,
fb47597b 2342 }
be64b5b0 2343
352d5da8 2344 ret = lookup_unit_table(_UNIT_TABLE, s)
2345 if ret is not None:
2346 return ret
2347
2348 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2349 if mobj:
2350 return str_to_int(mobj.group(1))
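# Illustrative usage (example values, worked out from the table above):
#   parse_count('1.2M')        -> 1200000
#   parse_count('1,000 views') -> 1000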
be64b5b0 2351
2f7ae819 2352
b871d7e9
S
2353def parse_resolution(s):
2354 if s is None:
2355 return {}
2356
17ec8bcf 2357 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
b871d7e9
S
2358 if mobj:
2359 return {
2360 'width': int(mobj.group('w')),
2361 'height': int(mobj.group('h')),
2362 }
2363
17ec8bcf 2364 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
b871d7e9
S
2365 if mobj:
2366 return {'height': int(mobj.group(1))}
2367
2368 mobj = re.search(r'\b([48])[kK]\b', s)
2369 if mobj:
2370 return {'height': int(mobj.group(1)) * 540}
2371
2372 return {}
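# Illustrative usage (example values assumed):
#   parse_resolution('1920x1080') -> {'width': 1920, 'height': 1080}
#   parse_resolution('720p')      -> {'height': 720}
#   parse_resolution('4k')        -> {'height': 2160}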
2373
2374
0dc41787
S
2375def parse_bitrate(s):
2376 if not isinstance(s, compat_str):
2377 return
2378 mobj = re.search(r'\b(\d+)\s*kbps', s)
2379 if mobj:
2380 return int(mobj.group(1))
2381
2382
a942d6cb 2383def month_by_name(name, lang='en'):
caefb1de
PH
2384 """ Return the number of a month by (locale-independently) English name """
2385
f6717dec 2386 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 2387
caefb1de 2388 try:
f6717dec 2389 return month_names.index(name) + 1
7105440c
YCH
2390 except ValueError:
2391 return None
2392
2393
2394def month_by_abbreviation(abbrev):
2395 """ Return the number of a month by (locale-independently) English
2396 abbreviations """
2397
2398 try:
2399 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
2400 except ValueError:
2401 return None
18258362
JMF
2402
2403
5aafe895 2404def fix_xml_ampersands(xml_str):
18258362 2405 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
2406 return re.sub(
2407 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 2408 '&amp;',
5aafe895 2409 xml_str)
e3946f98
PH
2410
2411
2412def setproctitle(title):
8bf48f23 2413 assert isinstance(title, compat_str)
c1c05c67
YCH
2414
2415 # ctypes in Jython is not complete
2416 # http://bugs.jython.org/issue2148
2417 if sys.platform.startswith('java'):
2418 return
2419
e3946f98 2420 try:
611c1dd9 2421 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
2422 except OSError:
2423 return
2f49bcd6
RC
2424 except TypeError:
2425 # LoadLibrary in Windows Python 2.7.13 only expects
2426 # a bytestring, but since unicode_literals turns
2427 # every string into a unicode string, it fails.
2428 return
6eefe533
PH
2429 title_bytes = title.encode('utf-8')
2430 buf = ctypes.create_string_buffer(len(title_bytes))
2431 buf.value = title_bytes
e3946f98 2432 try:
6eefe533 2433 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
2434 except AttributeError:
2435 return # Strange libc, just skip this
d7dda168
PH
2436
2437
2438def remove_start(s, start):
46bc9b7d 2439 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
2440
2441
2b9faf55 2442def remove_end(s, end):
46bc9b7d 2443 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
2444
2445
31b2051e
S
2446def remove_quotes(s):
2447 if s is None or len(s) < 2:
2448 return s
2449 for quote in ('"', "'", ):
2450 if s[0] == quote and s[-1] == quote:
2451 return s[1:-1]
2452 return s
2453
2454
b6e0c7d2
U
2455def get_domain(url):
2456 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2457 return domain.group('domain') if domain else None
2458
2459
29eb5174 2460def url_basename(url):
9b8aaeed 2461 path = compat_urlparse.urlparse(url).path
28e614de 2462 return path.strip('/').split('/')[-1]
aa94a6d3
PH
2463
2464
02dc0a36
S
2465def base_url(url):
2466 return re.match(r'https?://[^?#&]+/', url).group()
2467
2468
e34c3361 2469def urljoin(base, path):
4b5de77b
S
2470 if isinstance(path, bytes):
2471 path = path.decode('utf-8')
e34c3361
S
2472 if not isinstance(path, compat_str) or not path:
2473 return None
fad4ceb5 2474 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
e34c3361 2475 return path
4b5de77b
S
2476 if isinstance(base, bytes):
2477 base = base.decode('utf-8')
2478 if not isinstance(base, compat_str) or not re.match(
2479 r'^(?:https?:)?//', base):
e34c3361
S
2480 return None
2481 return compat_urlparse.urljoin(base, path)
2482
2483
aa94a6d3
PH
2484class HEADRequest(compat_urllib_request.Request):
2485 def get_method(self):
611c1dd9 2486 return 'HEAD'
7217e148
PH
2487
2488
95cf60e8
S
2489class PUTRequest(compat_urllib_request.Request):
2490 def get_method(self):
2491 return 'PUT'
2492
2493
9732d77e 2494def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
9e907ebd 2495 if get_attr and v is not None:
2496 v = getattr(v, get_attr, None)
1812afb7
S
2497 try:
2498 return int(v) * invscale // scale
31c49255 2499 except (ValueError, TypeError, OverflowError):
af98f8ff 2500 return default
9732d77e 2501
9572013d 2502
40a90862
JMF
2503def str_or_none(v, default=None):
2504 return default if v is None else compat_str(v)
2505
9732d77e
PH
2506
2507def str_to_int(int_str):
48d4681e 2508 """ A more relaxed version of int_or_none """
42db58ec 2509 if isinstance(int_str, compat_integer_types):
348c6bf1 2510 return int_str
42db58ec
S
2511 elif isinstance(int_str, compat_str):
2512 int_str = re.sub(r'[,\.\+]', '', int_str)
2513 return int_or_none(int_str)
608d11f5
PH
2514
2515
9732d77e 2516def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
2517 if v is None:
2518 return default
2519 try:
2520 return float(v) * invscale / scale
5e1271c5 2521 except (ValueError, TypeError):
caf80631 2522 return default
43f775e4
PH
2523
2524
c7e327c4
S
2525def bool_or_none(v, default=None):
2526 return v if isinstance(v, bool) else default
2527
2528
53cd37ba
S
2529def strip_or_none(v, default=None):
2530 return v.strip() if isinstance(v, compat_str) else default
b72b4431
S
2531
2532
af03000a
S
2533def url_or_none(url):
2534 if not url or not isinstance(url, compat_str):
2535 return None
2536 url = url.strip()
29f7c58a 2537 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
2538
2539
e29663c6 2540def strftime_or_none(timestamp, date_format, default=None):
2541 datetime_object = None
2542 try:
2543 if isinstance(timestamp, compat_numeric_types): # unix timestamp
2544 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2545 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2546 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2547 return datetime_object.strftime(date_format)
2548 except (ValueError, TypeError, AttributeError):
2549 return default
2550
2551
608d11f5 2552def parse_duration(s):
8f9312c3 2553 if not isinstance(s, compat_basestring):
608d11f5 2554 return None
ca7b3246 2555 s = s.strip()
38d79fd1 2556 if not s:
2557 return None
ca7b3246 2558
acaff495 2559 days, hours, mins, secs, ms = [None] * 5
8bd1c00b 2560 m = re.match(r'''(?x)
2561 (?P<before_secs>
2562 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2563 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2564 (?P<ms>[.:][0-9]+)?Z?$
2565 ''', s)
acaff495 2566 if m:
8bd1c00b 2567 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
acaff495 2568 else:
2569 m = re.match(
056653bb
S
2570 r'''(?ix)(?:P?
2571 (?:
2572 [0-9]+\s*y(?:ears?)?\s*
2573 )?
2574 (?:
2575 [0-9]+\s*m(?:onths?)?\s*
2576 )?
2577 (?:
2578 [0-9]+\s*w(?:eeks?)?\s*
2579 )?
8f4b58d7 2580 (?:
acaff495 2581 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
8f4b58d7 2582 )?
056653bb 2583 T)?
acaff495 2584 (?:
2585 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
2586 )?
2587 (?:
2588 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
2589 )?
2590 (?:
2591 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2592 )?Z?$''', s)
acaff495 2593 if m:
2594 days, hours, mins, secs, ms = m.groups()
2595 else:
15846398 2596 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2597 if m:
2598 hours, mins = m.groups()
2599 else:
2600 return None
2601
2602 duration = 0
2603 if secs:
2604 duration += float(secs)
2605 if mins:
2606 duration += float(mins) * 60
2607 if hours:
2608 duration += float(hours) * 60 * 60
2609 if days:
2610 duration += float(days) * 24 * 60 * 60
2611 if ms:
8bd1c00b 2612 duration += float(ms.replace(':', '.'))
acaff495 2613 return duration
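# Illustrative usage (example values, worked out from the regexes above):
#   parse_duration('1:02:03') -> 3723.0
#   parse_duration('2h 30m')  -> 9000.0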
91d7d0b3
JMF
2614
2615
e65e4c88 2616def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 2617 name, real_ext = os.path.splitext(filename)
e65e4c88
S
2618 return (
2619 '{0}.{1}{2}'.format(name, ext, real_ext)
2620 if not expected_real_ext or real_ext[1:] == expected_real_ext
2621 else '{0}.{1}'.format(filename, ext))
d70ad093
PH
2622
2623
b3ed15b7
S
2624def replace_extension(filename, ext, expected_real_ext=None):
2625 name, real_ext = os.path.splitext(filename)
2626 return '{0}.{1}'.format(
2627 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2628 ext)
2629
2630
d70ad093
PH
2631def check_executable(exe, args=[]):
2632 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2633 args can be a list of arguments for a short output (like -version) """
2634 try:
d3c93ec2 2635 Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
d70ad093
PH
2636 except OSError:
2637 return False
2638 return exe
b7ab0590
PH
2639
2640
9af98e17 2641def _get_exe_version_output(exe, args):
95807118 2642 try:
b64d04c1 2643 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
7a5c1cfe 2644 # SIGTTOU if yt-dlp is run in the background.
067aa17e 2645 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
d3c93ec2 2646 out, _ = Popen(
2647 [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2648 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
95807118
PH
2649 except OSError:
2650 return False
cae97f65
PH
2651 if isinstance(out, bytes): # Python 2.x
2652 out = out.decode('ascii', 'ignore')
9af98e17 2653 return out
cae97f65
PH
2654
2655
2656def detect_exe_version(output, version_re=None, unrecognized='present'):
2657 assert isinstance(output, compat_str)
2658 if version_re is None:
2659 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2660 m = re.search(version_re, output)
95807118
PH
2661 if m:
2662 return m.group(1)
2663 else:
2664 return unrecognized
2665
2666
9af98e17 2667def get_exe_version(exe, args=['--version'],
2668 version_re=None, unrecognized='present'):
2669 """ Returns the version of the specified executable,
2670 or False if the executable is not present """
2671 out = _get_exe_version_output(exe, args)
2672 return detect_exe_version(out, version_re, unrecognized) if out else False
2673
2674
cb89cfc1 2675class LazyList(collections.abc.Sequence):
483336e7 2676 ''' Lazy immutable list from an iterable
2677 Note that slices of a LazyList are lists and not LazyList'''
2678
8e5fecc8 2679 class IndexError(IndexError):
2680 pass
2681
282f5709 2682 def __init__(self, iterable, *, reverse=False, _cache=None):
483336e7 2683 self.__iterable = iter(iterable)
282f5709 2684 self.__cache = [] if _cache is None else _cache
2685 self.__reversed = reverse
483336e7 2686
2687 def __iter__(self):
28419ca2 2688 if self.__reversed:
2689 # We need to consume the entire iterable to iterate in reverse
981052c9 2690 yield from self.exhaust()
28419ca2 2691 return
2692 yield from self.__cache
483336e7 2693 for item in self.__iterable:
2694 self.__cache.append(item)
2695 yield item
2696
981052c9 2697 def __exhaust(self):
483336e7 2698 self.__cache.extend(self.__iterable)
9f1a1c36 2699 # Discard the emptied iterable to make it pickle-able
2700 self.__iterable = []
28419ca2 2701 return self.__cache
2702
981052c9 2703 def exhaust(self):
2704 ''' Evaluate the entire iterable '''
2705 return self.__exhaust()[::-1 if self.__reversed else 1]
2706
28419ca2 2707 @staticmethod
981052c9 2708 def __reverse_index(x):
e0f2b4b4 2709 return None if x is None else -(x + 1)
483336e7 2710
2711 def __getitem__(self, idx):
2712 if isinstance(idx, slice):
28419ca2 2713 if self.__reversed:
e0f2b4b4 2714 idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
2715 start, stop, step = idx.start, idx.stop, idx.step or 1
483336e7 2716 elif isinstance(idx, int):
28419ca2 2717 if self.__reversed:
981052c9 2718 idx = self.__reverse_index(idx)
e0f2b4b4 2719 start, stop, step = idx, idx, 0
483336e7 2720 else:
2721 raise TypeError('indices must be integers or slices')
e0f2b4b4 2722 if ((start or 0) < 0 or (stop or 0) < 0
2723 or (start is None and step < 0)
2724 or (stop is None and step > 0)):
483336e7 2725 # We need to consume the entire iterable to be able to slice from the end
2726 # Obviously, never use this with infinite iterables
8e5fecc8 2727 self.__exhaust()
2728 try:
2729 return self.__cache[idx]
2730 except IndexError as e:
2731 raise self.IndexError(e) from e
e0f2b4b4 2732 n = max(start or 0, stop or 0) - len(self.__cache) + 1
28419ca2 2733 if n > 0:
2734 self.__cache.extend(itertools.islice(self.__iterable, n))
8e5fecc8 2735 try:
2736 return self.__cache[idx]
2737 except IndexError as e:
2738 raise self.IndexError(e) from e
483336e7 2739
2740 def __bool__(self):
2741 try:
28419ca2 2742 self[-1] if self.__reversed else self[0]
8e5fecc8 2743 except self.IndexError:
483336e7 2744 return False
2745 return True
2746
2747 def __len__(self):
8e5fecc8 2748 self.__exhaust()
483336e7 2749 return len(self.__cache)
2750
282f5709 2751 def __reversed__(self):
2752 return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)
2753
2754 def __copy__(self):
2755 return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)
2756
28419ca2 2757 def __repr__(self):
2758 # repr and str should mimic a list. So we exhaust the iterable
2759 return repr(self.exhaust())
2760
2761 def __str__(self):
2762 return repr(self.exhaust())
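# Illustrative usage (example assumed): slicing only consumes a finite prefix
# of the underlying iterable, and slices are plain lists, e.g.
#   LazyList(itertools.count())[:5] -> [0, 1, 2, 3, 4]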
2763
483336e7 2764
7be9ccff 2765class PagedList:
c07a39ae 2766
2767 class IndexError(IndexError):
2768 pass
2769
dd26ced1
PH
2770 def __len__(self):
2771 # This is only useful for tests
2772 return len(self.getslice())
2773
7be9ccff 2774 def __init__(self, pagefunc, pagesize, use_cache=True):
2775 self._pagefunc = pagefunc
2776 self._pagesize = pagesize
2777 self._use_cache = use_cache
2778 self._cache = {}
2779
2780 def getpage(self, pagenum):
d8cf8d97 2781 page_results = self._cache.get(pagenum)
2782 if page_results is None:
2783 page_results = list(self._pagefunc(pagenum))
7be9ccff 2784 if self._use_cache:
2785 self._cache[pagenum] = page_results
2786 return page_results
2787
2788 def getslice(self, start=0, end=None):
2789 return list(self._getslice(start, end))
2790
2791 def _getslice(self, start, end):
55575225 2792 raise NotImplementedError('This method must be implemented by subclasses')
2793
2794 def __getitem__(self, idx):
7be9ccff 2795 # NOTE: cache must be enabled if this is used
55575225 2796 if not isinstance(idx, int) or idx < 0:
2797 raise TypeError('indices must be non-negative integers')
2798 entries = self.getslice(idx, idx + 1)
d8cf8d97 2799 if not entries:
c07a39ae 2800 raise self.IndexError()
d8cf8d97 2801 return entries[0]
55575225 2802
9c44d242
PH
2803
2804class OnDemandPagedList(PagedList):
7be9ccff 2805 def _getslice(self, start, end):
b7ab0590
PH
2806 for pagenum in itertools.count(start // self._pagesize):
2807 firstid = pagenum * self._pagesize
2808 nextfirstid = pagenum * self._pagesize + self._pagesize
2809 if start >= nextfirstid:
2810 continue
2811
b7ab0590
PH
2812 startv = (
2813 start % self._pagesize
2814 if firstid <= start < nextfirstid
2815 else 0)
b7ab0590
PH
2816 endv = (
2817 ((end - 1) % self._pagesize) + 1
2818 if (end is not None and firstid <= end <= nextfirstid)
2819 else None)
2820
7be9ccff 2821 page_results = self.getpage(pagenum)
b7ab0590
PH
2822 if startv != 0 or endv is not None:
2823 page_results = page_results[startv:endv]
7be9ccff 2824 yield from page_results
b7ab0590
PH
2825
 2826 # A little optimization - if the current page is not "full", i.e. does
 2827 # not contain page_size videos, then we can assume that this page
 2828 # is the last one - there are no more ids on further pages -
 2829 # i.e. no need to query again.
2830 if len(page_results) + startv < self._pagesize:
2831 break
2832
2833 # If we got the whole page, but the next page is not interesting,
2834 # break out early as well
2835 if end == nextfirstid:
2836 break
81c2f20b
PH
2837
2838
9c44d242
PH
2839class InAdvancePagedList(PagedList):
2840 def __init__(self, pagefunc, pagecount, pagesize):
9c44d242 2841 self._pagecount = pagecount
7be9ccff 2842 PagedList.__init__(self, pagefunc, pagesize, True)
9c44d242 2843
7be9ccff 2844 def _getslice(self, start, end):
9c44d242
PH
2845 start_page = start // self._pagesize
2846 end_page = (
2847 self._pagecount if end is None else (end // self._pagesize + 1))
2848 skip_elems = start - start_page * self._pagesize
2849 only_more = None if end is None else end - start
2850 for pagenum in range(start_page, end_page):
7be9ccff 2851 page_results = self.getpage(pagenum)
9c44d242 2852 if skip_elems:
7be9ccff 2853 page_results = page_results[skip_elems:]
9c44d242
PH
2854 skip_elems = None
2855 if only_more is not None:
7be9ccff 2856 if len(page_results) < only_more:
2857 only_more -= len(page_results)
9c44d242 2858 else:
7be9ccff 2859 yield from page_results[:only_more]
9c44d242 2860 break
7be9ccff 2861 yield from page_results
9c44d242
PH
2862
2863
81c2f20b 2864def uppercase_escape(s):
676eb3f2 2865 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 2866 return re.sub(
a612753d 2867 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
2868 lambda m: unicode_escape(m.group(0))[0],
2869 s)
0fe2ff78
YCH
2870
2871
2872def lowercase_escape(s):
2873 unicode_escape = codecs.getdecoder('unicode_escape')
2874 return re.sub(
2875 r'\\u[0-9a-fA-F]{4}',
2876 lambda m: unicode_escape(m.group(0))[0],
2877 s)
b53466e1 2878
d05cfe06
S
2879
2880def escape_rfc3986(s):
2881 """Escape non-ASCII characters as suggested by RFC 3986"""
8f9312c3 2882 if sys.version_info < (3, 0) and isinstance(s, compat_str):
d05cfe06 2883 s = s.encode('utf-8')
ecc0c5ee 2884 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2885
2886
2887def escape_url(url):
2888 """Escape URL as suggested by RFC 3986"""
2889 url_parsed = compat_urllib_parse_urlparse(url)
2890 return url_parsed._replace(
efbed08d 2891 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
2892 path=escape_rfc3986(url_parsed.path),
2893 params=escape_rfc3986(url_parsed.params),
2894 query=escape_rfc3986(url_parsed.query),
2895 fragment=escape_rfc3986(url_parsed.fragment)
2896 ).geturl()
2897
62e609ab 2898
4dfbf869 2899def parse_qs(url):
2900 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2901
2902
62e609ab
PH
2903def read_batch_urls(batch_fd):
2904 def fixup(url):
2905 if not isinstance(url, compat_str):
2906 url = url.decode('utf-8', 'replace')
8c04f0be 2907 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2908 for bom in BOM_UTF8:
2909 if url.startswith(bom):
2910 url = url[len(bom):]
2911 url = url.lstrip()
2912 if not url or url.startswith(('#', ';', ']')):
62e609ab 2913 return False
8c04f0be 2914 # "#" cannot be stripped out since it is part of the URI
 2915 # However, it can be safely stripped out if following a whitespace
2916 return re.split(r'\s#', url, 1)[0].rstrip()
62e609ab
PH
2917
2918 with contextlib.closing(batch_fd) as fd:
2919 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2920
2921
2922def urlencode_postdata(*args, **kargs):
15707c7e 2923 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
2924
2925
38f9ef31 2926def update_url_query(url, query):
cacd9966
YCH
2927 if not query:
2928 return url
38f9ef31 2929 parsed_url = compat_urlparse.urlparse(url)
2930 qs = compat_parse_qs(parsed_url.query)
2931 qs.update(query)
2932 return compat_urlparse.urlunparse(parsed_url._replace(
15707c7e 2933 query=compat_urllib_parse_urlencode(qs, True)))
16392824 2934
8e60dc75 2935
ed0291d1
S
2936def update_Request(req, url=None, data=None, headers={}, query={}):
2937 req_headers = req.headers.copy()
2938 req_headers.update(headers)
2939 req_data = data or req.data
2940 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
2941 req_get_method = req.get_method()
2942 if req_get_method == 'HEAD':
2943 req_type = HEADRequest
2944 elif req_get_method == 'PUT':
2945 req_type = PUTRequest
2946 else:
2947 req_type = compat_urllib_request.Request
ed0291d1
S
2948 new_req = req_type(
2949 req_url, data=req_data, headers=req_headers,
2950 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2951 if hasattr(req, 'timeout'):
2952 new_req.timeout = req.timeout
2953 return new_req
2954
2955
10c87c15 2956def _multipart_encode_impl(data, boundary):
0c265486
YCH
2957 content_type = 'multipart/form-data; boundary=%s' % boundary
2958
2959 out = b''
2960 for k, v in data.items():
2961 out += b'--' + boundary.encode('ascii') + b'\r\n'
2962 if isinstance(k, compat_str):
2963 k = k.encode('utf-8')
2964 if isinstance(v, compat_str):
2965 v = v.encode('utf-8')
2966 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2967 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 2968 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
2969 if boundary.encode('ascii') in content:
2970 raise ValueError('Boundary overlaps with data')
2971 out += content
2972
2973 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2974
2975 return out, content_type
2976
2977
2978def multipart_encode(data, boundary=None):
2979 '''
2980 Encode a dict to RFC 7578-compliant form-data
2981
2982 data:
2983 A dict where keys and values can be either Unicode or bytes-like
2984 objects.
2985 boundary:
2986 If specified a Unicode object, it's used as the boundary. Otherwise
2987 a random boundary is generated.
2988
2989 Reference: https://tools.ietf.org/html/rfc7578
2990 '''
2991 has_specified_boundary = boundary is not None
2992
2993 while True:
2994 if boundary is None:
2995 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2996
2997 try:
10c87c15 2998 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
2999 break
3000 except ValueError:
3001 if has_specified_boundary:
3002 raise
3003 boundary = None
3004
3005 return out, content_type
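# Illustrative usage (boundary and field values assumed):
#   multipart_encode({'field': 'value'}, boundary='X')
#   -> (b'--X\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--X--\r\n',
#       'multipart/form-data; boundary=X')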
3006
3007
86296ad2 3008def dict_get(d, key_or_keys, default=None, skip_false_values=True):
cbecc9b9
S
3009 if isinstance(key_or_keys, (list, tuple)):
3010 for key in key_or_keys:
86296ad2
S
3011 if key not in d or d[key] is None or skip_false_values and not d[key]:
3012 continue
3013 return d[key]
cbecc9b9
S
3014 return default
3015 return d.get(key_or_keys, default)
3016
3017
329ca3be 3018def try_get(src, getter, expected_type=None):
6606817a 3019 for get in variadic(getter):
a32a9a7e
S
3020 try:
3021 v = get(src)
3022 except (AttributeError, KeyError, TypeError, IndexError):
3023 pass
3024 else:
3025 if expected_type is None or isinstance(v, expected_type):
3026 return v
329ca3be
S
3027
3028
6cc62232
S
3029def merge_dicts(*dicts):
3030 merged = {}
3031 for a_dict in dicts:
3032 for k, v in a_dict.items():
3033 if v is None:
3034 continue
3089bc74
S
3035 if (k not in merged
3036 or (isinstance(v, compat_str) and v
3037 and isinstance(merged[k], compat_str)
3038 and not merged[k])):
6cc62232
S
3039 merged[k] = v
3040 return merged
3041
3042
8e60dc75
S
3043def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3044 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3045
16392824 3046
a1a530b0
PH
3047US_RATINGS = {
3048 'G': 0,
3049 'PG': 10,
3050 'PG-13': 13,
3051 'R': 16,
3052 'NC': 18,
3053}
fac55558
PH
3054
3055
a8795327 3056TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
3057 'TV-Y': 0,
3058 'TV-Y7': 7,
3059 'TV-G': 0,
3060 'TV-PG': 0,
3061 'TV-14': 14,
3062 'TV-MA': 17,
a8795327
S
3063}
3064
3065
146c80e2 3066def parse_age_limit(s):
a8795327
S
3067 if type(s) == int:
3068 return s if 0 <= s <= 21 else None
3069 if not isinstance(s, compat_basestring):
d838b1bd 3070 return None
146c80e2 3071 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
3072 if m:
3073 return int(m.group('age'))
5c5fae6d 3074 s = s.upper()
a8795327
S
3075 if s in US_RATINGS:
3076 return US_RATINGS[s]
5a16c9d9 3077 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 3078 if m:
5a16c9d9 3079 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 3080 return None
146c80e2
S
3081
3082
fac55558 3083def strip_jsonp(code):
609a61e3 3084 return re.sub(
5552c9eb 3085 r'''(?sx)^
e9c671d5 3086 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
3087 (?:\s*&&\s*(?P=func_name))?
3088 \s*\(\s*(?P<callback_data>.*)\);?
3089 \s*?(?://[^\n]*)*$''',
3090 r'\g<callback_data>', code)
478c2c61
PH
3091
3092
5c610515 3093def js_to_json(code, vars={}):
3094 # vars is a dict of var, val pairs to substitute
c843e685 3095 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
4195096e
S
3096 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
3097 INTEGER_TABLE = (
3098 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
3099 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
3100 )
3101
e05f6939 3102 def fix_kv(m):
e7b6d122
PH
3103 v = m.group(0)
3104 if v in ('true', 'false', 'null'):
3105 return v
421ddcb8
C
3106 elif v in ('undefined', 'void 0'):
3107 return 'null'
8bdd16b4 3108 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
bd1e4844 3109 return ""
3110
3111 if v[0] in ("'", '"'):
3112 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 3113 '"': '\\"',
bd1e4844 3114 "\\'": "'",
3115 '\\\n': '',
3116 '\\x': '\\u00',
3117 }.get(m.group(0), m.group(0)), v[1:-1])
8bdd16b4 3118 else:
3119 for regex, base in INTEGER_TABLE:
3120 im = re.match(regex, v)
3121 if im:
3122 i = int(im.group(1), base)
3123 return '"%d":' % i if v.endswith(':') else '%d' % i
89ac4a19 3124
5c610515 3125 if v in vars:
3126 return vars[v]
3127
e7b6d122 3128 return '"%s"' % v
e05f6939 3129
bd1e4844 3130 return re.sub(r'''(?sx)
3131 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3132 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4195096e 3133 {comment}|,(?={skip}[\]}}])|
421ddcb8 3134 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
4195096e 3135 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
8bdd16b4 3136 [0-9]+(?={skip}:)|
3137 !+
4195096e 3138 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
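# Illustrative usage (example snippet assumed):
#   js_to_json("{foo: 'bar', baz: 0x1F}") -> '{"foo": "bar", "baz": 31}'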
e05f6939
PH
3139
3140
478c2c61
PH
3141def qualities(quality_ids):
3142 """ Get a numeric quality value out of a list of possible values """
3143 def q(qid):
3144 try:
3145 return quality_ids.index(qid)
3146 except ValueError:
3147 return -1
3148 return q
3149
acd69589 3150
ebed8b37 3151POSTPROCESS_WHEN = {'pre_process', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
1e43a6f7 3152
3153
de6000d9 3154DEFAULT_OUTTMPL = {
3155 'default': '%(title)s [%(id)s].%(ext)s',
72755351 3156 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
de6000d9 3157}
3158OUTTMPL_TYPES = {
72755351 3159 'chapter': None,
de6000d9 3160 'subtitle': None,
3161 'thumbnail': None,
3162 'description': 'description',
3163 'annotation': 'annotations.xml',
3164 'infojson': 'info.json',
08438d2c 3165 'link': None,
3b603dbd 3166 'pl_video': None,
5112f26a 3167 'pl_thumbnail': None,
de6000d9 3168 'pl_description': 'description',
3169 'pl_infojson': 'info.json',
3170}
0a871f68 3171
143db31d 3172# As of [1] format syntax is:
3173# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3174# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
901130bb 3175STR_FORMAT_RE_TMPL = r'''(?x)
3176 (?<!%)(?P<prefix>(?:%%)*)
143db31d 3177 %
524e2e4f 3178 (?P<has_key>\((?P<key>{0})\))?
752cda38 3179 (?P<format>
524e2e4f 3180 (?P<conversion>[#0\-+ ]+)?
3181 (?P<min_width>\d+)?
3182 (?P<precision>\.\d+)?
3183 (?P<len_mod>[hlL])? # unused in python
901130bb 3184 {1} # conversion type
752cda38 3185 )
143db31d 3186'''
3187
7d1eb38a 3188
901130bb 3189STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3190
7d1eb38a 3191
a020a0dc
PH
3192def limit_length(s, length):
3193 """ Add ellipses to overly long strings """
3194 if s is None:
3195 return None
3196 ELLIPSES = '...'
3197 if len(s) > length:
3198 return s[:length - len(ELLIPSES)] + ELLIPSES
3199 return s
48844745
PH
3200
3201
3202def version_tuple(v):
5f9b8394 3203 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
3204
3205
3206def is_outdated_version(version, limit, assume_new=True):
3207 if not version:
3208 return not assume_new
3209 try:
3210 return version_tuple(version) < version_tuple(limit)
3211 except ValueError:
3212 return not assume_new
732ea2f0
PH
3213
3214
3215def ytdl_is_updateable():
7a5c1cfe 3216 """ Returns if yt-dlp can be updated with -U """
735d865e 3217
5d535b4a 3218 from .update import is_non_updateable
732ea2f0 3219
5d535b4a 3220 return not is_non_updateable()
7d4111ed
PH
3221
3222
3223def args_to_str(args):
3224 # Get a short string representation for a subprocess command
702ccf2d 3225 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
3226
3227
9b9c5355 3228def error_to_compat_str(err):
fdae2358
S
3229 err_str = str(err)
3230 # On python 2 error byte string must be decoded with proper
3231 # encoding rather than ascii
3232 if sys.version_info[0] < 3:
3233 err_str = err_str.decode(preferredencoding())
3234 return err_str
3235
3236
c460bdd5 3237def mimetype2ext(mt):
eb9ee194
S
3238 if mt is None:
3239 return None
3240
9359f3d4
F
3241 mt, _, params = mt.partition(';')
3242 mt = mt.strip()
3243
3244 FULL_MAP = {
765ac263 3245 'audio/mp4': 'm4a',
6c33d24b
YCH
3246 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3247 # it's the most popular one
3248 'audio/mpeg': 'mp3',
ba39289d 3249 'audio/x-wav': 'wav',
9359f3d4
F
3250 'audio/wav': 'wav',
3251 'audio/wave': 'wav',
3252 }
3253
3254 ext = FULL_MAP.get(mt)
765ac263
JMF
3255 if ext is not None:
3256 return ext
3257
9359f3d4 3258 SUBTYPE_MAP = {
f6861ec9 3259 '3gpp': '3gp',
cafcf657 3260 'smptett+xml': 'tt',
cafcf657 3261 'ttaf+xml': 'dfxp',
a0d8d704 3262 'ttml+xml': 'ttml',
f6861ec9 3263 'x-flv': 'flv',
a0d8d704 3264 'x-mp4-fragmented': 'mp4',
d4f05d47 3265 'x-ms-sami': 'sami',
a0d8d704 3266 'x-ms-wmv': 'wmv',
b4173f15
RA
3267 'mpegurl': 'm3u8',
3268 'x-mpegurl': 'm3u8',
3269 'vnd.apple.mpegurl': 'm3u8',
3270 'dash+xml': 'mpd',
b4173f15 3271 'f4m+xml': 'f4m',
f164b971 3272 'hds+xml': 'f4m',
e910fe2f 3273 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 3274 'quicktime': 'mov',
98ce1a3f 3275 'mp2t': 'ts',
39e7107d 3276 'x-wav': 'wav',
9359f3d4
F
3277 'filmstrip+json': 'fs',
3278 'svg+xml': 'svg',
3279 }
3280
3281 _, _, subtype = mt.rpartition('/')
3282 ext = SUBTYPE_MAP.get(subtype.lower())
3283 if ext is not None:
3284 return ext
3285
3286 SUFFIX_MAP = {
3287 'json': 'json',
3288 'xml': 'xml',
3289 'zip': 'zip',
3290 'gzip': 'gz',
3291 }
3292
3293 _, _, suffix = subtype.partition('+')
3294 ext = SUFFIX_MAP.get(suffix)
3295 if ext is not None:
3296 return ext
3297
3298 return subtype.replace('+', '.')
c460bdd5
PH
3299
3300
2814f12b
THD
3301def ext2mimetype(ext_or_url):
3302 if not ext_or_url:
3303 return None
3304 if '.' not in ext_or_url:
3305 ext_or_url = f'file.{ext_or_url}'
3306 return mimetypes.guess_type(ext_or_url)[0]
3307
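# Usage sketch (illustrative; sample MIME types are arbitrary): the lookup falls
# through FULL_MAP -> SUBTYPE_MAP -> SUFFIX_MAP; ext2mimetype() depends on the
# platform's mimetypes database, so its output may vary.
#     mimetype2ext('audio/mp4')                            # -> 'm4a'
#     mimetype2ext('application/x-mpegurl')                # -> 'm3u8'
#     mimetype2ext('application/hal+json; charset=utf-8')  # -> 'json'
#     ext2mimetype('mp4')                                  # -> 'video/mp4' (typically)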
3308
4f3c5e06 3309def parse_codecs(codecs_str):
3310 # http://tools.ietf.org/html/rfc6381
3311 if not codecs_str:
3312 return {}
a0566bbf 3313 split_codecs = list(filter(None, map(
dbf5416a 3314 str.strip, codecs_str.strip().strip(',').split(','))))
4afa3ec4 3315 vcodec, acodec, tcodec, hdr = None, None, None, None
a0566bbf 3316 for full_codec in split_codecs:
9bd979ca 3317 parts = full_codec.split('.')
3318 codec = parts[0].replace('0', '')
3319 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3320 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
4f3c5e06 3321 if not vcodec:
b69fd25c 3322 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
176f1866 3323 if codec in ('dvh1', 'dvhe'):
3324 hdr = 'DV'
9bd979ca 3325 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3326 hdr = 'HDR10'
3327 elif full_codec.replace('0', '').startswith('vp9.2'):
176f1866 3328 hdr = 'HDR10'
b69fd25c 3329 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
4f3c5e06 3330 if not acodec:
3331 acodec = full_codec
4afa3ec4
F
3332 elif codec in ('stpp', 'wvtt',):
3333 if not tcodec:
3334 tcodec = full_codec
4f3c5e06 3335 else:
60f5c9fb 3336 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
4afa3ec4 3337 if vcodec or acodec or tcodec:
4f3c5e06 3338 return {
3339 'vcodec': vcodec or 'none',
3340 'acodec': acodec or 'none',
176f1866 3341 'dynamic_range': hdr,
4afa3ec4 3342 **({'tcodec': tcodec} if tcodec is not None else {}),
4f3c5e06 3343 }
b69fd25c 3344 elif len(split_codecs) == 2:
3345 return {
3346 'vcodec': split_codecs[0],
3347 'acodec': split_codecs[1],
3348 }
4f3c5e06 3349 return {}
3350
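# Usage sketch (illustrative; sample codec strings are arbitrary):
#     parse_codecs('avc1.64001f, mp4a.40.2')
#     # -> {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}
#     parse_codecs('dvhe.05.07')
#     # -> {'vcodec': 'dvhe.05.07', 'acodec': 'none', 'dynamic_range': 'DV'}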
3351
2ccd1b10 3352def urlhandle_detect_ext(url_handle):
79298173 3353 getheader = url_handle.headers.get
2ccd1b10 3354
b55ee18f
PH
3355 cd = getheader('Content-Disposition')
3356 if cd:
3357 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3358 if m:
3359 e = determine_ext(m.group('filename'), default_ext=None)
3360 if e:
3361 return e
3362
c460bdd5 3363 return mimetype2ext(getheader('Content-Type'))
05900629
PH
3364
3365
1e399778
YCH
3366def encode_data_uri(data, mime_type):
3367 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3368
3369
05900629 3370def age_restricted(content_limit, age_limit):
6ec6cb4e 3371 """ Returns True iff the content should be blocked """
05900629
PH
3372
3373 if age_limit is None: # No limit set
3374 return False
3375 if content_limit is None:
3376 return False # Content available for everyone
3377 return age_limit < content_limit
61ca9a80
PH
3378
3379
3380def is_html(first_bytes):
3381 """ Detect whether a file contains HTML by examining its first bytes. """
3382
3383 BOMS = [
3384 (b'\xef\xbb\xbf', 'utf-8'),
3385 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3386 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3387 (b'\xff\xfe', 'utf-16-le'),
3388 (b'\xfe\xff', 'utf-16-be'),
3389 ]
3390 for bom, enc in BOMS:
3391 if first_bytes.startswith(bom):
3392 s = first_bytes[len(bom):].decode(enc, 'replace')
3393 break
3394 else:
3395 s = first_bytes.decode('utf-8', 'replace')
3396
3397 return re.match(r'^\s*<', s)
a055469f
PH
3398
3399
3400def determine_protocol(info_dict):
3401 protocol = info_dict.get('protocol')
3402 if protocol is not None:
3403 return protocol
3404
7de837a5 3405 url = sanitize_url(info_dict['url'])
a055469f
PH
3406 if url.startswith('rtmp'):
3407 return 'rtmp'
3408 elif url.startswith('mms'):
3409 return 'mms'
3410 elif url.startswith('rtsp'):
3411 return 'rtsp'
3412
3413 ext = determine_ext(url)
3414 if ext == 'm3u8':
3415 return 'm3u8'
3416 elif ext == 'f4m':
3417 return 'f4m'
3418
3419 return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
3420
3421
c5e3f849 3422def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3423 """ Render a list of rows, each as a list of values.
3424 Text after a \t will be right aligned """
ec11a9f4 3425 def width(string):
c5e3f849 3426 return len(remove_terminal_sequences(string).replace('\t', ''))
76d321f6 3427
3428 def get_max_lens(table):
ec11a9f4 3429 return [max(width(str(v)) for v in col) for col in zip(*table)]
76d321f6 3430
3431 def filter_using_list(row, filterArray):
3432 return [col for (take, col) in zip(filterArray, row) if take]
3433
c5e3f849 3434 if hide_empty:
76d321f6 3435 max_lens = get_max_lens(data)
3436 header_row = filter_using_list(header_row, max_lens)
3437 data = [filter_using_list(row, max_lens) for row in data]
3438
cfb56d1a 3439 table = [header_row] + data
76d321f6 3440 max_lens = get_max_lens(table)
c5e3f849 3441 extra_gap += 1
76d321f6 3442 if delim:
c5e3f849 3443 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3444 table[1][-1] = table[1][-1][:-extra_gap] # Remove extra_gap from end of delimiter
ec11a9f4 3445 for row in table:
3446 for pos, text in enumerate(map(str, row)):
c5e3f849 3447 if '\t' in text:
3448 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3449 else:
3450 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3451 ret = '\n'.join(''.join(row).rstrip() for row in table)
ec11a9f4 3452 return ret
347de493
PH
3453
3454
8f18aca8 3455def _match_one(filter_part, dct, incomplete):
77b87f05 3456 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3457 STRING_OPERATORS = {
3458 '*=': operator.contains,
3459 '^=': lambda attr, value: attr.startswith(value),
3460 '$=': lambda attr, value: attr.endswith(value),
3461 '~=': lambda attr, value: re.search(value, attr),
3462 }
347de493 3463 COMPARISON_OPERATORS = {
a047eeb6 3464 **STRING_OPERATORS,
3465 '<=': operator.le, # "<=" must be defined above "<"
347de493 3466 '<': operator.lt,
347de493 3467 '>=': operator.ge,
a047eeb6 3468 '>': operator.gt,
347de493 3469 '=': operator.eq,
347de493 3470 }
a047eeb6 3471
347de493
PH
3472 operator_rex = re.compile(r'''(?x)\s*
3473 (?P<key>[a-z_]+)
77b87f05 3474 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3475 (?:
a047eeb6 3476 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3477 (?P<strval>.+?)
347de493
PH
3478 )
3479 \s*$
3480 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3481 m = operator_rex.search(filter_part)
3482 if m:
18f96d12 3483 m = m.groupdict()
3484 unnegated_op = COMPARISON_OPERATORS[m['op']]
3485 if m['negation']:
77b87f05
MT
3486 op = lambda attr, value: not unnegated_op(attr, value)
3487 else:
3488 op = unnegated_op
18f96d12 3489 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3490 if m['quote']:
3491 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3492 actual_value = dct.get(m['key'])
3493 numeric_comparison = None
3494 if isinstance(actual_value, compat_numeric_types):
e5a088dc
S
3495 # If the original field is a string and the matching comparison value is
3496 # a number, we should respect the origin of the original field
3497 # and process comparison value as a string (see
18f96d12 3498 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3499 try:
18f96d12 3500 numeric_comparison = int(comparison_value)
347de493 3501 except ValueError:
18f96d12 3502 numeric_comparison = parse_filesize(comparison_value)
3503 if numeric_comparison is None:
3504 numeric_comparison = parse_filesize(f'{comparison_value}B')
3505 if numeric_comparison is None:
3506 numeric_comparison = parse_duration(comparison_value)
3507 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3508 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3509 if actual_value is None:
18f96d12 3510 return incomplete or m['none_inclusive']
3511 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3512
3513 UNARY_OPERATORS = {
1cc47c66
S
3514 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3515 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493
PH
3516 }
3517 operator_rex = re.compile(r'''(?x)\s*
3518 (?P<op>%s)\s*(?P<key>[a-z_]+)
3519 \s*$
3520 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3521 m = operator_rex.search(filter_part)
3522 if m:
3523 op = UNARY_OPERATORS[m.group('op')]
3524 actual_value = dct.get(m.group('key'))
8f18aca8 3525 if incomplete and actual_value is None:
3526 return True
347de493
PH
3527 return op(actual_value)
3528
3529 raise ValueError('Invalid filter part %r' % filter_part)
3530
3531
8f18aca8 3532def match_str(filter_str, dct, incomplete=False):
3533 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
3534 When incomplete, all conditions passes on missing fields
3535 """
347de493 3536 return all(
8f18aca8 3537 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
a047eeb6 3538 for filter_part in re.split(r'(?<!\\)&', filter_str))
347de493
PH
3539
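# Usage sketch (illustrative; the sample dict is arbitrary): conditions are
# separated by '&'; numeric comparisons, string operators and unary presence
# checks are all handled by _match_one() above.
#     info = {'like_count': 100, 'duration': 30, 'is_live': False}
#     match_str('like_count >= 100 & duration < 60', info)   # -> True
#     match_str('!is_live', info)                             # -> True
#     match_str('duration > 60', info)                        # -> False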
3540
3541def match_filter_func(filter_str):
8f18aca8 3542 def _match_func(info_dict, *args, **kwargs):
3543 if match_str(filter_str, info_dict, *args, **kwargs):
347de493
PH
3544 return None
3545 else:
3546 video_title = info_dict.get('title', info_dict.get('id', 'video'))
3547 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
3548 return _match_func
91410c9b
PH
3549
3550
bf6427d2
YCH
3551def parse_dfxp_time_expr(time_expr):
3552 if not time_expr:
d631d5f9 3553 return
bf6427d2
YCH
3554
3555 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3556 if mobj:
3557 return float(mobj.group('time_offset'))
3558
db2fe38b 3559 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 3560 if mobj:
db2fe38b 3561 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
3562
3563
c1c924ab 3564def srt_subtitles_timecode(seconds):
aa7785f8 3565 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3566
3567
3568def ass_subtitles_timecode(seconds):
3569 time = timetuple_from_msec(seconds * 1000)
3570 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
bf6427d2
YCH
3571
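# Usage sketch (illustrative): DFXP clock values and plain second offsets both
# parse to float seconds, which the timecode helpers format for SRT/ASS output.
#     parse_dfxp_time_expr('00:01:02.500')   # -> 62.5
#     parse_dfxp_time_expr('7.25s')          # -> 7.25
#     srt_subtitles_timecode(62.5)           # -> '00:01:02,500'
#     ass_subtitles_timecode(62.5)           # -> '0:01:02.50'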
3572
3573def dfxp2srt(dfxp_data):
3869028f
YCH
3574 '''
3575 @param dfxp_data A bytes-like object containing DFXP data
3576 @returns A unicode object containing converted SRT data
3577 '''
5b995f71 3578 LEGACY_NAMESPACES = (
3869028f
YCH
3579 (b'http://www.w3.org/ns/ttml', [
3580 b'http://www.w3.org/2004/11/ttaf1',
3581 b'http://www.w3.org/2006/04/ttaf1',
3582 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 3583 ]),
3869028f
YCH
3584 (b'http://www.w3.org/ns/ttml#styling', [
3585 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
3586 ]),
3587 )
3588
3589 SUPPORTED_STYLING = [
3590 'color',
3591 'fontFamily',
3592 'fontSize',
3593 'fontStyle',
3594 'fontWeight',
3595 'textDecoration'
3596 ]
3597
4e335771 3598 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 3599 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 3600 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 3601 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 3602 })
bf6427d2 3603
5b995f71
RA
3604 styles = {}
3605 default_style = {}
3606
87de7069 3607 class TTMLPElementParser(object):
5b995f71
RA
3608 _out = ''
3609 _unclosed_elements = []
3610 _applied_styles = []
bf6427d2 3611
2b14cb56 3612 def start(self, tag, attrib):
5b995f71
RA
3613 if tag in (_x('ttml:br'), 'br'):
3614 self._out += '\n'
3615 else:
3616 unclosed_elements = []
3617 style = {}
3618 element_style_id = attrib.get('style')
3619 if default_style:
3620 style.update(default_style)
3621 if element_style_id:
3622 style.update(styles.get(element_style_id, {}))
3623 for prop in SUPPORTED_STYLING:
3624 prop_val = attrib.get(_x('tts:' + prop))
3625 if prop_val:
3626 style[prop] = prop_val
3627 if style:
3628 font = ''
3629 for k, v in sorted(style.items()):
3630 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3631 continue
3632 if k == 'color':
3633 font += ' color="%s"' % v
3634 elif k == 'fontSize':
3635 font += ' size="%s"' % v
3636 elif k == 'fontFamily':
3637 font += ' face="%s"' % v
3638 elif k == 'fontWeight' and v == 'bold':
3639 self._out += '<b>'
3640 unclosed_elements.append('b')
3641 elif k == 'fontStyle' and v == 'italic':
3642 self._out += '<i>'
3643 unclosed_elements.append('i')
3644 elif k == 'textDecoration' and v == 'underline':
3645 self._out += '<u>'
3646 unclosed_elements.append('u')
3647 if font:
3648 self._out += '<font' + font + '>'
3649 unclosed_elements.append('font')
3650 applied_style = {}
3651 if self._applied_styles:
3652 applied_style.update(self._applied_styles[-1])
3653 applied_style.update(style)
3654 self._applied_styles.append(applied_style)
3655 self._unclosed_elements.append(unclosed_elements)
bf6427d2 3656
2b14cb56 3657 def end(self, tag):
5b995f71
RA
3658 if tag not in (_x('ttml:br'), 'br'):
3659 unclosed_elements = self._unclosed_elements.pop()
3660 for element in reversed(unclosed_elements):
3661 self._out += '</%s>' % element
3662 if unclosed_elements and self._applied_styles:
3663 self._applied_styles.pop()
bf6427d2 3664
2b14cb56 3665 def data(self, data):
5b995f71 3666 self._out += data
2b14cb56 3667
3668 def close(self):
5b995f71 3669 return self._out.strip()
2b14cb56 3670
3671 def parse_node(node):
3672 target = TTMLPElementParser()
3673 parser = xml.etree.ElementTree.XMLParser(target=target)
3674 parser.feed(xml.etree.ElementTree.tostring(node))
3675 return parser.close()
bf6427d2 3676
5b995f71
RA
3677 for k, v in LEGACY_NAMESPACES:
3678 for ns in v:
3679 dfxp_data = dfxp_data.replace(ns, k)
3680
3869028f 3681 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 3682 out = []
5b995f71 3683 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
3684
3685 if not paras:
3686 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 3687
5b995f71
RA
3688 repeat = False
3689 while True:
3690 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
3691 style_id = style.get('id') or style.get(_x('xml:id'))
3692 if not style_id:
3693 continue
5b995f71
RA
3694 parent_style_id = style.get('style')
3695 if parent_style_id:
3696 if parent_style_id not in styles:
3697 repeat = True
3698 continue
3699 styles[style_id] = styles[parent_style_id].copy()
3700 for prop in SUPPORTED_STYLING:
3701 prop_val = style.get(_x('tts:' + prop))
3702 if prop_val:
3703 styles.setdefault(style_id, {})[prop] = prop_val
3704 if repeat:
3705 repeat = False
3706 else:
3707 break
3708
3709 for p in ('body', 'div'):
3710 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3711 if ele is None:
3712 continue
3713 style = styles.get(ele.get('style'))
3714 if not style:
3715 continue
3716 default_style.update(style)
3717
bf6427d2 3718 for para, index in zip(paras, itertools.count(1)):
d631d5f9 3719 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 3720 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
3721 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3722 if begin_time is None:
3723 continue
7dff0363 3724 if not end_time:
d631d5f9
YCH
3725 if not dur:
3726 continue
3727 end_time = begin_time + dur
bf6427d2
YCH
3728 out.append('%d\n%s --> %s\n%s\n\n' % (
3729 index,
c1c924ab
YCH
3730 srt_subtitles_timecode(begin_time),
3731 srt_subtitles_timecode(end_time),
bf6427d2
YCH
3732 parse_node(para)))
3733
3734 return ''.join(out)
3735
3736
66e289ba
S
3737def cli_option(params, command_option, param):
3738 param = params.get(param)
98e698f1
RA
3739 if param:
3740 param = compat_str(param)
66e289ba
S
3741 return [command_option, param] if param is not None else []
3742
3743
3744def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3745 param = params.get(param)
5b232f46
S
3746 if param is None:
3747 return []
66e289ba
S
3748 assert isinstance(param, bool)
3749 if separator:
3750 return [command_option + separator + (true_value if param else false_value)]
3751 return [command_option, true_value if param else false_value]
3752
3753
3754def cli_valueless_option(params, command_option, param, expected_value=True):
3755 param = params.get(param)
3756 return [command_option] if param == expected_value else []
3757
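# Usage sketch (illustrative; sample params and flag names are arbitrary): these
# helpers turn option values into argv fragments for external programs.
#     params = {'proxy': 'socks5://127.0.0.1:1080', 'nocheckcertificate': True}
#     cli_option(params, '--proxy', 'proxy')
#     # -> ['--proxy', 'socks5://127.0.0.1:1080']
#     cli_bool_option(params, '--no-check-certificate', 'nocheckcertificate')
#     # -> ['--no-check-certificate', 'true']
#     cli_valueless_option(params, '--no-check-certificate', 'nocheckcertificate')
#     # -> ['--no-check-certificate']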
3758
e92caff5 3759def cli_configuration_args(argdict, keys, default=[], use_compat=True):
eab9b2bc 3760 if isinstance(argdict, (list, tuple)): # for backward compatibility
e92caff5 3761 if use_compat:
5b1ecbb3 3762 return argdict
3763 else:
3764 argdict = None
eab9b2bc 3765 if argdict is None:
5b1ecbb3 3766 return default
eab9b2bc 3767 assert isinstance(argdict, dict)
3768
e92caff5 3769 assert isinstance(keys, (list, tuple))
3770 for key_list in keys:
e92caff5 3771 arg_list = list(filter(
3772 lambda x: x is not None,
6606817a 3773 [argdict.get(key.lower()) for key in variadic(key_list)]))
e92caff5 3774 if arg_list:
3775 return [arg for args in arg_list for arg in args]
3776 return default
66e289ba 3777
6251555f 3778
330690a2 3779def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3780 main_key, exe = main_key.lower(), exe.lower()
3781 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3782 keys = [f'{root_key}{k}' for k in (keys or [''])]
3783 if root_key in keys:
3784 if main_key != exe:
3785 keys.append((main_key, exe))
3786 keys.append('default')
3787 else:
3788 use_compat = False
3789 return cli_configuration_args(argdict, keys, default, use_compat)
3790
66e289ba 3791
39672624
YCH
3792class ISO639Utils(object):
3793 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3794 _lang_map = {
3795 'aa': 'aar',
3796 'ab': 'abk',
3797 'ae': 'ave',
3798 'af': 'afr',
3799 'ak': 'aka',
3800 'am': 'amh',
3801 'an': 'arg',
3802 'ar': 'ara',
3803 'as': 'asm',
3804 'av': 'ava',
3805 'ay': 'aym',
3806 'az': 'aze',
3807 'ba': 'bak',
3808 'be': 'bel',
3809 'bg': 'bul',
3810 'bh': 'bih',
3811 'bi': 'bis',
3812 'bm': 'bam',
3813 'bn': 'ben',
3814 'bo': 'bod',
3815 'br': 'bre',
3816 'bs': 'bos',
3817 'ca': 'cat',
3818 'ce': 'che',
3819 'ch': 'cha',
3820 'co': 'cos',
3821 'cr': 'cre',
3822 'cs': 'ces',
3823 'cu': 'chu',
3824 'cv': 'chv',
3825 'cy': 'cym',
3826 'da': 'dan',
3827 'de': 'deu',
3828 'dv': 'div',
3829 'dz': 'dzo',
3830 'ee': 'ewe',
3831 'el': 'ell',
3832 'en': 'eng',
3833 'eo': 'epo',
3834 'es': 'spa',
3835 'et': 'est',
3836 'eu': 'eus',
3837 'fa': 'fas',
3838 'ff': 'ful',
3839 'fi': 'fin',
3840 'fj': 'fij',
3841 'fo': 'fao',
3842 'fr': 'fra',
3843 'fy': 'fry',
3844 'ga': 'gle',
3845 'gd': 'gla',
3846 'gl': 'glg',
3847 'gn': 'grn',
3848 'gu': 'guj',
3849 'gv': 'glv',
3850 'ha': 'hau',
3851 'he': 'heb',
b7acc835 3852 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
3853 'hi': 'hin',
3854 'ho': 'hmo',
3855 'hr': 'hrv',
3856 'ht': 'hat',
3857 'hu': 'hun',
3858 'hy': 'hye',
3859 'hz': 'her',
3860 'ia': 'ina',
3861 'id': 'ind',
b7acc835 3862 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
3863 'ie': 'ile',
3864 'ig': 'ibo',
3865 'ii': 'iii',
3866 'ik': 'ipk',
3867 'io': 'ido',
3868 'is': 'isl',
3869 'it': 'ita',
3870 'iu': 'iku',
3871 'ja': 'jpn',
3872 'jv': 'jav',
3873 'ka': 'kat',
3874 'kg': 'kon',
3875 'ki': 'kik',
3876 'kj': 'kua',
3877 'kk': 'kaz',
3878 'kl': 'kal',
3879 'km': 'khm',
3880 'kn': 'kan',
3881 'ko': 'kor',
3882 'kr': 'kau',
3883 'ks': 'kas',
3884 'ku': 'kur',
3885 'kv': 'kom',
3886 'kw': 'cor',
3887 'ky': 'kir',
3888 'la': 'lat',
3889 'lb': 'ltz',
3890 'lg': 'lug',
3891 'li': 'lim',
3892 'ln': 'lin',
3893 'lo': 'lao',
3894 'lt': 'lit',
3895 'lu': 'lub',
3896 'lv': 'lav',
3897 'mg': 'mlg',
3898 'mh': 'mah',
3899 'mi': 'mri',
3900 'mk': 'mkd',
3901 'ml': 'mal',
3902 'mn': 'mon',
3903 'mr': 'mar',
3904 'ms': 'msa',
3905 'mt': 'mlt',
3906 'my': 'mya',
3907 'na': 'nau',
3908 'nb': 'nob',
3909 'nd': 'nde',
3910 'ne': 'nep',
3911 'ng': 'ndo',
3912 'nl': 'nld',
3913 'nn': 'nno',
3914 'no': 'nor',
3915 'nr': 'nbl',
3916 'nv': 'nav',
3917 'ny': 'nya',
3918 'oc': 'oci',
3919 'oj': 'oji',
3920 'om': 'orm',
3921 'or': 'ori',
3922 'os': 'oss',
3923 'pa': 'pan',
3924 'pi': 'pli',
3925 'pl': 'pol',
3926 'ps': 'pus',
3927 'pt': 'por',
3928 'qu': 'que',
3929 'rm': 'roh',
3930 'rn': 'run',
3931 'ro': 'ron',
3932 'ru': 'rus',
3933 'rw': 'kin',
3934 'sa': 'san',
3935 'sc': 'srd',
3936 'sd': 'snd',
3937 'se': 'sme',
3938 'sg': 'sag',
3939 'si': 'sin',
3940 'sk': 'slk',
3941 'sl': 'slv',
3942 'sm': 'smo',
3943 'sn': 'sna',
3944 'so': 'som',
3945 'sq': 'sqi',
3946 'sr': 'srp',
3947 'ss': 'ssw',
3948 'st': 'sot',
3949 'su': 'sun',
3950 'sv': 'swe',
3951 'sw': 'swa',
3952 'ta': 'tam',
3953 'te': 'tel',
3954 'tg': 'tgk',
3955 'th': 'tha',
3956 'ti': 'tir',
3957 'tk': 'tuk',
3958 'tl': 'tgl',
3959 'tn': 'tsn',
3960 'to': 'ton',
3961 'tr': 'tur',
3962 'ts': 'tso',
3963 'tt': 'tat',
3964 'tw': 'twi',
3965 'ty': 'tah',
3966 'ug': 'uig',
3967 'uk': 'ukr',
3968 'ur': 'urd',
3969 'uz': 'uzb',
3970 've': 'ven',
3971 'vi': 'vie',
3972 'vo': 'vol',
3973 'wa': 'wln',
3974 'wo': 'wol',
3975 'xh': 'xho',
3976 'yi': 'yid',
e9a50fba 3977 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
3978 'yo': 'yor',
3979 'za': 'zha',
3980 'zh': 'zho',
3981 'zu': 'zul',
3982 }
3983
3984 @classmethod
3985 def short2long(cls, code):
3986 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3987 return cls._lang_map.get(code[:2])
3988
3989 @classmethod
3990 def long2short(cls, code):
3991 """Convert language code from ISO 639-2/T to ISO 639-1"""
3992 for short_name, long_name in cls._lang_map.items():
3993 if long_name == code:
3994 return short_name
3995
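# Usage sketch (illustrative):
#     ISO639Utils.short2long('en')      # -> 'eng'
#     ISO639Utils.short2long('en-US')   # -> 'eng' (only the first two characters are used)
#     ISO639Utils.long2short('deu')     # -> 'de'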
3996
4eb10f66
YCH
3997class ISO3166Utils(object):
3998 # From http://data.okfn.org/data/core/country-list
3999 _country_map = {
4000 'AF': 'Afghanistan',
4001 'AX': 'Åland Islands',
4002 'AL': 'Albania',
4003 'DZ': 'Algeria',
4004 'AS': 'American Samoa',
4005 'AD': 'Andorra',
4006 'AO': 'Angola',
4007 'AI': 'Anguilla',
4008 'AQ': 'Antarctica',
4009 'AG': 'Antigua and Barbuda',
4010 'AR': 'Argentina',
4011 'AM': 'Armenia',
4012 'AW': 'Aruba',
4013 'AU': 'Australia',
4014 'AT': 'Austria',
4015 'AZ': 'Azerbaijan',
4016 'BS': 'Bahamas',
4017 'BH': 'Bahrain',
4018 'BD': 'Bangladesh',
4019 'BB': 'Barbados',
4020 'BY': 'Belarus',
4021 'BE': 'Belgium',
4022 'BZ': 'Belize',
4023 'BJ': 'Benin',
4024 'BM': 'Bermuda',
4025 'BT': 'Bhutan',
4026 'BO': 'Bolivia, Plurinational State of',
4027 'BQ': 'Bonaire, Sint Eustatius and Saba',
4028 'BA': 'Bosnia and Herzegovina',
4029 'BW': 'Botswana',
4030 'BV': 'Bouvet Island',
4031 'BR': 'Brazil',
4032 'IO': 'British Indian Ocean Territory',
4033 'BN': 'Brunei Darussalam',
4034 'BG': 'Bulgaria',
4035 'BF': 'Burkina Faso',
4036 'BI': 'Burundi',
4037 'KH': 'Cambodia',
4038 'CM': 'Cameroon',
4039 'CA': 'Canada',
4040 'CV': 'Cape Verde',
4041 'KY': 'Cayman Islands',
4042 'CF': 'Central African Republic',
4043 'TD': 'Chad',
4044 'CL': 'Chile',
4045 'CN': 'China',
4046 'CX': 'Christmas Island',
4047 'CC': 'Cocos (Keeling) Islands',
4048 'CO': 'Colombia',
4049 'KM': 'Comoros',
4050 'CG': 'Congo',
4051 'CD': 'Congo, the Democratic Republic of the',
4052 'CK': 'Cook Islands',
4053 'CR': 'Costa Rica',
4054 'CI': 'Côte d\'Ivoire',
4055 'HR': 'Croatia',
4056 'CU': 'Cuba',
4057 'CW': 'Curaçao',
4058 'CY': 'Cyprus',
4059 'CZ': 'Czech Republic',
4060 'DK': 'Denmark',
4061 'DJ': 'Djibouti',
4062 'DM': 'Dominica',
4063 'DO': 'Dominican Republic',
4064 'EC': 'Ecuador',
4065 'EG': 'Egypt',
4066 'SV': 'El Salvador',
4067 'GQ': 'Equatorial Guinea',
4068 'ER': 'Eritrea',
4069 'EE': 'Estonia',
4070 'ET': 'Ethiopia',
4071 'FK': 'Falkland Islands (Malvinas)',
4072 'FO': 'Faroe Islands',
4073 'FJ': 'Fiji',
4074 'FI': 'Finland',
4075 'FR': 'France',
4076 'GF': 'French Guiana',
4077 'PF': 'French Polynesia',
4078 'TF': 'French Southern Territories',
4079 'GA': 'Gabon',
4080 'GM': 'Gambia',
4081 'GE': 'Georgia',
4082 'DE': 'Germany',
4083 'GH': 'Ghana',
4084 'GI': 'Gibraltar',
4085 'GR': 'Greece',
4086 'GL': 'Greenland',
4087 'GD': 'Grenada',
4088 'GP': 'Guadeloupe',
4089 'GU': 'Guam',
4090 'GT': 'Guatemala',
4091 'GG': 'Guernsey',
4092 'GN': 'Guinea',
4093 'GW': 'Guinea-Bissau',
4094 'GY': 'Guyana',
4095 'HT': 'Haiti',
4096 'HM': 'Heard Island and McDonald Islands',
4097 'VA': 'Holy See (Vatican City State)',
4098 'HN': 'Honduras',
4099 'HK': 'Hong Kong',
4100 'HU': 'Hungary',
4101 'IS': 'Iceland',
4102 'IN': 'India',
4103 'ID': 'Indonesia',
4104 'IR': 'Iran, Islamic Republic of',
4105 'IQ': 'Iraq',
4106 'IE': 'Ireland',
4107 'IM': 'Isle of Man',
4108 'IL': 'Israel',
4109 'IT': 'Italy',
4110 'JM': 'Jamaica',
4111 'JP': 'Japan',
4112 'JE': 'Jersey',
4113 'JO': 'Jordan',
4114 'KZ': 'Kazakhstan',
4115 'KE': 'Kenya',
4116 'KI': 'Kiribati',
4117 'KP': 'Korea, Democratic People\'s Republic of',
4118 'KR': 'Korea, Republic of',
4119 'KW': 'Kuwait',
4120 'KG': 'Kyrgyzstan',
4121 'LA': 'Lao People\'s Democratic Republic',
4122 'LV': 'Latvia',
4123 'LB': 'Lebanon',
4124 'LS': 'Lesotho',
4125 'LR': 'Liberia',
4126 'LY': 'Libya',
4127 'LI': 'Liechtenstein',
4128 'LT': 'Lithuania',
4129 'LU': 'Luxembourg',
4130 'MO': 'Macao',
4131 'MK': 'Macedonia, the Former Yugoslav Republic of',
4132 'MG': 'Madagascar',
4133 'MW': 'Malawi',
4134 'MY': 'Malaysia',
4135 'MV': 'Maldives',
4136 'ML': 'Mali',
4137 'MT': 'Malta',
4138 'MH': 'Marshall Islands',
4139 'MQ': 'Martinique',
4140 'MR': 'Mauritania',
4141 'MU': 'Mauritius',
4142 'YT': 'Mayotte',
4143 'MX': 'Mexico',
4144 'FM': 'Micronesia, Federated States of',
4145 'MD': 'Moldova, Republic of',
4146 'MC': 'Monaco',
4147 'MN': 'Mongolia',
4148 'ME': 'Montenegro',
4149 'MS': 'Montserrat',
4150 'MA': 'Morocco',
4151 'MZ': 'Mozambique',
4152 'MM': 'Myanmar',
4153 'NA': 'Namibia',
4154 'NR': 'Nauru',
4155 'NP': 'Nepal',
4156 'NL': 'Netherlands',
4157 'NC': 'New Caledonia',
4158 'NZ': 'New Zealand',
4159 'NI': 'Nicaragua',
4160 'NE': 'Niger',
4161 'NG': 'Nigeria',
4162 'NU': 'Niue',
4163 'NF': 'Norfolk Island',
4164 'MP': 'Northern Mariana Islands',
4165 'NO': 'Norway',
4166 'OM': 'Oman',
4167 'PK': 'Pakistan',
4168 'PW': 'Palau',
4169 'PS': 'Palestine, State of',
4170 'PA': 'Panama',
4171 'PG': 'Papua New Guinea',
4172 'PY': 'Paraguay',
4173 'PE': 'Peru',
4174 'PH': 'Philippines',
4175 'PN': 'Pitcairn',
4176 'PL': 'Poland',
4177 'PT': 'Portugal',
4178 'PR': 'Puerto Rico',
4179 'QA': 'Qatar',
4180 'RE': 'Réunion',
4181 'RO': 'Romania',
4182 'RU': 'Russian Federation',
4183 'RW': 'Rwanda',
4184 'BL': 'Saint Barthélemy',
4185 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4186 'KN': 'Saint Kitts and Nevis',
4187 'LC': 'Saint Lucia',
4188 'MF': 'Saint Martin (French part)',
4189 'PM': 'Saint Pierre and Miquelon',
4190 'VC': 'Saint Vincent and the Grenadines',
4191 'WS': 'Samoa',
4192 'SM': 'San Marino',
4193 'ST': 'Sao Tome and Principe',
4194 'SA': 'Saudi Arabia',
4195 'SN': 'Senegal',
4196 'RS': 'Serbia',
4197 'SC': 'Seychelles',
4198 'SL': 'Sierra Leone',
4199 'SG': 'Singapore',
4200 'SX': 'Sint Maarten (Dutch part)',
4201 'SK': 'Slovakia',
4202 'SI': 'Slovenia',
4203 'SB': 'Solomon Islands',
4204 'SO': 'Somalia',
4205 'ZA': 'South Africa',
4206 'GS': 'South Georgia and the South Sandwich Islands',
4207 'SS': 'South Sudan',
4208 'ES': 'Spain',
4209 'LK': 'Sri Lanka',
4210 'SD': 'Sudan',
4211 'SR': 'Suriname',
4212 'SJ': 'Svalbard and Jan Mayen',
4213 'SZ': 'Swaziland',
4214 'SE': 'Sweden',
4215 'CH': 'Switzerland',
4216 'SY': 'Syrian Arab Republic',
4217 'TW': 'Taiwan, Province of China',
4218 'TJ': 'Tajikistan',
4219 'TZ': 'Tanzania, United Republic of',
4220 'TH': 'Thailand',
4221 'TL': 'Timor-Leste',
4222 'TG': 'Togo',
4223 'TK': 'Tokelau',
4224 'TO': 'Tonga',
4225 'TT': 'Trinidad and Tobago',
4226 'TN': 'Tunisia',
4227 'TR': 'Turkey',
4228 'TM': 'Turkmenistan',
4229 'TC': 'Turks and Caicos Islands',
4230 'TV': 'Tuvalu',
4231 'UG': 'Uganda',
4232 'UA': 'Ukraine',
4233 'AE': 'United Arab Emirates',
4234 'GB': 'United Kingdom',
4235 'US': 'United States',
4236 'UM': 'United States Minor Outlying Islands',
4237 'UY': 'Uruguay',
4238 'UZ': 'Uzbekistan',
4239 'VU': 'Vanuatu',
4240 'VE': 'Venezuela, Bolivarian Republic of',
4241 'VN': 'Viet Nam',
4242 'VG': 'Virgin Islands, British',
4243 'VI': 'Virgin Islands, U.S.',
4244 'WF': 'Wallis and Futuna',
4245 'EH': 'Western Sahara',
4246 'YE': 'Yemen',
4247 'ZM': 'Zambia',
4248 'ZW': 'Zimbabwe',
4249 }
4250
4251 @classmethod
4252 def short2full(cls, code):
4253 """Convert an ISO 3166-2 country code to the corresponding full name"""
4254 return cls._country_map.get(code.upper())
4255
4256
773f291d
S
4257class GeoUtils(object):
4258 # Major IPv4 address blocks per country
4259 _country_ip_map = {
53896ca5 4260 'AD': '46.172.224.0/19',
773f291d
S
4261 'AE': '94.200.0.0/13',
4262 'AF': '149.54.0.0/17',
4263 'AG': '209.59.64.0/18',
4264 'AI': '204.14.248.0/21',
4265 'AL': '46.99.0.0/16',
4266 'AM': '46.70.0.0/15',
4267 'AO': '105.168.0.0/13',
53896ca5
S
4268 'AP': '182.50.184.0/21',
4269 'AQ': '23.154.160.0/24',
773f291d
S
4270 'AR': '181.0.0.0/12',
4271 'AS': '202.70.112.0/20',
53896ca5 4272 'AT': '77.116.0.0/14',
773f291d
S
4273 'AU': '1.128.0.0/11',
4274 'AW': '181.41.0.0/18',
53896ca5
S
4275 'AX': '185.217.4.0/22',
4276 'AZ': '5.197.0.0/16',
773f291d
S
4277 'BA': '31.176.128.0/17',
4278 'BB': '65.48.128.0/17',
4279 'BD': '114.130.0.0/16',
4280 'BE': '57.0.0.0/8',
53896ca5 4281 'BF': '102.178.0.0/15',
773f291d
S
4282 'BG': '95.42.0.0/15',
4283 'BH': '37.131.0.0/17',
4284 'BI': '154.117.192.0/18',
4285 'BJ': '137.255.0.0/16',
53896ca5 4286 'BL': '185.212.72.0/23',
773f291d
S
4287 'BM': '196.12.64.0/18',
4288 'BN': '156.31.0.0/16',
4289 'BO': '161.56.0.0/16',
4290 'BQ': '161.0.80.0/20',
53896ca5 4291 'BR': '191.128.0.0/12',
773f291d
S
4292 'BS': '24.51.64.0/18',
4293 'BT': '119.2.96.0/19',
4294 'BW': '168.167.0.0/16',
4295 'BY': '178.120.0.0/13',
4296 'BZ': '179.42.192.0/18',
4297 'CA': '99.224.0.0/11',
4298 'CD': '41.243.0.0/16',
53896ca5
S
4299 'CF': '197.242.176.0/21',
4300 'CG': '160.113.0.0/16',
773f291d 4301 'CH': '85.0.0.0/13',
53896ca5 4302 'CI': '102.136.0.0/14',
773f291d
S
4303 'CK': '202.65.32.0/19',
4304 'CL': '152.172.0.0/14',
53896ca5 4305 'CM': '102.244.0.0/14',
773f291d
S
4306 'CN': '36.128.0.0/10',
4307 'CO': '181.240.0.0/12',
4308 'CR': '201.192.0.0/12',
4309 'CU': '152.206.0.0/15',
4310 'CV': '165.90.96.0/19',
4311 'CW': '190.88.128.0/17',
53896ca5 4312 'CY': '31.153.0.0/16',
773f291d
S
4313 'CZ': '88.100.0.0/14',
4314 'DE': '53.0.0.0/8',
4315 'DJ': '197.241.0.0/17',
4316 'DK': '87.48.0.0/12',
4317 'DM': '192.243.48.0/20',
4318 'DO': '152.166.0.0/15',
4319 'DZ': '41.96.0.0/12',
4320 'EC': '186.68.0.0/15',
4321 'EE': '90.190.0.0/15',
4322 'EG': '156.160.0.0/11',
4323 'ER': '196.200.96.0/20',
4324 'ES': '88.0.0.0/11',
4325 'ET': '196.188.0.0/14',
4326 'EU': '2.16.0.0/13',
4327 'FI': '91.152.0.0/13',
4328 'FJ': '144.120.0.0/16',
53896ca5 4329 'FK': '80.73.208.0/21',
773f291d
S
4330 'FM': '119.252.112.0/20',
4331 'FO': '88.85.32.0/19',
4332 'FR': '90.0.0.0/9',
4333 'GA': '41.158.0.0/15',
4334 'GB': '25.0.0.0/8',
4335 'GD': '74.122.88.0/21',
4336 'GE': '31.146.0.0/16',
4337 'GF': '161.22.64.0/18',
4338 'GG': '62.68.160.0/19',
53896ca5
S
4339 'GH': '154.160.0.0/12',
4340 'GI': '95.164.0.0/16',
773f291d
S
4341 'GL': '88.83.0.0/19',
4342 'GM': '160.182.0.0/15',
4343 'GN': '197.149.192.0/18',
4344 'GP': '104.250.0.0/19',
4345 'GQ': '105.235.224.0/20',
4346 'GR': '94.64.0.0/13',
4347 'GT': '168.234.0.0/16',
4348 'GU': '168.123.0.0/16',
4349 'GW': '197.214.80.0/20',
4350 'GY': '181.41.64.0/18',
4351 'HK': '113.252.0.0/14',
4352 'HN': '181.210.0.0/16',
4353 'HR': '93.136.0.0/13',
4354 'HT': '148.102.128.0/17',
4355 'HU': '84.0.0.0/14',
4356 'ID': '39.192.0.0/10',
4357 'IE': '87.32.0.0/12',
4358 'IL': '79.176.0.0/13',
4359 'IM': '5.62.80.0/20',
4360 'IN': '117.192.0.0/10',
4361 'IO': '203.83.48.0/21',
4362 'IQ': '37.236.0.0/14',
4363 'IR': '2.176.0.0/12',
4364 'IS': '82.221.0.0/16',
4365 'IT': '79.0.0.0/10',
4366 'JE': '87.244.64.0/18',
4367 'JM': '72.27.0.0/17',
4368 'JO': '176.29.0.0/16',
53896ca5 4369 'JP': '133.0.0.0/8',
773f291d
S
4370 'KE': '105.48.0.0/12',
4371 'KG': '158.181.128.0/17',
4372 'KH': '36.37.128.0/17',
4373 'KI': '103.25.140.0/22',
4374 'KM': '197.255.224.0/20',
53896ca5 4375 'KN': '198.167.192.0/19',
773f291d
S
4376 'KP': '175.45.176.0/22',
4377 'KR': '175.192.0.0/10',
4378 'KW': '37.36.0.0/14',
4379 'KY': '64.96.0.0/15',
4380 'KZ': '2.72.0.0/13',
4381 'LA': '115.84.64.0/18',
4382 'LB': '178.135.0.0/16',
53896ca5 4383 'LC': '24.92.144.0/20',
773f291d
S
4384 'LI': '82.117.0.0/19',
4385 'LK': '112.134.0.0/15',
53896ca5 4386 'LR': '102.183.0.0/16',
773f291d
S
4387 'LS': '129.232.0.0/17',
4388 'LT': '78.56.0.0/13',
4389 'LU': '188.42.0.0/16',
4390 'LV': '46.109.0.0/16',
4391 'LY': '41.252.0.0/14',
4392 'MA': '105.128.0.0/11',
4393 'MC': '88.209.64.0/18',
4394 'MD': '37.246.0.0/16',
4395 'ME': '178.175.0.0/17',
4396 'MF': '74.112.232.0/21',
4397 'MG': '154.126.0.0/17',
4398 'MH': '117.103.88.0/21',
4399 'MK': '77.28.0.0/15',
4400 'ML': '154.118.128.0/18',
4401 'MM': '37.111.0.0/17',
4402 'MN': '49.0.128.0/17',
4403 'MO': '60.246.0.0/16',
4404 'MP': '202.88.64.0/20',
4405 'MQ': '109.203.224.0/19',
4406 'MR': '41.188.64.0/18',
4407 'MS': '208.90.112.0/22',
4408 'MT': '46.11.0.0/16',
4409 'MU': '105.16.0.0/12',
4410 'MV': '27.114.128.0/18',
53896ca5 4411 'MW': '102.70.0.0/15',
773f291d
S
4412 'MX': '187.192.0.0/11',
4413 'MY': '175.136.0.0/13',
4414 'MZ': '197.218.0.0/15',
4415 'NA': '41.182.0.0/16',
4416 'NC': '101.101.0.0/18',
4417 'NE': '197.214.0.0/18',
4418 'NF': '203.17.240.0/22',
4419 'NG': '105.112.0.0/12',
4420 'NI': '186.76.0.0/15',
4421 'NL': '145.96.0.0/11',
4422 'NO': '84.208.0.0/13',
4423 'NP': '36.252.0.0/15',
4424 'NR': '203.98.224.0/19',
4425 'NU': '49.156.48.0/22',
4426 'NZ': '49.224.0.0/14',
4427 'OM': '5.36.0.0/15',
4428 'PA': '186.72.0.0/15',
4429 'PE': '186.160.0.0/14',
4430 'PF': '123.50.64.0/18',
4431 'PG': '124.240.192.0/19',
4432 'PH': '49.144.0.0/13',
4433 'PK': '39.32.0.0/11',
4434 'PL': '83.0.0.0/11',
4435 'PM': '70.36.0.0/20',
4436 'PR': '66.50.0.0/16',
4437 'PS': '188.161.0.0/16',
4438 'PT': '85.240.0.0/13',
4439 'PW': '202.124.224.0/20',
4440 'PY': '181.120.0.0/14',
4441 'QA': '37.210.0.0/15',
53896ca5 4442 'RE': '102.35.0.0/16',
773f291d 4443 'RO': '79.112.0.0/13',
53896ca5 4444 'RS': '93.86.0.0/15',
773f291d 4445 'RU': '5.136.0.0/13',
53896ca5 4446 'RW': '41.186.0.0/16',
773f291d
S
4447 'SA': '188.48.0.0/13',
4448 'SB': '202.1.160.0/19',
4449 'SC': '154.192.0.0/11',
53896ca5 4450 'SD': '102.120.0.0/13',
773f291d 4451 'SE': '78.64.0.0/12',
53896ca5 4452 'SG': '8.128.0.0/10',
773f291d
S
4453 'SI': '188.196.0.0/14',
4454 'SK': '78.98.0.0/15',
53896ca5 4455 'SL': '102.143.0.0/17',
773f291d
S
4456 'SM': '89.186.32.0/19',
4457 'SN': '41.82.0.0/15',
53896ca5 4458 'SO': '154.115.192.0/18',
773f291d
S
4459 'SR': '186.179.128.0/17',
4460 'SS': '105.235.208.0/21',
4461 'ST': '197.159.160.0/19',
4462 'SV': '168.243.0.0/16',
4463 'SX': '190.102.0.0/20',
4464 'SY': '5.0.0.0/16',
4465 'SZ': '41.84.224.0/19',
4466 'TC': '65.255.48.0/20',
4467 'TD': '154.68.128.0/19',
4468 'TG': '196.168.0.0/14',
4469 'TH': '171.96.0.0/13',
4470 'TJ': '85.9.128.0/18',
4471 'TK': '27.96.24.0/21',
4472 'TL': '180.189.160.0/20',
4473 'TM': '95.85.96.0/19',
4474 'TN': '197.0.0.0/11',
4475 'TO': '175.176.144.0/21',
4476 'TR': '78.160.0.0/11',
4477 'TT': '186.44.0.0/15',
4478 'TV': '202.2.96.0/19',
4479 'TW': '120.96.0.0/11',
4480 'TZ': '156.156.0.0/14',
53896ca5
S
4481 'UA': '37.52.0.0/14',
4482 'UG': '102.80.0.0/13',
4483 'US': '6.0.0.0/8',
773f291d 4484 'UY': '167.56.0.0/13',
53896ca5 4485 'UZ': '84.54.64.0/18',
773f291d 4486 'VA': '212.77.0.0/19',
53896ca5 4487 'VC': '207.191.240.0/21',
773f291d 4488 'VE': '186.88.0.0/13',
53896ca5 4489 'VG': '66.81.192.0/20',
773f291d
S
4490 'VI': '146.226.0.0/16',
4491 'VN': '14.160.0.0/11',
4492 'VU': '202.80.32.0/20',
4493 'WF': '117.20.32.0/21',
4494 'WS': '202.4.32.0/19',
4495 'YE': '134.35.0.0/16',
4496 'YT': '41.242.116.0/22',
4497 'ZA': '41.0.0.0/11',
53896ca5
S
4498 'ZM': '102.144.0.0/13',
4499 'ZW': '102.177.192.0/18',
773f291d
S
4500 }
4501
4502 @classmethod
5f95927a
S
4503 def random_ipv4(cls, code_or_block):
4504 if len(code_or_block) == 2:
4505 block = cls._country_ip_map.get(code_or_block.upper())
4506 if not block:
4507 return None
4508 else:
4509 block = code_or_block
773f291d
S
4510 addr, preflen = block.split('/')
4511 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4512 addr_max = addr_min | (0xffffffff >> int(preflen))
18a0defa 4513 return compat_str(socket.inet_ntoa(
4248dad9 4514 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
4515
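# Usage sketch (illustrative; returned addresses are random within the block):
#     GeoUtils.random_ipv4('DE')              # e.g. '53.123.45.67' (inside 53.0.0.0/8)
#     GeoUtils.random_ipv4('192.168.0.0/16')  # e.g. '192.168.200.13'
#     GeoUtils.random_ipv4('XX')              # -> None (unknown country code)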
4516
91410c9b 4517class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2461f79d
PH
4518 def __init__(self, proxies=None):
4519 # Set default handlers
4520 for type in ('http', 'https'):
4521 setattr(self, '%s_open' % type,
4522 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4523 meth(r, proxy, type))
38e87f6c 4524 compat_urllib_request.ProxyHandler.__init__(self, proxies)
2461f79d 4525
91410c9b 4526 def proxy_open(self, req, proxy, type):
2461f79d 4527 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
4528 if req_proxy is not None:
4529 proxy = req_proxy
2461f79d
PH
4530 del req.headers['Ytdl-request-proxy']
4531
4532 if proxy == '__noproxy__':
4533 return None # No Proxy
51fb4995 4534 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188 4535 req.add_header('Ytdl-socks-proxy', proxy)
7a5c1cfe 4536 # yt-dlp's http/https handlers do the wrapping of the socket with SOCKS
71aff188 4537 return None
91410c9b
PH
4538 return compat_urllib_request.ProxyHandler.proxy_open(
4539 self, req, proxy, type)
5bc880b9
YCH
4540
4541
0a5445dd
YCH
4542# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4543# released into Public Domain
4544# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4545
4546def long_to_bytes(n, blocksize=0):
4547 """long_to_bytes(n:long, blocksize:int) : string
4548 Convert a long integer to a byte string.
4549
4550 If optional blocksize is given and greater than zero, pad the front of the
4551 byte string with binary zeros so that the length is a multiple of
4552 blocksize.
4553 """
4554 # after much testing, this algorithm was deemed to be the fastest
4555 s = b''
4556 n = int(n)
4557 while n > 0:
4558 s = compat_struct_pack('>I', n & 0xffffffff) + s
4559 n = n >> 32
4560 # strip off leading zeros
4561 for i in range(len(s)):
4562 if s[i] != b'\000'[0]:
4563 break
4564 else:
4565 # only happens when n == 0
4566 s = b'\000'
4567 i = 0
4568 s = s[i:]
4569 # add back some pad bytes. this could be done more efficiently w.r.t. the
4570 # de-padding being done above, but sigh...
4571 if blocksize > 0 and len(s) % blocksize:
4572 s = (blocksize - len(s) % blocksize) * b'\000' + s
4573 return s
4574
4575
4576def bytes_to_long(s):
4577 """bytes_to_long(string) : long
4578 Convert a byte string to a long integer.
4579
4580 This is (essentially) the inverse of long_to_bytes().
4581 """
4582 acc = 0
4583 length = len(s)
4584 if length % 4:
4585 extra = (4 - length % 4)
4586 s = b'\000' * extra + s
4587 length = length + extra
4588 for i in range(0, length, 4):
4589 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4590 return acc
4591
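# Usage sketch (illustrative): the two helpers are inverses; blocksize pads the
# front with zero bytes.
#     bytes_to_long(b'\x01\x00')         # -> 256
#     long_to_bytes(256)                 # -> b'\x01\x00'
#     long_to_bytes(256, blocksize=4)    # -> b'\x00\x00\x01\x00'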
4592
5bc880b9
YCH
4593def ohdave_rsa_encrypt(data, exponent, modulus):
4594 '''
4595 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4596
4597 Input:
4598 data: data to encrypt, bytes-like object
4599 exponent, modulus: parameter e and N of RSA algorithm, both integer
4600 Output: hex string of encrypted data
4601
4602 Limitation: supports one block encryption only
4603 '''
4604
4605 payload = int(binascii.hexlify(data[::-1]), 16)
4606 encrypted = pow(payload, exponent, modulus)
4607 return '%x' % encrypted
81bdc8fd
YCH
4608
4609
f48409c7
YCH
4610def pkcs1pad(data, length):
4611 """
4612 Padding input data with PKCS#1 scheme
4613
4614 @param {int[]} data input data
4615 @param {int} length target length
4616 @returns {int[]} padded data
4617 """
4618 if len(data) > length - 11:
4619 raise ValueError('Input data too long for PKCS#1 padding')
4620
4621 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4622 return [0, 2] + pseudo_random + [0] + data
4623
4624
5eb6bdce 4625def encode_base_n(num, n, table=None):
59f898b7 4626 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
59f898b7
YCH
4627 if not table:
4628 table = FULL_TABLE[:n]
4629
5eb6bdce
YCH
4630 if n > len(table):
4631 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4632
4633 if num == 0:
4634 return table[0]
4635
81bdc8fd
YCH
4636 ret = ''
4637 while num:
4638 ret = table[num % n] + ret
4639 num = num // n
4640 return ret
f52354a8
YCH
4641
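# Usage sketch (illustrative): the default table starts with 0-9a-z, so small
# bases behave like familiar positional notation; decode_packed_codes() below
# uses this to rebuild the symbol table of eval-packed JavaScript.
#     encode_base_n(255, 16)   # -> 'ff'
#     encode_base_n(62, 62)    # -> '10'
#     encode_base_n(0, 36)     # -> '0'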
4642
4643def decode_packed_codes(code):
06b3fe29 4644 mobj = re.search(PACKED_CODES_RE, code)
a0566bbf 4645 obfuscated_code, base, count, symbols = mobj.groups()
f52354a8
YCH
4646 base = int(base)
4647 count = int(count)
4648 symbols = symbols.split('|')
4649 symbol_table = {}
4650
4651 while count:
4652 count -= 1
5eb6bdce 4653 base_n_count = encode_base_n(count, base)
f52354a8
YCH
4654 symbol_table[base_n_count] = symbols[count] or base_n_count
4655
4656 return re.sub(
4657 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
a0566bbf 4658 obfuscated_code)
e154c651 4659
4660
1ced2221
S
4661def caesar(s, alphabet, shift):
4662 if shift == 0:
4663 return s
4664 l = len(alphabet)
4665 return ''.join(
4666 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4667 for c in s)
4668
4669
4670def rot47(s):
4671 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4672
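# Usage sketch (illustrative): caesar() only shifts characters present in the
# given alphabet; rot47() covers the 94 printable ASCII characters and is its
# own inverse.
#     caesar('abc xyz', 'abcdefghijklmnopqrstuvwxyz', 2)   # -> 'cde zab'
#     rot47('Hello')                                       # -> 'w6==@'
#     rot47(rot47('Hello'))                                # -> 'Hello'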
4673
e154c651 4674def parse_m3u8_attributes(attrib):
4675 info = {}
4676 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4677 if val.startswith('"'):
4678 val = val[1:-1]
4679 info[key] = val
4680 return info
1143535d
YCH
4681
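# Usage sketch (illustrative; the sample attribute line is arbitrary, e.g. from
# an #EXT-X-STREAM-INF tag): quoted values have their surrounding quotes stripped.
#     parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.64001f",RESOLUTION=1280x720')
#     # -> {'BANDWIDTH': '1280000',
#     #     'CODECS': 'mp4a.40.2,avc1.64001f',
#     #     'RESOLUTION': '1280x720'}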
4682
4683def urshift(val, n):
4684 return val >> n if val >= 0 else (val + 0x100000000) >> n
d3f8e038
YCH
4685
4686
4687# Based on png2str() written by @gdkchan and improved by @yokrysty
067aa17e 4688# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
d3f8e038
YCH
4689def decode_png(png_data):
4690 # Reference: https://www.w3.org/TR/PNG/
4691 header = png_data[8:]
4692
4693 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4694 raise IOError('Not a valid PNG file.')
4695
4696 int_map = {1: '>B', 2: '>H', 4: '>I'}
4697 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4698
4699 chunks = []
4700
4701 while header:
4702 length = unpack_integer(header[:4])
4703 header = header[4:]
4704
4705 chunk_type = header[:4]
4706 header = header[4:]
4707
4708 chunk_data = header[:length]
4709 header = header[length:]
4710
4711 header = header[4:] # Skip CRC
4712
4713 chunks.append({
4714 'type': chunk_type,
4715 'length': length,
4716 'data': chunk_data
4717 })
4718
4719 ihdr = chunks[0]['data']
4720
4721 width = unpack_integer(ihdr[:4])
4722 height = unpack_integer(ihdr[4:8])
4723
4724 idat = b''
4725
4726 for chunk in chunks:
4727 if chunk['type'] == b'IDAT':
4728 idat += chunk['data']
4729
4730 if not idat:
4731 raise IOError('Unable to read PNG data.')
4732
4733 decompressed_data = bytearray(zlib.decompress(idat))
4734
4735 stride = width * 3
4736 pixels = []
4737
4738 def _get_pixel(idx):
4739 x = idx % stride
4740 y = idx // stride
4741 return pixels[y][x]
4742
4743 for y in range(height):
4744 basePos = y * (1 + stride)
4745 filter_type = decompressed_data[basePos]
4746
4747 current_row = []
4748
4749 pixels.append(current_row)
4750
4751 for x in range(stride):
4752 color = decompressed_data[1 + basePos + x]
4753 basex = y * stride + x
4754 left = 0
4755 up = 0
4756
4757 if x > 2:
4758 left = _get_pixel(basex - 3)
4759 if y > 0:
4760 up = _get_pixel(basex - stride)
4761
4762 if filter_type == 1: # Sub
4763 color = (color + left) & 0xff
4764 elif filter_type == 2: # Up
4765 color = (color + up) & 0xff
4766 elif filter_type == 3: # Average
4767 color = (color + ((left + up) >> 1)) & 0xff
4768 elif filter_type == 4: # Paeth
4769 a = left
4770 b = up
4771 c = 0
4772
4773 if x > 2 and y > 0:
4774 c = _get_pixel(basex - stride - 3)
4775
4776 p = a + b - c
4777
4778 pa = abs(p - a)
4779 pb = abs(p - b)
4780 pc = abs(p - c)
4781
4782 if pa <= pb and pa <= pc:
4783 color = (color + a) & 0xff
4784 elif pb <= pc:
4785 color = (color + b) & 0xff
4786 else:
4787 color = (color + c) & 0xff
4788
4789 current_row.append(color)
4790
4791 return width, height, pixels
efa97bdc
YCH
4792
4793
4794def write_xattr(path, key, value):
4795 # This mess below finds the best xattr tool for the job
4796 try:
4797 # try the pyxattr module...
4798 import xattr
4799
53a7e3d2
YCH
4800 if hasattr(xattr, 'set'): # pyxattr
4801 # Unicode arguments are not supported in python-pyxattr until
4802 # version 0.5.0
067aa17e 4803 # See https://github.com/ytdl-org/youtube-dl/issues/5498
53a7e3d2
YCH
4804 pyxattr_required_version = '0.5.0'
4805 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
4806 # TODO: fallback to CLI tools
4807 raise XAttrUnavailableError(
4808 'python-pyxattr is detected but is too old. '
7a5c1cfe 4809 'yt-dlp requires %s or above while your version is %s. '
53a7e3d2
YCH
4810 'Falling back to other xattr implementations' % (
4811 pyxattr_required_version, xattr.__version__))
4812
4813 setxattr = xattr.set
4814 else: # xattr
4815 setxattr = xattr.setxattr
efa97bdc
YCH
4816
4817 try:
53a7e3d2 4818 setxattr(path, key, value)
efa97bdc
YCH
4819 except EnvironmentError as e:
4820 raise XAttrMetadataError(e.errno, e.strerror)
4821
4822 except ImportError:
4823 if compat_os_name == 'nt':
4824 # Write xattrs to NTFS Alternate Data Streams:
4825 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4826 assert ':' not in key
4827 assert os.path.exists(path)
4828
4829 ads_fn = path + ':' + key
4830 try:
4831 with open(ads_fn, 'wb') as f:
4832 f.write(value)
4833 except EnvironmentError as e:
4834 raise XAttrMetadataError(e.errno, e.strerror)
4835 else:
4836 user_has_setfattr = check_executable('setfattr', ['--version'])
4837 user_has_xattr = check_executable('xattr', ['-h'])
4838
4839 if user_has_setfattr or user_has_xattr:
4840
4841 value = value.decode('utf-8')
4842 if user_has_setfattr:
4843 executable = 'setfattr'
4844 opts = ['-n', key, '-v', value]
4845 elif user_has_xattr:
4846 executable = 'xattr'
4847 opts = ['-w', key, value]
4848
3089bc74
S
4849 cmd = ([encodeFilename(executable, True)]
4850 + [encodeArgument(o) for o in opts]
4851 + [encodeFilename(path, True)])
efa97bdc
YCH
4852
4853 try:
d3c93ec2 4854 p = Popen(
efa97bdc
YCH
4855 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4856 except EnvironmentError as e:
4857 raise XAttrMetadataError(e.errno, e.strerror)
d3c93ec2 4858 stdout, stderr = p.communicate_or_kill()
efa97bdc
YCH
4859 stderr = stderr.decode('utf-8', 'replace')
4860 if p.returncode != 0:
4861 raise XAttrMetadataError(p.returncode, stderr)
4862
4863 else:
4864 # On Unix, but we can't find pyxattr, setfattr, or xattr.
4865 if sys.platform.startswith('linux'):
4866 raise XAttrUnavailableError(
4867 "Couldn't find a tool to set the xattrs. "
4868 "Install either the python 'pyxattr' or 'xattr' "
4869 "modules, or the GNU 'attr' package "
4870 "(which contains the 'setfattr' tool).")
4871 else:
4872 raise XAttrUnavailableError(
4873 "Couldn't find a tool to set the xattrs. "
4874 "Install either the python 'xattr' module, "
4875 "or the 'xattr' binary.")
0c265486
YCH
4876
4877
4878def random_birthday(year_field, month_field, day_field):
aa374bc7
AS
4879 start_date = datetime.date(1950, 1, 1)
4880 end_date = datetime.date(1995, 12, 31)
4881 offset = random.randint(0, (end_date - start_date).days)
4882 random_date = start_date + datetime.timedelta(offset)
0c265486 4883 return {
aa374bc7
AS
4884 year_field: str(random_date.year),
4885 month_field: str(random_date.month),
4886 day_field: str(random_date.day),
0c265486 4887 }
732044af 4888
c76eb41b 4889
732044af 4890# Templates for internet shortcut files, which are plain text files.
4891DOT_URL_LINK_TEMPLATE = '''
4892[InternetShortcut]
4893URL=%(url)s
4894'''.lstrip()
4895
4896DOT_WEBLOC_LINK_TEMPLATE = '''
4897<?xml version="1.0" encoding="UTF-8"?>
4898<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4899<plist version="1.0">
4900<dict>
4901\t<key>URL</key>
4902\t<string>%(url)s</string>
4903</dict>
4904</plist>
4905'''.lstrip()
4906
4907DOT_DESKTOP_LINK_TEMPLATE = '''
4908[Desktop Entry]
4909Encoding=UTF-8
4910Name=%(filename)s
4911Type=Link
4912URL=%(url)s
4913Icon=text-html
4914'''.lstrip()
4915
08438d2c 4916LINK_TEMPLATES = {
4917 'url': DOT_URL_LINK_TEMPLATE,
4918 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4919 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4920}
4921
732044af 4922
4923def iri_to_uri(iri):
4924 """
4925 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4926
4927 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4928 """
4929
4930 iri_parts = compat_urllib_parse_urlparse(iri)
4931
4932 if '[' in iri_parts.netloc:
4933 raise ValueError('IPv6 URIs are not, yet, supported.')
4934 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4935
4936 # The `safe` argument values used by the following code contain the characters that should not be percent-encoded. Everything except letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Anything already percent-encoded will be left as is.
4937
4938 net_location = ''
4939 if iri_parts.username:
4940 net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
4941 if iri_parts.password is not None:
4942 net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
4943 net_location += '@'
4944
4945 net_location += iri_parts.hostname.encode('idna').decode('utf-8') # Punycode for Unicode hostnames.
4946 # The 'idna' encoding produces ASCII text.
4947 if iri_parts.port is not None and iri_parts.port != 80:
4948 net_location += ':' + str(iri_parts.port)
4949
4950 return compat_urllib_parse_urlunparse(
4951 (iri_parts.scheme,
4952 net_location,
4953
4954 compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
4955
4956 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
4957 compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
4958
4959 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
4960 compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
4961
4962 compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4963
4964 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4965
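# Usage sketch (illustrative; the sample IRI is arbitrary): non-ASCII path and
# query characters are percent-encoded as UTF-8 and the hostname is IDNA-encoded,
# while already-escaped sequences are left untouched.
#     iri_to_uri('https://example.com/søk?q=åpen dag')
#     # -> 'https://example.com/s%C3%B8k?q=%C3%A5pen+dag'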
4966
4967def to_high_limit_path(path):
4968 if sys.platform in ['win32', 'cygwin']:
4969 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
4970 return r'\\?\ '.rstrip() + os.path.abspath(path)
4971
4972 return path
76d321f6 4973
c76eb41b 4974
b868936c 4975def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
4976 if field is None:
4977 val = obj if obj is not None else default
4978 else:
4979 val = obj.get(field, default)
76d321f6 4980 if func and val not in ignore:
4981 val = func(val)
4982 return template % val if val not in ignore else default
00dd0cd5 4983
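# Usage sketch (illustrative; sample data is arbitrary): the field is pulled out
# of the dict (or the object itself is used when field is None) and values in
# `ignore` fall back to `default`.
#     format_field({'height': 1080}, 'height', '%sp')                     # -> '1080p'
#     format_field({'height': None}, 'height', '%sp', default='unknown')  # -> 'unknown'
#     format_field(5, template='%d videos')                               # -> '5 videos'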
4984
4985def clean_podcast_url(url):
4986 return re.sub(r'''(?x)
4987 (?:
4988 (?:
4989 chtbl\.com/track|
4990 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
4991 play\.podtrac\.com
4992 )/[^/]+|
4993 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4994 flex\.acast\.com|
4995 pd(?:
4996 cn\.co| # https://podcorn.com/analytics-prefix/
4997 st\.fm # https://podsights.com/docs/
4998 )/e
4999 )/''', '', url)
ffcb8191
THD
5000
5001
5002_HEX_TABLE = '0123456789abcdef'
5003
5004
5005def random_uuidv4():
5006 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 5007
5008
5009def make_dir(path, to_screen=None):
5010 try:
5011 dn = os.path.dirname(path)
5012 if dn and not os.path.exists(dn):
5013 os.makedirs(dn)
5014 return True
5015 except (OSError, IOError) as err:
5016 if callable(to_screen):
5017 to_screen('unable to create directory ' + error_to_compat_str(err))
5018 return False
f74980cb 5019
5020
5021def get_executable_path():
c552ae88 5022 from zipimport import zipimporter
5023 if hasattr(sys, 'frozen'): # Running from PyInstaller
5024 path = os.path.dirname(sys.executable)
5025 elif isinstance(globals().get('__loader__'), zipimporter): # Running from ZIP
5026 path = os.path.join(os.path.dirname(__file__), '../..')
5027 else:
5028 path = os.path.join(os.path.dirname(__file__), '..')
f74980cb 5029 return os.path.abspath(path)
5030
5031
2f567473 5032def load_plugins(name, suffix, namespace):
3ae5e797 5033 classes = {}
f74980cb 5034 try:
019a94f7
ÁS
5035 plugins_spec = importlib.util.spec_from_file_location(
5036 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5037 plugins = importlib.util.module_from_spec(plugins_spec)
5038 sys.modules[plugins_spec.name] = plugins
5039 plugins_spec.loader.exec_module(plugins)
f74980cb 5040 for name in dir(plugins):
2f567473 5041 if name in namespace:
5042 continue
5043 if not name.endswith(suffix):
f74980cb 5044 continue
5045 klass = getattr(plugins, name)
3ae5e797 5046 classes[name] = namespace[name] = klass
019a94f7 5047 except FileNotFoundError:
f74980cb 5048 pass
f74980cb 5049 return classes
06167fbb 5050
5051
325ebc17 5052def traverse_obj(
352d63fd 5053 obj, *path_list, default=None, expected_type=None, get_all=True,
325ebc17 5054 casesense=True, is_user_input=False, traverse_string=False):
324ad820 5055 ''' Traverse nested list/dict/tuple
8f334380 5056 @param path_list A list of paths which are checked one by one.
5057 Each path is a list of keys where each key is a string,
1797b073 5058 a function, a tuple of strings/None or "...".
2614f646 5059 When a function is given, it takes the key as argument and
5060 returns whether the key matches or not. When a tuple is given,
8f334380 5061 all the keys given in the tuple are traversed, and
5062 "..." traverses all the keys in the object
1797b073 5063 "None" returns the object without traversal
325ebc17 5064 @param default Default value to return
352d63fd 5065 @param expected_type Only accept final value of this type (Can also be any callable)
5066 @param get_all Return all the values obtained from a path or only the first one
324ad820 5067 @param casesense Whether to consider dictionary keys as case sensitive
5068 @param is_user_input Whether the keys are generated from user input. If True,
5069 strings are converted to int/slice if necessary
5070 @param traverse_string Whether to traverse inside strings. If True, any
5071 non-compatible object will also be converted into a string
8f334380 5072 # TODO: Write tests
324ad820 5073 '''
325ebc17 5074 if not casesense:
dbf5416a 5075 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
8f334380 5076 path_list = (map(_lower, variadic(path)) for path in path_list)
5077
5078 def _traverse_obj(obj, path, _current_depth=0):
5079 nonlocal depth
5080 path = tuple(variadic(path))
5081 for i, key in enumerate(path):
1797b073 5082 if None in (key, obj):
5083 return obj
8f334380 5084 if isinstance(key, (list, tuple)):
5085 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5086 key = ...
5087 if key is ...:
5088 obj = (obj.values() if isinstance(obj, dict)
5089 else obj if isinstance(obj, (list, tuple, LazyList))
5090 else str(obj) if traverse_string else [])
5091 _current_depth += 1
5092 depth = max(depth, _current_depth)
5093 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
2614f646 5094 elif callable(key):
5095 if isinstance(obj, (list, tuple, LazyList)):
5096 obj = enumerate(obj)
5097 elif isinstance(obj, dict):
5098 obj = obj.items()
5099 else:
5100 if not traverse_string:
5101 return None
5102 obj = str(obj)
5103 _current_depth += 1
5104 depth = max(depth, _current_depth)
5105 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
575e17a1 5106 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
325ebc17 5107 obj = (obj.get(key) if casesense or (key in obj)
5108 else next((v for k, v in obj.items() if _lower(k) == key), None))
5109 else:
5110 if is_user_input:
5111 key = (int_or_none(key) if ':' not in key
5112 else slice(*map(int_or_none, key.split(':'))))
8f334380 5113 if key == slice(None):
575e17a1 5114 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
325ebc17 5115 if not isinstance(key, (int, slice)):
9fea350f 5116 return None
8f334380 5117 if not isinstance(obj, (list, tuple, LazyList)):
325ebc17 5118 if not traverse_string:
5119 return None
5120 obj = str(obj)
5121 try:
5122 obj = obj[key]
5123 except IndexError:
324ad820 5124 return None
325ebc17 5125 return obj
5126
352d63fd 5127 if isinstance(expected_type, type):
5128 type_test = lambda val: val if isinstance(val, expected_type) else None
5129 elif expected_type is not None:
5130 type_test = expected_type
5131 else:
5132 type_test = lambda val: val
5133
8f334380 5134 for path in path_list:
5135 depth = 0
5136 val = _traverse_obj(obj, path)
325ebc17 5137 if val is not None:
8f334380 5138 if depth:
5139 for _ in range(depth - 1):
6586bca9 5140 val = itertools.chain.from_iterable(v for v in val if v is not None)
352d63fd 5141 val = [v for v in map(type_test, val) if v is not None]
8f334380 5142 if val:
352d63fd 5143 return val if get_all else val[0]
5144 else:
5145 val = type_test(val)
5146 if val is not None:
8f334380 5147 return val
325ebc17 5148 return default
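# Illustrative sketch (not part of the original module) of the path semantics
# documented in the docstring above, using made-up data:
def _example_traverse_obj():
    data = {'formats': [{'url': 'https://a.example/1.mp4'}, {'height': 720}]}
    # a plain key path
    assert traverse_obj(data, ('formats', 0, 'url')) == 'https://a.example/1.mp4'
    # "..." fans out over every list entry; only values of expected_type survive
    assert traverse_obj(data, ('formats', ..., 'url'), expected_type=str) == ['https://a.example/1.mp4']
    # a tuple of keys tries each alternative; get_all=False keeps the first hit
    assert traverse_obj(data, ('formats', 1, ('height', 'width')), get_all=False) == 720
    # a missing path falls back to default
    assert traverse_obj(data, ('formats', 5, 'url'), default='n/a') == 'n/a'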
324ad820 5149
5150
5151def traverse_dict(dictn, keys, casesense=True):
ee8dd27a 5152 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5153 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5154 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
6606817a 5155
5156
4b4b7f74 5157def variadic(x, allowed_types=(str, bytes, dict)):
cb89cfc1 5158 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
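# Illustrative sketch (not part of the original module): non-iterables and the
# allowed_types above are wrapped in a tuple, other iterables pass through.
def _example_variadic():
    assert variadic('spam') == ('spam',)
    assert variadic(['spam', 'eggs']) == ['spam', 'eggs']
    assert variadic({'spam': 1}) == ({'spam': 1},)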
bd50a52b
THD
5159
5160
49fa4d9a
N
5161# create a JSON Web Signature (JWS) with HS256 algorithm
5162# the resulting format is in JWS Compact Serialization
5163# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5164# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5165def jwt_encode_hs256(payload_data, key, headers={}):
5166 header_data = {
5167 'alg': 'HS256',
5168 'typ': 'JWT',
5169 }
5170 if headers:
5171 header_data.update(headers)
5172 header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
5173 payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
5174 h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
5175 signature_b64 = base64.b64encode(h.digest())
5176 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5177 return token
819e0531 5178
5179
16b0d7e6 5180# can be extended in the future to verify the signature, parse the header, and return the algorithm used if it's not HS256
5181def jwt_decode_hs256(jwt):
5182 header_b64, payload_b64, signature_b64 = jwt.split('.')
5183 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64 + '===')) # pad in case the trailing ='s were stripped; extra padding is ignored
5184 return payload_data
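# Illustrative round-trip sketch (not part of the original module) with a
# made-up key and payload; note that jwt_decode_hs256() reads the payload back
# without verifying the signature.
def _example_jwt_hs256():
    token = jwt_encode_hs256({'sub': 'example'}, 'hypothetical-secret', headers={'kid': 'demo'})
    assert jwt_decode_hs256(token.decode('utf-8'))['sub'] == 'example'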
5185
5186
819e0531 5187def supports_terminal_sequences(stream):
5188 if compat_os_name == 'nt':
e3c7d495 5189 from .compat import WINDOWS_VT_MODE # Must be imported locally
5190 if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
819e0531 5191 return False
5192 elif not os.getenv('TERM'):
5193 return False
5194 try:
5195 return stream.isatty()
5196 except BaseException:
5197 return False
5198
5199
ec11a9f4 5200_terminal_sequences_re = re.compile('\033\\[[^m]+m')
5201
5202
5203def remove_terminal_sequences(string):
5204 return _terminal_sequences_re.sub('', string)
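# Illustrative sketch (not part of the original module): strips the ANSI SGR
# color codes that are only emitted when supports_terminal_sequences() reports
# a capable terminal.
def _example_remove_terminal_sequences():
    assert remove_terminal_sequences('\033[0;31merror\033[0m: oops') == 'error: oops'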
5205
5206
5207def number_of_digits(number):
5208 return len('%d' % number)
34921b43 5209
5210
5211def join_nonempty(*values, delim='-', from_dict=None):
5212 if from_dict is not None:
c586f9e8 5213 values = map(from_dict.get, values)
34921b43 5214 return delim.join(map(str, filter(None, values)))
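# Illustrative sketch (not part of the original module): falsy values are
# dropped before joining; from_dict looks the values up by key first.
def _example_join_nonempty():
    assert join_nonempty('mp4', None, '', 720, delim='.') == 'mp4.720'
    assert join_nonempty('title', 'id', from_dict={'title': 'T', 'id': 123}) == 'T-123'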
06e57990 5215
5216
5217class Config:
5218 own_args = None
5219 filename = None
5220 __initialized = False
5221
5222 def __init__(self, parser, label=None):
5223 self._parser, self.label = parser, label
5224 self._loaded_paths, self.configs = set(), []
5225
5226 def init(self, args=None, filename=None):
5227 assert not self.__initialized
5228 if filename:
5229 location = os.path.realpath(filename)
5230 if location in self._loaded_paths:
5231 return False
5232 self._loaded_paths.add(location)
5233
5234 self.__initialized = True
5235 self.own_args, self.filename = args, filename
5236 for location in self._parser.parse_args(args)[0].config_locations or []:
5237 location = compat_expanduser(location)
5238 if os.path.isdir(location):
5239 location = os.path.join(location, 'yt-dlp.conf')
5240 if not os.path.exists(location):
5241 self._parser.error(f'config location {location} does not exist')
5242 self.append_config(self.read_file(location), location)
5243 return True
5244
5245 def __str__(self):
5246 label = join_nonempty(
5247 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5248 delim=' ')
5249 return join_nonempty(
5250 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5251 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5252 delim='\n')
5253
5254 @staticmethod
5255 def read_file(filename, default=[]):
5256 try:
5257 optionf = open(filename)
5258 except IOError:
5259 return default # silently skip if file is not present
5260 try:
5261 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5262 contents = optionf.read()
5263 if sys.version_info < (3,):
5264 contents = contents.decode(preferredencoding())
5265 res = compat_shlex_split(contents, comments=True)
5266 finally:
5267 optionf.close()
5268 return res
5269
5270 @staticmethod
5271 def hide_login_info(opts):
5272 PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
5273 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5274
5275 def _scrub_eq(o):
5276 m = eqre.match(o)
5277 if m:
5278 return m.group('key') + '=PRIVATE'
5279 else:
5280 return o
5281
5282 opts = list(map(_scrub_eq, opts))
5283 for idx, opt in enumerate(opts):
5284 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5285 opts[idx + 1] = 'PRIVATE'
5286 return opts
5287
5288 def append_config(self, *args, label=None):
5289 config = type(self)(self._parser, label)
5290 config._loaded_paths = self._loaded_paths
5291 if config.init(*args):
5292 self.configs.append(config)
5293
5294 @property
5295 def all_args(self):
5296 for config in reversed(self.configs):
5297 yield from config.all_args
5298 yield from self.own_args or []
5299
5300 def parse_args(self):
5301 return self._parser.parse_args(list(self.all_args))
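# Illustrative sketch (not part of the original module): Config.hide_login_info()
# masks credential-carrying options, whether passed as separate arguments or in
# --opt=value form. The argument values below are made up.
def _example_config_hide_login_info():
    masked = Config.hide_login_info(['-u', 'me@example.com', '--password=hunter2', '-v'])
    assert masked == ['-u', 'PRIVATE', '--password=PRIVATE', '-v']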