#!/usr/bin/env python3
# coding: utf-8

from __future__ import unicode_literals

import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.utils
import email.header
import errno
import functools
import gzip
import hashlib
import hmac
import importlib.util
import io
import itertools
import json
import locale
import math
import operator
import os
import platform
import random
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import traceback
import xml.etree.ElementTree
import zlib
import mimetypes

from .compat import (
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_HTTPError,
    compat_basestring,
    compat_chr,
    compat_cookiejar,
    compat_ctypes_WINFUNCTYPE,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_integer_types,
    compat_numeric_types,
    compat_kwargs,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_split,
    compat_shlex_quote,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_urlunparse,
    compat_urllib_parse_quote,
    compat_urllib_parse_quote_plus,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        try:
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        except OSError:
            pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
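
# Illustrative sketch (not from the original source): how a namespaced path is
# expanded, assuming a made-up prefix mapping.
# >>> xpath_with_ns('media:song/media:author', {'media': 'http://example.com/'})
# '{http://example.com/}song/{http://example.com/}author'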


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of all tags with the specified attribute in the passed HTML document, as a list"""

    value = re.escape(value) if escape_value else value

    retlist = []
    for m in re.finditer(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html):
        res = m.group('content')

        if res.startswith('"') or res.startswith("'"):
            res = res[1:-1]

        retlist.append(unescapeHTML(res))

    return retlist
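
# Usage sketch (illustrative only): the class matcher above is a word-boundary
# regex, so a tag that carries additional classes still matches.
# >>> get_element_by_class('foo', '<span class="foo baz">nice</span>')
# 'nice'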


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return ' '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    if s == '':
        return ''
    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
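
# Usage sketch (illustrative; outputs follow from replace_insane above):
# timestamps keep their digits with ':' turned into '_', and in restricted mode
# accented characters are transliterated while other non-ASCII becomes '_'.
# >>> sanitize_filename('New World record at 0:12:34')
# 'New World record at 0_12_34'
# >>> sanitize_filename('aäb中国的c', restricted=True)
# 'aab_c'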


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
        if sys.version_info < (2, 7) and not drive_or_unc:
            drive_or_unc, _ = os.path.splitunc(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
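
# Usage sketch (illustrative): protocol-less URLs gain an http: scheme and the
# scheme typos listed above are repaired.
# >>> sanitize_url('//foo.bar')
# 'http://foo.bar'
# >>> sanitize_url('httpss://foo.bar')
# 'https://foo.bar'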


def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
    return url, 'Basic ' + auth_payload.decode('utf-8')
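
# Usage sketch (illustrative; the credentials are made up): userinfo is
# stripped from the URL and returned as a ready-to-use Authorization value.
# >>> extract_basic_auth('http://user:pass@example.org/page')
# ('http://example.org/page', 'Basic dXNlcjpwYXNz')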


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
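
# Usage sketch (illustrative): order-preserving de-duplication.
# >>> orderedSet([135, 1, 1, 1])
# [135, 1]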


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
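
# Usage sketch (illustrative): named, numeric and HTML5 entities are decoded;
# anything that does not parse as an entity is left as-is.
# >>> unescapeHTML('&#39;')
# "'"
# >>> unescapeHTML('&a&quot;')
# '&a"'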


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        return process_communicate_or_kill(self, *args, **kwargs)


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
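
# Usage sketch (illustrative): hours/minutes are only emitted when non-zero,
# and msec=True appends milliseconds.
# >>> formatSeconds(3661)
# '1:01:01'
# >>> formatSeconds(10, msec=True)
# '10.000'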


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        try:
            ssl_context.load_verify_locations(cadata=cert)
        except ssl.SSLError:
            pass


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                # Create a new context to discard any certificates that were already loaded
                context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
                context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    if ytdl_is_updateable():
        update_cmd = 'type yt-dlp -U to update'
    else:
        update_cmd = 'see https://github.com/yt-dlp/yt-dlp on how to update'
    msg = 'please report this issue on https://github.com/yt-dlp/yt-dlp .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call yt-dlp with the --verbose flag and include its complete output.'

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super(ExtractorError, self).__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            self.msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super(GeoRestrictedError, self).__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure:
                    secure = 'TRUE'
                else:
                    secure = 'FALSE'
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                else:
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                else:
                    expires = ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # On python 2 urlh.geturl() may sometimes return redirect URL
        # as byte string instead of unicode. This workaround allows
        # to force it always return unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True)
fca6dba8
S
1595
1596
46f59e89
S
1597def extract_timezone(date_str):
1598 m = re.search(
f137e4c2 1599 r'''(?x)
1600 ^.{8,}? # >=8 char non-TZ prefix, if present
1601 (?P<tz>Z| # just the UTC Z, or
1602 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1603 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1604 [ ]? # optional space
1605 (?P<sign>\+|-) # +/-
1606 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1607 $)
1608 ''', date_str)
46f59e89
S
1609 if not m:
1610 timezone = datetime.timedelta()
1611 else:
1612 date_str = date_str[:-len(m.group('tz'))]
1613 if not m.group('sign'):
1614 timezone = datetime.timedelta()
1615 else:
1616 sign = 1 if m.group('sign') == '+' else -1
1617 timezone = datetime.timedelta(
1618 hours=sign * int(m.group('hours')),
1619 minutes=sign * int(m.group('minutes')))
1620 return timezone, date_str
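# Example (illustrative, not from the original source): a trailing "+HH:MM"/"+HHMM"
# offset is split off and converted to a timedelta; strings without an offset
# yield an empty timedelta.
#   >>> extract_timezone('2021-01-01T12:00:00+0530')
#   (datetime.timedelta(seconds=19800), '2021-01-01T12:00:00')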
1621
1622
08b38d54 1623def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1624 """ Return a UNIX timestamp from the given date """
1625
1626 if date_str is None:
1627 return None
1628
52c3a6e4
S
1629 date_str = re.sub(r'\.[0-9]+', '', date_str)
1630
08b38d54 1631 if timezone is None:
46f59e89
S
1632 timezone, date_str = extract_timezone(date_str)
1633
52c3a6e4
S
1634 try:
1635 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1636 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1637 return calendar.timegm(dt.timetuple())
1638 except ValueError:
1639 pass
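# Example (illustrative): the offset is applied before converting to a UNIX
# timestamp, so both of these should yield the same value.
#   >>> parse_iso8601('2014-03-23T23:04:26+0100')
#   1395612266
#   >>> parse_iso8601('2014-03-23T22:04:26Z')
#   1395612266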
912b38b4
PH
1640
1641
46f59e89
S
1642def date_formats(day_first=True):
1643 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1644
1645
42bdd9d0 1646def unified_strdate(date_str, day_first=True):
bf50b038 1647 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1648
1649 if date_str is None:
1650 return None
bf50b038 1651 upload_date = None
5f6a1245 1652 # Replace commas
026fcc04 1653 date_str = date_str.replace(',', ' ')
42bdd9d0 1654 # Remove AM/PM + timezone
9bb8e0a3 1655 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1656 _, date_str = extract_timezone(date_str)
42bdd9d0 1657
46f59e89 1658 for expression in date_formats(day_first):
bf50b038
JMF
1659 try:
1660 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
5de90176 1661 except ValueError:
bf50b038 1662 pass
42393ce2
PH
1663 if upload_date is None:
1664 timetuple = email.utils.parsedate_tz(date_str)
1665 if timetuple:
c6b9cf05
S
1666 try:
1667 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1668 except ValueError:
1669 pass
6a750402
JMF
1670 if upload_date is not None:
1671 return compat_str(upload_date)
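# Examples (illustrative; these rely on the DATE_FORMATS tables defined
# elsewhere in this module):
#   >>> unified_strdate('December 21, 2010')
#   '20101221'
#   >>> unified_strdate('11/26/2014 11:30:00 AM PST', day_first=False)
#   '20141126'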
bf50b038 1672
5f6a1245 1673
46f59e89
S
1674def unified_timestamp(date_str, day_first=True):
1675 if date_str is None:
1676 return None
1677
2ae2ffda 1678 date_str = re.sub(r'[,|]', '', date_str)
46f59e89 1679
7dc2a74e 1680 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1681 timezone, date_str = extract_timezone(date_str)
1682
1683 # Remove AM/PM + timezone
1684 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1685
deef3195
S
1686 # Remove unrecognized timezones from ISO 8601-like timestamps
1687 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1688 if m:
1689 date_str = date_str[:-len(m.group('tz'))]
1690
f226880c
PH
1691 # Python only supports microseconds, so remove nanoseconds
1692 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1693 if m:
1694 date_str = m.group(1)
1695
46f59e89
S
1696 for expression in date_formats(day_first):
1697 try:
7dc2a74e 1698 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89
S
1699 return calendar.timegm(dt.timetuple())
1700 except ValueError:
1701 pass
1702 timetuple = email.utils.parsedate_tz(date_str)
1703 if timetuple:
7dc2a74e 1704 return calendar.timegm(timetuple) + pm_delta * 3600
46f59e89
S
1705
1706
28e614de 1707def determine_ext(url, default_ext='unknown_video'):
85750f89 1708 if url is None or '.' not in url:
f4776371 1709 return default_ext
9cb9a5df 1710 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1711 if re.match(r'^[A-Za-z0-9]+$', guess):
1712 return guess
a7aaa398
S
1713 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1714 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1715 return guess.rstrip('/')
73e79f2a 1716 else:
cbdbb766 1717 return default_ext
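# Examples (illustrative; the second case relies on KNOWN_EXTENSIONS defined
# elsewhere in this module):
#   >>> determine_ext('http://example.com/video.mp4')
#   'mp4'
#   >>> determine_ext('http://example.com/foo/bar.mp4/?download')
#   'mp4'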
73e79f2a 1718
5f6a1245 1719
824fa511
S
1720def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1721 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
d4051a8e 1722
5f6a1245 1723
9e62f283 1724def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
37254abc
JMF
1725 """
1726 Return a datetime object from a string in the format YYYYMMDD or
9e62f283 1727 (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1728
1729 format: date format string used to parse date_str into a datetime object
1730 precision: round the time portion of a datetime object.
1731 auto|microsecond|second|minute|hour|day.
1732 auto: round to the unit provided in date_str (if applicable).
1733 """
1734 auto_precision = False
1735 if precision == 'auto':
1736 auto_precision = True
1737 precision = 'microsecond'
1738 today = datetime_round(datetime.datetime.now(), precision)
f8795e10 1739 if date_str in ('now', 'today'):
37254abc 1740 return today
f8795e10
PH
1741 if date_str == 'yesterday':
1742 return today - datetime.timedelta(days=1)
9e62f283 1743 match = re.match(
1744 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
1745 date_str)
37254abc 1746 if match is not None:
9e62f283 1747 start_time = datetime_from_str(match.group('start'), precision, format)
1748 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
37254abc 1749 unit = match.group('unit')
9e62f283 1750 if unit == 'month' or unit == 'year':
1751 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
37254abc 1752 unit = 'day'
9e62f283 1753 else:
1754 if unit == 'week':
1755 unit = 'day'
1756 time *= 7
1757 delta = datetime.timedelta(**{unit + 's': time})
1758 new_date = start_time + delta
1759 if auto_precision:
1760 return datetime_round(new_date, unit)
1761 return new_date
1762
1763 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1764
1765
1766def date_from_str(date_str, format='%Y%m%d'):
1767 """
1768 Return a datetime object from a string in the format YYYYMMDD or
1769 (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1770
1771 format: date format string used to parse date_str into a datetime object
1772 """
1773 return datetime_from_str(date_str, precision='microsecond', format=format).date()
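# Examples (illustrative):
#   >>> date_from_str('20200229')
#   datetime.date(2020, 2, 29)
#   >>> date_from_str('today-1week')   # the date seven days before today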
1774
1775
1776def datetime_add_months(dt, months):
1777 """Increment/Decrement a datetime object by months."""
1778 month = dt.month + months - 1
1779 year = dt.year + month // 12
1780 month = month % 12 + 1
1781 day = min(dt.day, calendar.monthrange(year, month)[1])
1782 return dt.replace(year, month, day)
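# Example (illustrative): the day is clamped to the length of the target month.
#   >>> datetime_add_months(datetime.date(2021, 1, 31), 1)
#   datetime.date(2021, 2, 28)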
1783
1784
1785def datetime_round(dt, precision='day'):
1786 """
1787 Round a datetime object's time to a specific precision
1788 """
1789 if precision == 'microsecond':
1790 return dt
1791
1792 unit_seconds = {
1793 'day': 86400,
1794 'hour': 3600,
1795 'minute': 60,
1796 'second': 1,
1797 }
1798 roundto = lambda x, n: ((x + n / 2) // n) * n
1799 timestamp = calendar.timegm(dt.timetuple())
1800 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
5f6a1245
JW
1801
1802
e63fc1be 1803def hyphenate_date(date_str):
1804 """
1805 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1806 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1807 if match is not None:
1808 return '-'.join(match.groups())
1809 else:
1810 return date_str
1811
5f6a1245 1812
bd558525
JMF
1813class DateRange(object):
1814 """Represents a time interval between two dates"""
5f6a1245 1815
bd558525
JMF
1816 def __init__(self, start=None, end=None):
1817 """start and end must be strings in the format accepted by date"""
1818 if start is not None:
1819 self.start = date_from_str(start)
1820 else:
1821 self.start = datetime.datetime.min.date()
1822 if end is not None:
1823 self.end = date_from_str(end)
1824 else:
1825 self.end = datetime.datetime.max.date()
37254abc 1826 if self.start > self.end:
bd558525 1827 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
5f6a1245 1828
bd558525
JMF
1829 @classmethod
1830 def day(cls, day):
1831 """Returns a range that only contains the given day"""
5f6a1245
JW
1832 return cls(day, day)
1833
bd558525
JMF
1834 def __contains__(self, date):
1835 """Check if the date is in the range"""
37254abc
JMF
1836 if not isinstance(date, datetime.date):
1837 date = date_from_str(date)
1838 return self.start <= date <= self.end
5f6a1245 1839
bd558525 1840 def __str__(self):
5f6a1245 1841 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
1842
1843
1844def platform_name():
1845 """ Returns the platform name as a compat_str """
1846 res = platform.platform()
1847 if isinstance(res, bytes):
1848 res = res.decode(preferredencoding())
1849
1850 assert isinstance(res, compat_str)
1851 return res
c257baff
PH
1852
1853
49fa4d9a
N
1854def get_windows_version():
1855 ''' Get Windows version. None if it's not running on Windows '''
1856 if compat_os_name == 'nt':
1857 return version_tuple(platform.win32_ver()[1])
1858 else:
1859 return None
1860
1861
b58ddb32
PH
1862def _windows_write_string(s, out):
1863 """ Returns True if the string was written using special methods,
1864 False if it has yet to be written out."""
1865 # Adapted from http://stackoverflow.com/a/3259271/35070
1866
b58ddb32
PH
1867 import ctypes.wintypes
1868
1869 WIN_OUTPUT_IDS = {
1870 1: -11,
1871 2: -12,
1872 }
1873
a383a98a
PH
1874 try:
1875 fileno = out.fileno()
1876 except AttributeError:
1877 # If the output stream doesn't have a fileno, it's virtual
1878 return False
aa42e873
PH
1879 except io.UnsupportedOperation:
1880 # Some strange Windows pseudo files?
1881 return False
b58ddb32
PH
1882 if fileno not in WIN_OUTPUT_IDS:
1883 return False
1884
d7cd9a9e 1885 GetStdHandle = compat_ctypes_WINFUNCTYPE(
b58ddb32 1886 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
d7cd9a9e 1887 ('GetStdHandle', ctypes.windll.kernel32))
b58ddb32
PH
1888 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1889
d7cd9a9e 1890 WriteConsoleW = compat_ctypes_WINFUNCTYPE(
b58ddb32
PH
1891 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1892 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
d7cd9a9e 1893 ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
b58ddb32
PH
1894 written = ctypes.wintypes.DWORD(0)
1895
d7cd9a9e 1896 GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
b58ddb32
PH
1897 FILE_TYPE_CHAR = 0x0002
1898 FILE_TYPE_REMOTE = 0x8000
d7cd9a9e 1899 GetConsoleMode = compat_ctypes_WINFUNCTYPE(
b58ddb32
PH
1900 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1901 ctypes.POINTER(ctypes.wintypes.DWORD))(
d7cd9a9e 1902 ('GetConsoleMode', ctypes.windll.kernel32))
b58ddb32
PH
1903 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1904
1905 def not_a_console(handle):
1906 if handle == INVALID_HANDLE_VALUE or handle is None:
1907 return True
3089bc74
S
1908 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1909 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
b58ddb32
PH
1910
1911 if not_a_console(h):
1912 return False
1913
d1b9c912
PH
1914 def next_nonbmp_pos(s):
1915 try:
1916 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1917 except StopIteration:
1918 return len(s)
1919
1920 while s:
1921 count = min(next_nonbmp_pos(s), 1024)
1922
b58ddb32 1923 ret = WriteConsoleW(
d1b9c912 1924 h, s, count if count else 2, ctypes.byref(written), None)
b58ddb32
PH
1925 if ret == 0:
1926 raise OSError('Failed to write string')
d1b9c912
PH
1927 if not count: # We just wrote a non-BMP character
1928 assert written.value == 2
1929 s = s[1:]
1930 else:
1931 assert written.value > 0
1932 s = s[written.value:]
b58ddb32
PH
1933 return True
1934
1935
734f90bb 1936def write_string(s, out=None, encoding=None):
7459e3a2
PH
1937 if out is None:
1938 out = sys.stderr
8bf48f23 1939 assert type(s) == compat_str
7459e3a2 1940
b58ddb32
PH
1941 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1942 if _windows_write_string(s, out):
1943 return
1944
3089bc74
S
1945 if ('b' in getattr(out, 'mode', '')
1946 or sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
104aa738
PH
1947 byt = s.encode(encoding or preferredencoding(), 'ignore')
1948 out.write(byt)
1949 elif hasattr(out, 'buffer'):
1950 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1951 byt = s.encode(enc, 'ignore')
1952 out.buffer.write(byt)
1953 else:
8bf48f23 1954 out.write(s)
7459e3a2
PH
1955 out.flush()
1956
1957
48ea9cea
PH
1958def bytes_to_intlist(bs):
1959 if not bs:
1960 return []
1961 if isinstance(bs[0], int): # Python 3
1962 return list(bs)
1963 else:
1964 return [ord(c) for c in bs]
1965
c257baff 1966
cba892fa 1967def intlist_to_bytes(xs):
1968 if not xs:
1969 return b''
edaa23f8 1970 return compat_struct_pack('%dB' % len(xs), *xs)
c38b1e77
PH
1971
1972
c1c9a79c
PH
1973# Cross-platform file locking
1974if sys.platform == 'win32':
1975 import ctypes.wintypes
1976 import msvcrt
1977
1978 class OVERLAPPED(ctypes.Structure):
1979 _fields_ = [
1980 ('Internal', ctypes.wintypes.LPVOID),
1981 ('InternalHigh', ctypes.wintypes.LPVOID),
1982 ('Offset', ctypes.wintypes.DWORD),
1983 ('OffsetHigh', ctypes.wintypes.DWORD),
1984 ('hEvent', ctypes.wintypes.HANDLE),
1985 ]
1986
1987 kernel32 = ctypes.windll.kernel32
1988 LockFileEx = kernel32.LockFileEx
1989 LockFileEx.argtypes = [
1990 ctypes.wintypes.HANDLE, # hFile
1991 ctypes.wintypes.DWORD, # dwFlags
1992 ctypes.wintypes.DWORD, # dwReserved
1993 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1994 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1995 ctypes.POINTER(OVERLAPPED) # Overlapped
1996 ]
1997 LockFileEx.restype = ctypes.wintypes.BOOL
1998 UnlockFileEx = kernel32.UnlockFileEx
1999 UnlockFileEx.argtypes = [
2000 ctypes.wintypes.HANDLE, # hFile
2001 ctypes.wintypes.DWORD, # dwReserved
2002 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2003 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2004 ctypes.POINTER(OVERLAPPED) # Overlapped
2005 ]
2006 UnlockFileEx.restype = ctypes.wintypes.BOOL
2007 whole_low = 0xffffffff
2008 whole_high = 0x7fffffff
2009
2010 def _lock_file(f, exclusive):
2011 overlapped = OVERLAPPED()
2012 overlapped.Offset = 0
2013 overlapped.OffsetHigh = 0
2014 overlapped.hEvent = 0
2015 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2016 handle = msvcrt.get_osfhandle(f.fileno())
2017 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
2018 whole_low, whole_high, f._lock_file_overlapped_p):
2019 raise OSError('Locking file failed: %r' % ctypes.FormatError())
2020
2021 def _unlock_file(f):
2022 assert f._lock_file_overlapped_p
2023 handle = msvcrt.get_osfhandle(f.fileno())
2024 if not UnlockFileEx(handle, 0,
2025 whole_low, whole_high, f._lock_file_overlapped_p):
2026 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2027
2028else:
399a76e6
YCH
2029 # Some platforms, such as Jython, are missing fcntl
2030 try:
2031 import fcntl
c1c9a79c 2032
399a76e6
YCH
2033 def _lock_file(f, exclusive):
2034 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
c1c9a79c 2035
399a76e6
YCH
2036 def _unlock_file(f):
2037 fcntl.flock(f, fcntl.LOCK_UN)
2038 except ImportError:
2039 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
2040
2041 def _lock_file(f, exclusive):
2042 raise IOError(UNSUPPORTED_MSG)
2043
2044 def _unlock_file(f):
2045 raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
2046
2047
2048class locked_file(object):
2049 def __init__(self, filename, mode, encoding=None):
2050 assert mode in ['r', 'a', 'w']
2051 self.f = io.open(filename, mode, encoding=encoding)
2052 self.mode = mode
2053
2054 def __enter__(self):
2055 exclusive = self.mode != 'r'
2056 try:
2057 _lock_file(self.f, exclusive)
2058 except IOError:
2059 self.f.close()
2060 raise
2061 return self
2062
2063 def __exit__(self, etype, value, traceback):
2064 try:
2065 _unlock_file(self.f)
2066 finally:
2067 self.f.close()
2068
2069 def __iter__(self):
2070 return iter(self.f)
2071
2072 def write(self, *args):
2073 return self.f.write(*args)
2074
2075 def read(self, *args):
2076 return self.f.read(*args)
4eb7f1d1
JMF
2077
2078
4644ac55
S
2079def get_filesystem_encoding():
2080 encoding = sys.getfilesystemencoding()
2081 return encoding if encoding is not None else 'utf-8'
2082
2083
4eb7f1d1 2084def shell_quote(args):
a6a173c2 2085 quoted_args = []
4644ac55 2086 encoding = get_filesystem_encoding()
a6a173c2
JMF
2087 for a in args:
2088 if isinstance(a, bytes):
2089 # We may get a filename encoded with 'encodeFilename'
2090 a = a.decode(encoding)
aefce8e6 2091 quoted_args.append(compat_shlex_quote(a))
28e614de 2092 return ' '.join(quoted_args)
9d4660ca
PH
2093
2094
2095def smuggle_url(url, data):
2096 """ Pass additional data in a URL for internal use. """
2097
81953d1a
RA
2098 url, idata = unsmuggle_url(url, {})
2099 data.update(idata)
15707c7e 2100 sdata = compat_urllib_parse_urlencode(
28e614de
PH
2101 {'__youtubedl_smuggle': json.dumps(data)})
2102 return url + '#' + sdata
9d4660ca
PH
2103
2104
79f82953 2105def unsmuggle_url(smug_url, default=None):
83e865a3 2106 if '#__youtubedl_smuggle' not in smug_url:
79f82953 2107 return smug_url, default
28e614de
PH
2108 url, _, sdata = smug_url.rpartition('#')
2109 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
2110 data = json.loads(jsond)
2111 return url, data
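# Example (illustrative): smuggle_url/unsmuggle_url round-trip extra data
# through the URL fragment.
#   >>> url = smuggle_url('http://example.com/video', {'referer': 'http://example.com'})
#   >>> unsmuggle_url(url)
#   ('http://example.com/video', {'referer': 'http://example.com'})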
02dbf93f
PH
2112
2113
e0fd9573 2114def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2115 """ Formats numbers with decimal sufixes like K, M, etc """
2116 num, factor = float_or_none(num), float(factor)
2117 if num is None:
2118 return None
2119 exponent = 0 if num == 0 else int(math.log(num, factor))
abbeeebc 2120 suffix = ['', *'kMGTPEZY'][exponent]
2121 if factor == 1024:
2122 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
e0fd9573 2123 converted = num / (factor ** exponent)
abbeeebc 2124 return fmt % (converted, suffix)
e0fd9573 2125
2126
02dbf93f 2127def format_bytes(bytes):
f02d24d8 2128 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
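# Examples (illustrative):
#   >>> format_decimal_suffix(1234, '%.1f%s')
#   '1.2k'
#   >>> format_bytes(1536000)
#   '1.46MiB'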
f53c966a 2129
1c088fa8 2130
fb47597b
S
2131def lookup_unit_table(unit_table, s):
2132 units_re = '|'.join(re.escape(u) for u in unit_table)
2133 m = re.match(
782b1b5b 2134 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
fb47597b
S
2135 if not m:
2136 return None
2137 num_str = m.group('num').replace(',', '.')
2138 mult = unit_table[m.group('unit')]
2139 return int(float(num_str) * mult)
2140
2141
be64b5b0
PH
2142def parse_filesize(s):
2143 if s is None:
2144 return None
2145
dfb1b146 2146 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
2147 # but we support those too
2148 _UNIT_TABLE = {
2149 'B': 1,
2150 'b': 1,
70852b47 2151 'bytes': 1,
be64b5b0
PH
2152 'KiB': 1024,
2153 'KB': 1000,
2154 'kB': 1024,
2155 'Kb': 1000,
13585d76 2156 'kb': 1000,
70852b47
YCH
2157 'kilobytes': 1000,
2158 'kibibytes': 1024,
be64b5b0
PH
2159 'MiB': 1024 ** 2,
2160 'MB': 1000 ** 2,
2161 'mB': 1024 ** 2,
2162 'Mb': 1000 ** 2,
13585d76 2163 'mb': 1000 ** 2,
70852b47
YCH
2164 'megabytes': 1000 ** 2,
2165 'mebibytes': 1024 ** 2,
be64b5b0
PH
2166 'GiB': 1024 ** 3,
2167 'GB': 1000 ** 3,
2168 'gB': 1024 ** 3,
2169 'Gb': 1000 ** 3,
13585d76 2170 'gb': 1000 ** 3,
70852b47
YCH
2171 'gigabytes': 1000 ** 3,
2172 'gibibytes': 1024 ** 3,
be64b5b0
PH
2173 'TiB': 1024 ** 4,
2174 'TB': 1000 ** 4,
2175 'tB': 1024 ** 4,
2176 'Tb': 1000 ** 4,
13585d76 2177 'tb': 1000 ** 4,
70852b47
YCH
2178 'terabytes': 1000 ** 4,
2179 'tebibytes': 1024 ** 4,
be64b5b0
PH
2180 'PiB': 1024 ** 5,
2181 'PB': 1000 ** 5,
2182 'pB': 1024 ** 5,
2183 'Pb': 1000 ** 5,
13585d76 2184 'pb': 1000 ** 5,
70852b47
YCH
2185 'petabytes': 1000 ** 5,
2186 'pebibytes': 1024 ** 5,
be64b5b0
PH
2187 'EiB': 1024 ** 6,
2188 'EB': 1000 ** 6,
2189 'eB': 1024 ** 6,
2190 'Eb': 1000 ** 6,
13585d76 2191 'eb': 1000 ** 6,
70852b47
YCH
2192 'exabytes': 1000 ** 6,
2193 'exbibytes': 1024 ** 6,
be64b5b0
PH
2194 'ZiB': 1024 ** 7,
2195 'ZB': 1000 ** 7,
2196 'zB': 1024 ** 7,
2197 'Zb': 1000 ** 7,
13585d76 2198 'zb': 1000 ** 7,
70852b47
YCH
2199 'zettabytes': 1000 ** 7,
2200 'zebibytes': 1024 ** 7,
be64b5b0
PH
2201 'YiB': 1024 ** 8,
2202 'YB': 1000 ** 8,
2203 'yB': 1024 ** 8,
2204 'Yb': 1000 ** 8,
13585d76 2205 'yb': 1000 ** 8,
70852b47
YCH
2206 'yottabytes': 1000 ** 8,
2207 'yobibytes': 1024 ** 8,
be64b5b0
PH
2208 }
2209
fb47597b
S
2210 return lookup_unit_table(_UNIT_TABLE, s)
2211
2212
2213def parse_count(s):
2214 if s is None:
be64b5b0
PH
2215 return None
2216
352d5da8 2217 s = re.sub(r'^[^\d]+\s', '', s).strip()
fb47597b
S
2218
2219 if re.match(r'^[\d,.]+$', s):
2220 return str_to_int(s)
2221
2222 _UNIT_TABLE = {
2223 'k': 1000,
2224 'K': 1000,
2225 'm': 1000 ** 2,
2226 'M': 1000 ** 2,
2227 'kk': 1000 ** 2,
2228 'KK': 1000 ** 2,
352d5da8 2229 'b': 1000 ** 3,
2230 'B': 1000 ** 3,
fb47597b 2231 }
be64b5b0 2232
352d5da8 2233 ret = lookup_unit_table(_UNIT_TABLE, s)
2234 if ret is not None:
2235 return ret
2236
2237 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2238 if mobj:
2239 return str_to_int(mobj.group(1))
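# Examples (illustrative):
#   >>> parse_filesize('1.5GB')
#   1500000000
#   >>> parse_filesize('5 MiB')
#   5242880
#   >>> parse_count('1.5M')
#   1500000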
be64b5b0 2240
2f7ae819 2241
b871d7e9
S
2242def parse_resolution(s):
2243 if s is None:
2244 return {}
2245
17ec8bcf 2246 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
b871d7e9
S
2247 if mobj:
2248 return {
2249 'width': int(mobj.group('w')),
2250 'height': int(mobj.group('h')),
2251 }
2252
17ec8bcf 2253 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
b871d7e9
S
2254 if mobj:
2255 return {'height': int(mobj.group(1))}
2256
2257 mobj = re.search(r'\b([48])[kK]\b', s)
2258 if mobj:
2259 return {'height': int(mobj.group(1)) * 540}
2260
2261 return {}
2262
2263
0dc41787
S
2264def parse_bitrate(s):
2265 if not isinstance(s, compat_str):
2266 return
2267 mobj = re.search(r'\b(\d+)\s*kbps', s)
2268 if mobj:
2269 return int(mobj.group(1))
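# Examples (illustrative):
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('720p')
#   {'height': 720}
#   >>> parse_bitrate('192 kbps')
#   192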
2270
2271
a942d6cb 2272def month_by_name(name, lang='en'):
caefb1de
PH
2273 """ Return the number of a month by (locale-independently) English name """
2274
f6717dec 2275 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 2276
caefb1de 2277 try:
f6717dec 2278 return month_names.index(name) + 1
7105440c
YCH
2279 except ValueError:
2280 return None
2281
2282
2283def month_by_abbreviation(abbrev):
2284 """ Return the number of a month by (locale-independently) English
2285 abbreviations """
2286
2287 try:
2288 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
2289 except ValueError:
2290 return None
18258362
JMF
2291
2292
5aafe895 2293def fix_xml_ampersands(xml_str):
18258362 2294 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
2295 return re.sub(
2296 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 2297 '&amp;',
5aafe895 2298 xml_str)
e3946f98
PH
2299
2300
2301def setproctitle(title):
8bf48f23 2302 assert isinstance(title, compat_str)
c1c05c67
YCH
2303
2304 # ctypes in Jython is not complete
2305 # http://bugs.jython.org/issue2148
2306 if sys.platform.startswith('java'):
2307 return
2308
e3946f98 2309 try:
611c1dd9 2310 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
2311 except OSError:
2312 return
2f49bcd6
RC
2313 except TypeError:
2314 # LoadLibrary in Windows Python 2.7.13 only expects
2315 # a bytestring, but since unicode_literals turns
2316 # every string into a unicode string, it fails.
2317 return
6eefe533
PH
2318 title_bytes = title.encode('utf-8')
2319 buf = ctypes.create_string_buffer(len(title_bytes))
2320 buf.value = title_bytes
e3946f98 2321 try:
6eefe533 2322 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
2323 except AttributeError:
2324 return # Strange libc, just skip this
d7dda168
PH
2325
2326
2327def remove_start(s, start):
46bc9b7d 2328 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
2329
2330
2b9faf55 2331def remove_end(s, end):
46bc9b7d 2332 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
2333
2334
31b2051e
S
2335def remove_quotes(s):
2336 if s is None or len(s) < 2:
2337 return s
2338 for quote in ('"', "'", ):
2339 if s[0] == quote and s[-1] == quote:
2340 return s[1:-1]
2341 return s
2342
2343
b6e0c7d2
U
2344def get_domain(url):
2345 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2346 return domain.group('domain') if domain else None
2347
2348
29eb5174 2349def url_basename(url):
9b8aaeed 2350 path = compat_urlparse.urlparse(url).path
28e614de 2351 return path.strip('/').split('/')[-1]
aa94a6d3
PH
2352
2353
02dc0a36
S
2354def base_url(url):
2355 return re.match(r'https?://[^?#&]+/', url).group()
2356
2357
e34c3361 2358def urljoin(base, path):
4b5de77b
S
2359 if isinstance(path, bytes):
2360 path = path.decode('utf-8')
e34c3361
S
2361 if not isinstance(path, compat_str) or not path:
2362 return None
fad4ceb5 2363 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
e34c3361 2364 return path
4b5de77b
S
2365 if isinstance(base, bytes):
2366 base = base.decode('utf-8')
2367 if not isinstance(base, compat_str) or not re.match(
2368 r'^(?:https?:)?//', base):
e34c3361
S
2369 return None
2370 return compat_urlparse.urljoin(base, path)
2371
2372
aa94a6d3
PH
2373class HEADRequest(compat_urllib_request.Request):
2374 def get_method(self):
611c1dd9 2375 return 'HEAD'
7217e148
PH
2376
2377
95cf60e8
S
2378class PUTRequest(compat_urllib_request.Request):
2379 def get_method(self):
2380 return 'PUT'
2381
2382
9732d77e 2383def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
28746fbd
PH
2384 if get_attr:
2385 if v is not None:
2386 v = getattr(v, get_attr, None)
9572013d
PH
2387 if v == '':
2388 v = None
1812afb7
S
2389 if v is None:
2390 return default
2391 try:
2392 return int(v) * invscale // scale
31c49255 2393 except (ValueError, TypeError, OverflowError):
af98f8ff 2394 return default
9732d77e 2395
9572013d 2396
40a90862
JMF
2397def str_or_none(v, default=None):
2398 return default if v is None else compat_str(v)
2399
9732d77e
PH
2400
2401def str_to_int(int_str):
48d4681e 2402 """ A more relaxed version of int_or_none """
42db58ec 2403 if isinstance(int_str, compat_integer_types):
348c6bf1 2404 return int_str
42db58ec
S
2405 elif isinstance(int_str, compat_str):
2406 int_str = re.sub(r'[,\.\+]', '', int_str)
2407 return int_or_none(int_str)
608d11f5
PH
2408
2409
9732d77e 2410def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
2411 if v is None:
2412 return default
2413 try:
2414 return float(v) * invscale / scale
5e1271c5 2415 except (ValueError, TypeError):
caf80631 2416 return default
43f775e4
PH
2417
2418
c7e327c4
S
2419def bool_or_none(v, default=None):
2420 return v if isinstance(v, bool) else default
2421
2422
53cd37ba
S
2423def strip_or_none(v, default=None):
2424 return v.strip() if isinstance(v, compat_str) else default
b72b4431
S
2425
2426
af03000a
S
2427def url_or_none(url):
2428 if not url or not isinstance(url, compat_str):
2429 return None
2430 url = url.strip()
29f7c58a 2431 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
2432
2433
e29663c6 2434def strftime_or_none(timestamp, date_format, default=None):
2435 datetime_object = None
2436 try:
2437 if isinstance(timestamp, compat_numeric_types): # unix timestamp
2438 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2439 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2440 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2441 return datetime_object.strftime(date_format)
2442 except (ValueError, TypeError, AttributeError):
2443 return default
2444
2445
608d11f5 2446def parse_duration(s):
8f9312c3 2447 if not isinstance(s, compat_basestring):
608d11f5 2448 return None
ca7b3246 2449 s = s.strip()
38d79fd1 2450 if not s:
2451 return None
ca7b3246 2452
acaff495 2453 days, hours, mins, secs, ms = [None] * 5
15846398 2454 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
acaff495 2455 if m:
2456 days, hours, mins, secs, ms = m.groups()
2457 else:
2458 m = re.match(
056653bb
S
2459 r'''(?ix)(?:P?
2460 (?:
2461 [0-9]+\s*y(?:ears?)?\s*
2462 )?
2463 (?:
2464 [0-9]+\s*m(?:onths?)?\s*
2465 )?
2466 (?:
2467 [0-9]+\s*w(?:eeks?)?\s*
2468 )?
8f4b58d7 2469 (?:
acaff495 2470 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
8f4b58d7 2471 )?
056653bb 2472 T)?
acaff495 2473 (?:
2474 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
2475 )?
2476 (?:
2477 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
2478 )?
2479 (?:
2480 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 2481 )?Z?$''', s)
acaff495 2482 if m:
2483 days, hours, mins, secs, ms = m.groups()
2484 else:
15846398 2485 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 2486 if m:
2487 hours, mins = m.groups()
2488 else:
2489 return None
2490
2491 duration = 0
2492 if secs:
2493 duration += float(secs)
2494 if mins:
2495 duration += float(mins) * 60
2496 if hours:
2497 duration += float(hours) * 60 * 60
2498 if days:
2499 duration += float(days) * 24 * 60 * 60
2500 if ms:
2501 duration += float(ms)
2502 return duration
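# Examples (illustrative): both colon-separated and "verbose" durations are
# supported; the result is a float number of seconds.
#   >>> parse_duration('1:23:45')
#   5025.0
#   >>> parse_duration('2h 30m')
#   9000.0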
91d7d0b3
JMF
2503
2504
e65e4c88 2505def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 2506 name, real_ext = os.path.splitext(filename)
e65e4c88
S
2507 return (
2508 '{0}.{1}{2}'.format(name, ext, real_ext)
2509 if not expected_real_ext or real_ext[1:] == expected_real_ext
2510 else '{0}.{1}'.format(filename, ext))
d70ad093
PH
2511
2512
b3ed15b7
S
2513def replace_extension(filename, ext, expected_real_ext=None):
2514 name, real_ext = os.path.splitext(filename)
2515 return '{0}.{1}'.format(
2516 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2517 ext)
2518
2519
d70ad093
PH
2520def check_executable(exe, args=[]):
2521 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2522 args can be a list of arguments for a short output (like -version) """
2523 try:
d3c93ec2 2524 Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
d70ad093
PH
2525 except OSError:
2526 return False
2527 return exe
b7ab0590
PH
2528
2529
9af98e17 2530def _get_exe_version_output(exe, args):
95807118 2531 try:
b64d04c1 2532 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
7a5c1cfe 2533 # SIGTTOU if yt-dlp is run in the background.
067aa17e 2534 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
d3c93ec2 2535 out, _ = Popen(
2536 [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2537 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
95807118
PH
2538 except OSError:
2539 return False
cae97f65
PH
2540 if isinstance(out, bytes): # Python 2.x
2541 out = out.decode('ascii', 'ignore')
9af98e17 2542 return out
cae97f65
PH
2543
2544
2545def detect_exe_version(output, version_re=None, unrecognized='present'):
2546 assert isinstance(output, compat_str)
2547 if version_re is None:
2548 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2549 m = re.search(version_re, output)
95807118
PH
2550 if m:
2551 return m.group(1)
2552 else:
2553 return unrecognized
2554
2555
9af98e17 2556def get_exe_version(exe, args=['--version'],
2557 version_re=None, unrecognized='present'):
2558 """ Returns the version of the specified executable,
2559 or False if the executable is not present """
2560 out = _get_exe_version_output(exe, args)
2561 return detect_exe_version(out, version_re, unrecognized) if out else False
2562
2563
cb89cfc1 2564class LazyList(collections.abc.Sequence):
483336e7 2565 ''' Lazy immutable list from an iterable
2566 Note that slices of a LazyList are lists and not LazyList'''
2567
8e5fecc8 2568 class IndexError(IndexError):
2569 pass
2570
282f5709 2571 def __init__(self, iterable, *, reverse=False, _cache=None):
483336e7 2572 self.__iterable = iter(iterable)
282f5709 2573 self.__cache = [] if _cache is None else _cache
2574 self.__reversed = reverse
483336e7 2575
2576 def __iter__(self):
28419ca2 2577 if self.__reversed:
2578 # We need to consume the entire iterable to iterate in reverse
981052c9 2579 yield from self.exhaust()
28419ca2 2580 return
2581 yield from self.__cache
483336e7 2582 for item in self.__iterable:
2583 self.__cache.append(item)
2584 yield item
2585
981052c9 2586 def __exhaust(self):
483336e7 2587 self.__cache.extend(self.__iterable)
9f1a1c36 2588 # Discard the emptied iterable to make it pickle-able
2589 self.__iterable = []
28419ca2 2590 return self.__cache
2591
981052c9 2592 def exhaust(self):
2593 ''' Evaluate the entire iterable '''
2594 return self.__exhaust()[::-1 if self.__reversed else 1]
2595
28419ca2 2596 @staticmethod
981052c9 2597 def __reverse_index(x):
e0f2b4b4 2598 return None if x is None else -(x + 1)
483336e7 2599
2600 def __getitem__(self, idx):
2601 if isinstance(idx, slice):
28419ca2 2602 if self.__reversed:
e0f2b4b4 2603 idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
2604 start, stop, step = idx.start, idx.stop, idx.step or 1
483336e7 2605 elif isinstance(idx, int):
28419ca2 2606 if self.__reversed:
981052c9 2607 idx = self.__reverse_index(idx)
e0f2b4b4 2608 start, stop, step = idx, idx, 0
483336e7 2609 else:
2610 raise TypeError('indices must be integers or slices')
e0f2b4b4 2611 if ((start or 0) < 0 or (stop or 0) < 0
2612 or (start is None and step < 0)
2613 or (stop is None and step > 0)):
483336e7 2614 # We need to consume the entire iterable to be able to slice from the end
2615 # Obviously, never use this with infinite iterables
8e5fecc8 2616 self.__exhaust()
2617 try:
2618 return self.__cache[idx]
2619 except IndexError as e:
2620 raise self.IndexError(e) from e
e0f2b4b4 2621 n = max(start or 0, stop or 0) - len(self.__cache) + 1
28419ca2 2622 if n > 0:
2623 self.__cache.extend(itertools.islice(self.__iterable, n))
8e5fecc8 2624 try:
2625 return self.__cache[idx]
2626 except IndexError as e:
2627 raise self.IndexError(e) from e
483336e7 2628
2629 def __bool__(self):
2630 try:
28419ca2 2631 self[-1] if self.__reversed else self[0]
8e5fecc8 2632 except self.IndexError:
483336e7 2633 return False
2634 return True
2635
2636 def __len__(self):
8e5fecc8 2637 self.__exhaust()
483336e7 2638 return len(self.__cache)
2639
282f5709 2640 def __reversed__(self):
2641 return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)
2642
2643 def __copy__(self):
2644 return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)
2645
28419ca2 2646 def __repr__(self):
2647 # repr and str should mimic a list. So we exhaust the iterable
2648 return repr(self.exhaust())
2649
2650 def __str__(self):
2651 return repr(self.exhaust())
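# Example (illustrative): items are pulled from the iterable only as needed,
# so even an infinite iterator can be sliced.
#   >>> LazyList(itertools.count())[:5]
#   [0, 1, 2, 3, 4]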
2652
483336e7 2653
7be9ccff 2654class PagedList:
c07a39ae 2655
2656 class IndexError(IndexError):
2657 pass
2658
dd26ced1
PH
2659 def __len__(self):
2660 # This is only useful for tests
2661 return len(self.getslice())
2662
7be9ccff 2663 def __init__(self, pagefunc, pagesize, use_cache=True):
2664 self._pagefunc = pagefunc
2665 self._pagesize = pagesize
2666 self._use_cache = use_cache
2667 self._cache = {}
2668
2669 def getpage(self, pagenum):
d8cf8d97 2670 page_results = self._cache.get(pagenum)
2671 if page_results is None:
2672 page_results = list(self._pagefunc(pagenum))
7be9ccff 2673 if self._use_cache:
2674 self._cache[pagenum] = page_results
2675 return page_results
2676
2677 def getslice(self, start=0, end=None):
2678 return list(self._getslice(start, end))
2679
2680 def _getslice(self, start, end):
55575225 2681 raise NotImplementedError('This method must be implemented by subclasses')
2682
2683 def __getitem__(self, idx):
7be9ccff 2684 # NOTE: cache must be enabled if this is used
55575225 2685 if not isinstance(idx, int) or idx < 0:
2686 raise TypeError('indices must be non-negative integers')
2687 entries = self.getslice(idx, idx + 1)
d8cf8d97 2688 if not entries:
c07a39ae 2689 raise self.IndexError()
d8cf8d97 2690 return entries[0]
55575225 2691
9c44d242
PH
2692
2693class OnDemandPagedList(PagedList):
7be9ccff 2694 def _getslice(self, start, end):
b7ab0590
PH
2695 for pagenum in itertools.count(start // self._pagesize):
2696 firstid = pagenum * self._pagesize
2697 nextfirstid = pagenum * self._pagesize + self._pagesize
2698 if start >= nextfirstid:
2699 continue
2700
b7ab0590
PH
2701 startv = (
2702 start % self._pagesize
2703 if firstid <= start < nextfirstid
2704 else 0)
b7ab0590
PH
2705 endv = (
2706 ((end - 1) % self._pagesize) + 1
2707 if (end is not None and firstid <= end <= nextfirstid)
2708 else None)
2709
7be9ccff 2710 page_results = self.getpage(pagenum)
b7ab0590
PH
2711 if startv != 0 or endv is not None:
2712 page_results = page_results[startv:endv]
7be9ccff 2713 yield from page_results
b7ab0590
PH
2714
2715 # A little optimization - if the current page is not "full", i.e. does
2716 # not contain page_size videos, then we can assume that this page
2717 # is the last one - there are no more ids on further pages -
2718 # i.e. no need to query again.
2719 if len(page_results) + startv < self._pagesize:
2720 break
2721
2722 # If we got the whole page, but the next page is not interesting,
2723 # break out early as well
2724 if end == nextfirstid:
2725 break
81c2f20b
PH
2726
2727
9c44d242
PH
2728class InAdvancePagedList(PagedList):
2729 def __init__(self, pagefunc, pagecount, pagesize):
9c44d242 2730 self._pagecount = pagecount
7be9ccff 2731 PagedList.__init__(self, pagefunc, pagesize, True)
9c44d242 2732
7be9ccff 2733 def _getslice(self, start, end):
9c44d242
PH
2734 start_page = start // self._pagesize
2735 end_page = (
2736 self._pagecount if end is None else (end // self._pagesize + 1))
2737 skip_elems = start - start_page * self._pagesize
2738 only_more = None if end is None else end - start
2739 for pagenum in range(start_page, end_page):
7be9ccff 2740 page_results = self.getpage(pagenum)
9c44d242 2741 if skip_elems:
7be9ccff 2742 page_results = page_results[skip_elems:]
9c44d242
PH
2743 skip_elems = None
2744 if only_more is not None:
7be9ccff 2745 if len(page_results) < only_more:
2746 only_more -= len(page_results)
9c44d242 2747 else:
7be9ccff 2748 yield from page_results[:only_more]
9c44d242 2749 break
7be9ccff 2750 yield from page_results
9c44d242
PH
2751
2752
81c2f20b 2753def uppercase_escape(s):
676eb3f2 2754 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 2755 return re.sub(
a612753d 2756 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
2757 lambda m: unicode_escape(m.group(0))[0],
2758 s)
0fe2ff78
YCH
2759
2760
2761def lowercase_escape(s):
2762 unicode_escape = codecs.getdecoder('unicode_escape')
2763 return re.sub(
2764 r'\\u[0-9a-fA-F]{4}',
2765 lambda m: unicode_escape(m.group(0))[0],
2766 s)
b53466e1 2767
d05cfe06
S
2768
2769def escape_rfc3986(s):
2770 """Escape non-ASCII characters as suggested by RFC 3986"""
8f9312c3 2771 if sys.version_info < (3, 0) and isinstance(s, compat_str):
d05cfe06 2772 s = s.encode('utf-8')
ecc0c5ee 2773 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2774
2775
2776def escape_url(url):
2777 """Escape URL as suggested by RFC 3986"""
2778 url_parsed = compat_urllib_parse_urlparse(url)
2779 return url_parsed._replace(
efbed08d 2780 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
2781 path=escape_rfc3986(url_parsed.path),
2782 params=escape_rfc3986(url_parsed.params),
2783 query=escape_rfc3986(url_parsed.query),
2784 fragment=escape_rfc3986(url_parsed.fragment)
2785 ).geturl()
2786
62e609ab 2787
4dfbf869 2788def parse_qs(url):
2789 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2790
2791
62e609ab
PH
2792def read_batch_urls(batch_fd):
2793 def fixup(url):
2794 if not isinstance(url, compat_str):
2795 url = url.decode('utf-8', 'replace')
8c04f0be 2796 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2797 for bom in BOM_UTF8:
2798 if url.startswith(bom):
2799 url = url[len(bom):]
2800 url = url.lstrip()
2801 if not url or url.startswith(('#', ';', ']')):
62e609ab 2802 return False
8c04f0be 2803 # "#" cannot be stripped out since it is part of the URI
2804 # However, it can be safely stripped out if it follows whitespace
2805 return re.split(r'\s#', url, 1)[0].rstrip()
62e609ab
PH
2806
2807 with contextlib.closing(batch_fd) as fd:
2808 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2809
2810
2811def urlencode_postdata(*args, **kargs):
15707c7e 2812 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
2813
2814
38f9ef31 2815def update_url_query(url, query):
cacd9966
YCH
2816 if not query:
2817 return url
38f9ef31 2818 parsed_url = compat_urlparse.urlparse(url)
2819 qs = compat_parse_qs(parsed_url.query)
2820 qs.update(query)
2821 return compat_urlparse.urlunparse(parsed_url._replace(
15707c7e 2822 query=compat_urllib_parse_urlencode(qs, True)))
16392824 2823
8e60dc75 2824
ed0291d1
S
2825def update_Request(req, url=None, data=None, headers={}, query={}):
2826 req_headers = req.headers.copy()
2827 req_headers.update(headers)
2828 req_data = data or req.data
2829 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
2830 req_get_method = req.get_method()
2831 if req_get_method == 'HEAD':
2832 req_type = HEADRequest
2833 elif req_get_method == 'PUT':
2834 req_type = PUTRequest
2835 else:
2836 req_type = compat_urllib_request.Request
ed0291d1
S
2837 new_req = req_type(
2838 req_url, data=req_data, headers=req_headers,
2839 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2840 if hasattr(req, 'timeout'):
2841 new_req.timeout = req.timeout
2842 return new_req
2843
2844
10c87c15 2845def _multipart_encode_impl(data, boundary):
0c265486
YCH
2846 content_type = 'multipart/form-data; boundary=%s' % boundary
2847
2848 out = b''
2849 for k, v in data.items():
2850 out += b'--' + boundary.encode('ascii') + b'\r\n'
2851 if isinstance(k, compat_str):
2852 k = k.encode('utf-8')
2853 if isinstance(v, compat_str):
2854 v = v.encode('utf-8')
2855 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2856 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 2857 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
2858 if boundary.encode('ascii') in content:
2859 raise ValueError('Boundary overlaps with data')
2860 out += content
2861
2862 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2863
2864 return out, content_type
2865
2866
2867def multipart_encode(data, boundary=None):
2868 '''
2869 Encode a dict to RFC 7578-compliant form-data
2870
2871 data:
2872 A dict where keys and values can be either Unicode or bytes-like
2873 objects.
2874 boundary:
2875 If specified a Unicode object, it's used as the boundary. Otherwise
2876 a random boundary is generated.
2877
2878 Reference: https://tools.ietf.org/html/rfc7578
2879 '''
2880 has_specified_boundary = boundary is not None
2881
2882 while True:
2883 if boundary is None:
2884 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2885
2886 try:
10c87c15 2887 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
2888 break
2889 except ValueError:
2890 if has_specified_boundary:
2891 raise
2892 boundary = None
2893
2894 return out, content_type
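# Example (illustrative; a fixed boundary keeps the output deterministic):
#   >>> out, ctype = multipart_encode({'field': 'value'}, boundary='X')
#   >>> ctype
#   'multipart/form-data; boundary=X'
#   >>> out
#   b'--X\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--X--\r\n'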
2895
2896
86296ad2 2897def dict_get(d, key_or_keys, default=None, skip_false_values=True):
cbecc9b9
S
2898 if isinstance(key_or_keys, (list, tuple)):
2899 for key in key_or_keys:
86296ad2
S
2900 if key not in d or d[key] is None or skip_false_values and not d[key]:
2901 continue
2902 return d[key]
cbecc9b9
S
2903 return default
2904 return d.get(key_or_keys, default)
2905
2906
329ca3be 2907def try_get(src, getter, expected_type=None):
6606817a 2908 for get in variadic(getter):
a32a9a7e
S
2909 try:
2910 v = get(src)
2911 except (AttributeError, KeyError, TypeError, IndexError):
2912 pass
2913 else:
2914 if expected_type is None or isinstance(v, expected_type):
2915 return v
329ca3be
S
2916
2917
6cc62232
S
2918def merge_dicts(*dicts):
2919 merged = {}
2920 for a_dict in dicts:
2921 for k, v in a_dict.items():
2922 if v is None:
2923 continue
3089bc74
S
2924 if (k not in merged
2925 or (isinstance(v, compat_str) and v
2926 and isinstance(merged[k], compat_str)
2927 and not merged[k])):
6cc62232
S
2928 merged[k] = v
2929 return merged
2930
2931
8e60dc75
S
2932def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2933 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2934
16392824 2935
a1a530b0
PH
2936US_RATINGS = {
2937 'G': 0,
2938 'PG': 10,
2939 'PG-13': 13,
2940 'R': 16,
2941 'NC': 18,
2942}
fac55558
PH
2943
2944
a8795327 2945TV_PARENTAL_GUIDELINES = {
5a16c9d9
RA
2946 'TV-Y': 0,
2947 'TV-Y7': 7,
2948 'TV-G': 0,
2949 'TV-PG': 0,
2950 'TV-14': 14,
2951 'TV-MA': 17,
a8795327
S
2952}
2953
2954
146c80e2 2955def parse_age_limit(s):
a8795327
S
2956 if type(s) == int:
2957 return s if 0 <= s <= 21 else None
2958 if not isinstance(s, compat_basestring):
d838b1bd 2959 return None
146c80e2 2960 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
2961 if m:
2962 return int(m.group('age'))
5c5fae6d 2963 s = s.upper()
a8795327
S
2964 if s in US_RATINGS:
2965 return US_RATINGS[s]
5a16c9d9 2966 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
b8361187 2967 if m:
5a16c9d9 2968 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
b8361187 2969 return None
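# Examples (illustrative):
#   >>> parse_age_limit('PG-13')
#   13
#   >>> parse_age_limit('TV-MA')
#   17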
146c80e2
S
2970
2971
fac55558 2972def strip_jsonp(code):
609a61e3 2973 return re.sub(
5552c9eb 2974 r'''(?sx)^
e9c671d5 2975 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
5552c9eb
YCH
2976 (?:\s*&&\s*(?P=func_name))?
2977 \s*\(\s*(?P<callback_data>.*)\);?
2978 \s*?(?://[^\n]*)*$''',
2979 r'\g<callback_data>', code)
478c2c61
PH
2980
2981
5c610515 2982def js_to_json(code, vars={}):
2983 # vars is a dict of var, val pairs to substitute
c843e685 2984 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
4195096e
S
2985 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
2986 INTEGER_TABLE = (
2987 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
2988 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
2989 )
2990
e05f6939 2991 def fix_kv(m):
e7b6d122
PH
2992 v = m.group(0)
2993 if v in ('true', 'false', 'null'):
2994 return v
421ddcb8
C
2995 elif v in ('undefined', 'void 0'):
2996 return 'null'
8bdd16b4 2997 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
bd1e4844 2998 return ""
2999
3000 if v[0] in ("'", '"'):
3001 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 3002 '"': '\\"',
bd1e4844 3003 "\\'": "'",
3004 '\\\n': '',
3005 '\\x': '\\u00',
3006 }.get(m.group(0), m.group(0)), v[1:-1])
8bdd16b4 3007 else:
3008 for regex, base in INTEGER_TABLE:
3009 im = re.match(regex, v)
3010 if im:
3011 i = int(im.group(1), base)
3012 return '"%d":' % i if v.endswith(':') else '%d' % i
89ac4a19 3013
5c610515 3014 if v in vars:
3015 return vars[v]
3016
e7b6d122 3017 return '"%s"' % v
e05f6939 3018
bd1e4844 3019 return re.sub(r'''(?sx)
3020 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3021 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4195096e 3022 {comment}|,(?={skip}[\]}}])|
421ddcb8 3023 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
4195096e 3024 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
8bdd16b4 3025 [0-9]+(?={skip}:)|
3026 !+
4195096e 3027 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
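# Example (illustrative): unquoted keys and single-quoted strings are converted
# so the result can be fed to json.loads().
#   >>> js_to_json("{'foo': 'bar', baz: 1}")
#   '{"foo": "bar", "baz": 1}'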
e05f6939
PH
3028
3029
478c2c61
PH
3030def qualities(quality_ids):
3031 """ Get a numeric quality value out of a list of possible values """
3032 def q(qid):
3033 try:
3034 return quality_ids.index(qid)
3035 except ValueError:
3036 return -1
3037 return q
3038
acd69589 3039
ebed8b37 3040POSTPROCESS_WHEN = {'pre_process', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
1e43a6f7 3041
3042
de6000d9 3043DEFAULT_OUTTMPL = {
3044 'default': '%(title)s [%(id)s].%(ext)s',
72755351 3045 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
de6000d9 3046}
3047OUTTMPL_TYPES = {
72755351 3048 'chapter': None,
de6000d9 3049 'subtitle': None,
3050 'thumbnail': None,
3051 'description': 'description',
3052 'annotation': 'annotations.xml',
3053 'infojson': 'info.json',
08438d2c 3054 'link': None,
5112f26a 3055 'pl_thumbnail': None,
de6000d9 3056 'pl_description': 'description',
3057 'pl_infojson': 'info.json',
3058}
0a871f68 3059
143db31d 3060# As of [1] format syntax is:
3061# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3062# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
901130bb 3063STR_FORMAT_RE_TMPL = r'''(?x)
3064 (?<!%)(?P<prefix>(?:%%)*)
143db31d 3065 %
524e2e4f 3066 (?P<has_key>\((?P<key>{0})\))?
752cda38 3067 (?P<format>
524e2e4f 3068 (?P<conversion>[#0\-+ ]+)?
3069 (?P<min_width>\d+)?
3070 (?P<precision>\.\d+)?
3071 (?P<len_mod>[hlL])? # unused in python
901130bb 3072 {1} # conversion type
752cda38 3073 )
143db31d 3074'''
3075
7d1eb38a 3076
901130bb 3077STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3078
7d1eb38a 3079
a020a0dc
PH
3080def limit_length(s, length):
3081 """ Add ellipses to overly long strings """
3082 if s is None:
3083 return None
3084 ELLIPSES = '...'
3085 if len(s) > length:
3086 return s[:length - len(ELLIPSES)] + ELLIPSES
3087 return s
48844745
PH
3088
3089
3090def version_tuple(v):
5f9b8394 3091 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
3092
3093
3094def is_outdated_version(version, limit, assume_new=True):
3095 if not version:
3096 return not assume_new
3097 try:
3098 return version_tuple(version) < version_tuple(limit)
3099 except ValueError:
3100 return not assume_new
732ea2f0
PH
3101
3102
3103def ytdl_is_updateable():
7a5c1cfe 3104 """ Returns if yt-dlp can be updated with -U """
735d865e 3105
5d535b4a 3106 from .update import is_non_updateable
732ea2f0 3107
5d535b4a 3108 return not is_non_updateable()
7d4111ed
PH
3109
3110
3111def args_to_str(args):
3112 # Get a short string representation for a subprocess command
702ccf2d 3113 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
3114
3115
9b9c5355 3116def error_to_compat_str(err):
fdae2358
S
3117 err_str = str(err)
3118 # On python 2 error byte string must be decoded with proper
3119 # encoding rather than ascii
3120 if sys.version_info[0] < 3:
3121 err_str = err_str.decode(preferredencoding())
3122 return err_str
3123
3124
c460bdd5 3125def mimetype2ext(mt):
eb9ee194
S
3126 if mt is None:
3127 return None
3128
9359f3d4
F
3129 mt, _, params = mt.partition(';')
3130 mt = mt.strip()
3131
3132 FULL_MAP = {
765ac263 3133 'audio/mp4': 'm4a',
6c33d24b
YCH
3134 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3135 # it's the most popular one
3136 'audio/mpeg': 'mp3',
ba39289d 3137 'audio/x-wav': 'wav',
9359f3d4
F
3138 'audio/wav': 'wav',
3139 'audio/wave': 'wav',
3140 }
3141
3142 ext = FULL_MAP.get(mt)
765ac263
JMF
3143 if ext is not None:
3144 return ext
3145
9359f3d4 3146 SUBTYPE_MAP = {
f6861ec9 3147 '3gpp': '3gp',
cafcf657 3148 'smptett+xml': 'tt',
cafcf657 3149 'ttaf+xml': 'dfxp',
a0d8d704 3150 'ttml+xml': 'ttml',
f6861ec9 3151 'x-flv': 'flv',
a0d8d704 3152 'x-mp4-fragmented': 'mp4',
d4f05d47 3153 'x-ms-sami': 'sami',
a0d8d704 3154 'x-ms-wmv': 'wmv',
b4173f15
RA
3155 'mpegurl': 'm3u8',
3156 'x-mpegurl': 'm3u8',
3157 'vnd.apple.mpegurl': 'm3u8',
3158 'dash+xml': 'mpd',
b4173f15 3159 'f4m+xml': 'f4m',
f164b971 3160 'hds+xml': 'f4m',
e910fe2f 3161 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 3162 'quicktime': 'mov',
98ce1a3f 3163 'mp2t': 'ts',
39e7107d 3164 'x-wav': 'wav',
9359f3d4
F
3165 'filmstrip+json': 'fs',
3166 'svg+xml': 'svg',
3167 }
3168
3169 _, _, subtype = mt.rpartition('/')
3170 ext = SUBTYPE_MAP.get(subtype.lower())
3171 if ext is not None:
3172 return ext
3173
3174 SUFFIX_MAP = {
3175 'json': 'json',
3176 'xml': 'xml',
3177 'zip': 'zip',
3178 'gzip': 'gz',
3179 }
3180
3181 _, _, suffix = subtype.partition('+')
3182 ext = SUFFIX_MAP.get(suffix)
3183 if ext is not None:
3184 return ext
3185
3186 return subtype.replace('+', '.')
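# Examples (illustrative):
#   >>> mimetype2ext('audio/mp4')
#   'm4a'
#   >>> mimetype2ext('application/x-mpegURL')
#   'm3u8'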
c460bdd5
PH
3187
3188
2814f12b
THD
3189def ext2mimetype(ext_or_url):
3190 if not ext_or_url:
3191 return None
3192 if '.' not in ext_or_url:
3193 ext_or_url = f'file.{ext_or_url}'
3194 return mimetypes.guess_type(ext_or_url)[0]
3195
3196
4f3c5e06 3197def parse_codecs(codecs_str):
3198 # http://tools.ietf.org/html/rfc6381
3199 if not codecs_str:
3200 return {}
a0566bbf 3201 split_codecs = list(filter(None, map(
dbf5416a 3202 str.strip, codecs_str.strip().strip(',').split(','))))
4afa3ec4 3203 vcodec, acodec, tcodec, hdr = None, None, None, None
a0566bbf 3204 for full_codec in split_codecs:
9bd979ca 3205 parts = full_codec.split('.')
3206 codec = parts[0].replace('0', '')
3207 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3208 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
4f3c5e06 3209 if not vcodec:
b69fd25c 3210 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
176f1866 3211 if codec in ('dvh1', 'dvhe'):
3212 hdr = 'DV'
9bd979ca 3213 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3214 hdr = 'HDR10'
3215 elif full_codec.replace('0', '').startswith('vp9.2'):
176f1866 3216 hdr = 'HDR10'
b69fd25c 3217 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
4f3c5e06 3218 if not acodec:
3219 acodec = full_codec
4afa3ec4
F
3220 elif codec in ('stpp', 'wvtt',):
3221 if not tcodec:
3222 tcodec = full_codec
4f3c5e06 3223 else:
60f5c9fb 3224 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
4afa3ec4 3225 if vcodec or acodec or tcodec:
4f3c5e06 3226 return {
3227 'vcodec': vcodec or 'none',
3228 'acodec': acodec or 'none',
176f1866 3229 'dynamic_range': hdr,
4afa3ec4 3230 **({'tcodec': tcodec} if tcodec is not None else {}),
4f3c5e06 3231 }
b69fd25c 3232 elif len(split_codecs) == 2:
3233 return {
3234 'vcodec': split_codecs[0],
3235 'acodec': split_codecs[1],
3236 }
4f3c5e06 3237 return {}
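# Example (illustrative):
#   >>> parse_codecs('avc1.64001e, mp4a.40.2')
#   {'vcodec': 'avc1.64001e', 'acodec': 'mp4a.40.2', 'dynamic_range': None}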
3238
3239
2ccd1b10 3240def urlhandle_detect_ext(url_handle):
79298173 3241 getheader = url_handle.headers.get
2ccd1b10 3242
b55ee18f
PH
3243 cd = getheader('Content-Disposition')
3244 if cd:
3245 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3246 if m:
3247 e = determine_ext(m.group('filename'), default_ext=None)
3248 if e:
3249 return e
3250
c460bdd5 3251 return mimetype2ext(getheader('Content-Type'))
05900629
PH
3252
3253
1e399778
YCH
3254def encode_data_uri(data, mime_type):
3255 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3256
3257
05900629 3258def age_restricted(content_limit, age_limit):
6ec6cb4e 3259 """ Returns True iff the content should be blocked """
05900629
PH
3260
3261 if age_limit is None: # No limit set
3262 return False
3263 if content_limit is None:
3264 return False # Content available for everyone
3265 return age_limit < content_limit
61ca9a80
PH
3266
3267
3268def is_html(first_bytes):
3269 """ Detect whether a file contains HTML by examining its first bytes. """
3270
3271 BOMS = [
3272 (b'\xef\xbb\xbf', 'utf-8'),
3273 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3274 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3275 (b'\xff\xfe', 'utf-16-le'),
3276 (b'\xfe\xff', 'utf-16-be'),
3277 ]
3278 for bom, enc in BOMS:
3279 if first_bytes.startswith(bom):
3280 s = first_bytes[len(bom):].decode(enc, 'replace')
3281 break
3282 else:
3283 s = first_bytes.decode('utf-8', 'replace')
3284
3285 return re.match(r'^\s*<', s)
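# is_html returns a (truthy) match object when the leading bytes look like markup,
# BOMs included:
#   >>> bool(is_html(b'\xef\xbb\xbf<!DOCTYPE html><html></html>'))
#   True
#   >>> bool(is_html(b'{"key": "value"}'))
#   False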
a055469f
PH
3286
3287
3288def determine_protocol(info_dict):
3289 protocol = info_dict.get('protocol')
3290 if protocol is not None:
3291 return protocol
3292
7de837a5 3293 url = sanitize_url(info_dict['url'])
a055469f
PH
3294 if url.startswith('rtmp'):
3295 return 'rtmp'
3296 elif url.startswith('mms'):
3297 return 'mms'
3298 elif url.startswith('rtsp'):
3299 return 'rtsp'
3300
3301 ext = determine_ext(url)
3302 if ext == 'm3u8':
3303 return 'm3u8'
3304 elif ext == 'f4m':
3305 return 'f4m'
3306
3307 return compat_urllib_parse_urlparse(url).scheme
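# Illustrative protocol detection (sanitize_url and determine_ext are helpers
# defined earlier in this module):
#   >>> determine_protocol({'url': 'rtmp://example.com/live'})
#   'rtmp'
#   >>> determine_protocol({'url': 'https://example.com/master.m3u8'})
#   'm3u8'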
cfb56d1a
PH
3308
3309
c5e3f849 3310def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3311 """ Render a list of rows, each as a list of values.
3312 Text after a \t will be right aligned """
ec11a9f4 3313 def width(string):
c5e3f849 3314 return len(remove_terminal_sequences(string).replace('\t', ''))
76d321f6 3315
3316 def get_max_lens(table):
ec11a9f4 3317 return [max(width(str(v)) for v in col) for col in zip(*table)]
76d321f6 3318
3319 def filter_using_list(row, filterArray):
3320 return [col for (take, col) in zip(filterArray, row) if take]
3321
c5e3f849 3322 if hide_empty:
76d321f6 3323 max_lens = get_max_lens(data)
3324 header_row = filter_using_list(header_row, max_lens)
3325 data = [filter_using_list(row, max_lens) for row in data]
3326
cfb56d1a 3327 table = [header_row] + data
76d321f6 3328 max_lens = get_max_lens(table)
c5e3f849 3329 extra_gap += 1
76d321f6 3330 if delim:
c5e3f849 3331 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3332 table[1][-1] = table[1][-1][:-extra_gap] # Remove extra_gap from end of delimiter
ec11a9f4 3333 for row in table:
3334 for pos, text in enumerate(map(str, row)):
c5e3f849 3335 if '\t' in text:
3336 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3337 else:
3338 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3339 ret = '\n'.join(''.join(row).rstrip() for row in table)
ec11a9f4 3340 return ret
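# Illustrative render_table call; each column is padded to its widest cell plus
# one space of gap, and every output line is right-stripped:
#   >>> print(render_table(['id', 'name'], [['1', 'foo'], ['22', 'bar']]))
#   id name
#   1  foo
#   22 bar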
347de493
PH
3341
3342
8f18aca8 3343def _match_one(filter_part, dct, incomplete):
77b87f05 3344 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3345 STRING_OPERATORS = {
3346 '*=': operator.contains,
3347 '^=': lambda attr, value: attr.startswith(value),
3348 '$=': lambda attr, value: attr.endswith(value),
3349 '~=': lambda attr, value: re.search(value, attr),
3350 }
347de493 3351 COMPARISON_OPERATORS = {
a047eeb6 3352 **STRING_OPERATORS,
3353 '<=': operator.le, # "<=" must be defined above "<"
347de493 3354 '<': operator.lt,
347de493 3355 '>=': operator.ge,
a047eeb6 3356 '>': operator.gt,
347de493 3357 '=': operator.eq,
347de493 3358 }
a047eeb6 3359
347de493
PH
3360 operator_rex = re.compile(r'''(?x)\s*
3361 (?P<key>[a-z_]+)
77b87f05 3362 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3363 (?:
a047eeb6 3364 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3365 (?P<strval>.+?)
347de493
PH
3366 )
3367 \s*$
3368 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3369 m = operator_rex.search(filter_part)
3370 if m:
18f96d12 3371 m = m.groupdict()
3372 unnegated_op = COMPARISON_OPERATORS[m['op']]
3373 if m['negation']:
77b87f05
MT
3374 op = lambda attr, value: not unnegated_op(attr, value)
3375 else:
3376 op = unnegated_op
18f96d12 3377 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3378 if m['quote']:
3379 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3380 actual_value = dct.get(m['key'])
3381 numeric_comparison = None
3382 if isinstance(actual_value, compat_numeric_types):
e5a088dc
S
3383 # If the original field is a string and the matching comparison value is
3384 # a number, we should respect the origin of the original field
3385 # and process the comparison value as a string (see
18f96d12 3386 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3387 try:
18f96d12 3388 numeric_comparison = int(comparison_value)
347de493 3389 except ValueError:
18f96d12 3390 numeric_comparison = parse_filesize(comparison_value)
3391 if numeric_comparison is None:
3392 numeric_comparison = parse_filesize(f'{comparison_value}B')
3393 if numeric_comparison is None:
3394 numeric_comparison = parse_duration(comparison_value)
3395 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3396 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3397 if actual_value is None:
18f96d12 3398 return incomplete or m['none_inclusive']
3399 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3400
3401 UNARY_OPERATORS = {
1cc47c66
S
3402 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3403 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493
PH
3404 }
3405 operator_rex = re.compile(r'''(?x)\s*
3406 (?P<op>%s)\s*(?P<key>[a-z_]+)
3407 \s*$
3408 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3409 m = operator_rex.search(filter_part)
3410 if m:
3411 op = UNARY_OPERATORS[m.group('op')]
3412 actual_value = dct.get(m.group('key'))
8f18aca8 3413 if incomplete and actual_value is None:
3414 return True
347de493
PH
3415 return op(actual_value)
3416
3417 raise ValueError('Invalid filter part %r' % filter_part)
3418
3419
8f18aca8 3420def match_str(filter_str, dct, incomplete=False):
3421 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or False.
3422 When incomplete, all conditions pass on missing fields
3423 """
347de493 3424 return all(
8f18aca8 3425 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
a047eeb6 3426 for filter_part in re.split(r'(?<!\\)&', filter_str))
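# Illustrative filters for match_str: '&' joins conditions, and a trailing '?'
# on an operator makes the condition pass when the field is missing:
#   >>> match_str('duration > 60 & like_count >? 100', {'duration': 120})
#   True
#   >>> match_str('!is_live', {'is_live': True})
#   False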
347de493
PH
3427
3428
3429def match_filter_func(filter_str):
8f18aca8 3430 def _match_func(info_dict, *args, **kwargs):
3431 if match_str(filter_str, info_dict, *args, **kwargs):
347de493
PH
3432 return None
3433 else:
3434 video_title = info_dict.get('title', info_dict.get('id', 'video'))
3435 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
3436 return _match_func
91410c9b
PH
3437
3438
bf6427d2
YCH
3439def parse_dfxp_time_expr(time_expr):
3440 if not time_expr:
d631d5f9 3441 return
bf6427d2
YCH
3442
3443 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3444 if mobj:
3445 return float(mobj.group('time_offset'))
3446
db2fe38b 3447 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 3448 if mobj:
db2fe38b 3449 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
3450
3451
c1c924ab 3452def srt_subtitles_timecode(seconds):
aa7785f8 3453 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3454
3455
3456def ass_subtitles_timecode(seconds):
3457 time = timetuple_from_msec(seconds * 1000)
3458 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
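# Illustrative conversions (timetuple_from_msec is defined earlier in this module):
#   >>> parse_dfxp_time_expr('00:01:30.5')
#   90.5
#   >>> srt_subtitles_timecode(90.5)
#   '00:01:30,500'
#   >>> ass_subtitles_timecode(90.5)
#   '0:01:30.50'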
bf6427d2
YCH
3459
3460
3461def dfxp2srt(dfxp_data):
3869028f
YCH
3462 '''
3463 @param dfxp_data A bytes-like object containing DFXP data
3464 @returns A unicode object containing converted SRT data
3465 '''
5b995f71 3466 LEGACY_NAMESPACES = (
3869028f
YCH
3467 (b'http://www.w3.org/ns/ttml', [
3468 b'http://www.w3.org/2004/11/ttaf1',
3469 b'http://www.w3.org/2006/04/ttaf1',
3470 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 3471 ]),
3869028f
YCH
3472 (b'http://www.w3.org/ns/ttml#styling', [
3473 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
3474 ]),
3475 )
3476
3477 SUPPORTED_STYLING = [
3478 'color',
3479 'fontFamily',
3480 'fontSize',
3481 'fontStyle',
3482 'fontWeight',
3483 'textDecoration'
3484 ]
3485
4e335771 3486 _x = functools.partial(xpath_with_ns, ns_map={
261f4730 3487 'xml': 'http://www.w3.org/XML/1998/namespace',
4e335771 3488 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 3489 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 3490 })
bf6427d2 3491
5b995f71
RA
3492 styles = {}
3493 default_style = {}
3494
87de7069 3495 class TTMLPElementParser(object):
5b995f71
RA
3496 _out = ''
3497 _unclosed_elements = []
3498 _applied_styles = []
bf6427d2 3499
2b14cb56 3500 def start(self, tag, attrib):
5b995f71
RA
3501 if tag in (_x('ttml:br'), 'br'):
3502 self._out += '\n'
3503 else:
3504 unclosed_elements = []
3505 style = {}
3506 element_style_id = attrib.get('style')
3507 if default_style:
3508 style.update(default_style)
3509 if element_style_id:
3510 style.update(styles.get(element_style_id, {}))
3511 for prop in SUPPORTED_STYLING:
3512 prop_val = attrib.get(_x('tts:' + prop))
3513 if prop_val:
3514 style[prop] = prop_val
3515 if style:
3516 font = ''
3517 for k, v in sorted(style.items()):
3518 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3519 continue
3520 if k == 'color':
3521 font += ' color="%s"' % v
3522 elif k == 'fontSize':
3523 font += ' size="%s"' % v
3524 elif k == 'fontFamily':
3525 font += ' face="%s"' % v
3526 elif k == 'fontWeight' and v == 'bold':
3527 self._out += '<b>'
3528 unclosed_elements.append('b')
3529 elif k == 'fontStyle' and v == 'italic':
3530 self._out += '<i>'
3531 unclosed_elements.append('i')
3532 elif k == 'textDecoration' and v == 'underline':
3533 self._out += '<u>'
3534 unclosed_elements.append('u')
3535 if font:
3536 self._out += '<font' + font + '>'
3537 unclosed_elements.append('font')
3538 applied_style = {}
3539 if self._applied_styles:
3540 applied_style.update(self._applied_styles[-1])
3541 applied_style.update(style)
3542 self._applied_styles.append(applied_style)
3543 self._unclosed_elements.append(unclosed_elements)
bf6427d2 3544
2b14cb56 3545 def end(self, tag):
5b995f71
RA
3546 if tag not in (_x('ttml:br'), 'br'):
3547 unclosed_elements = self._unclosed_elements.pop()
3548 for element in reversed(unclosed_elements):
3549 self._out += '</%s>' % element
3550 if unclosed_elements and self._applied_styles:
3551 self._applied_styles.pop()
bf6427d2 3552
2b14cb56 3553 def data(self, data):
5b995f71 3554 self._out += data
2b14cb56 3555
3556 def close(self):
5b995f71 3557 return self._out.strip()
2b14cb56 3558
3559 def parse_node(node):
3560 target = TTMLPElementParser()
3561 parser = xml.etree.ElementTree.XMLParser(target=target)
3562 parser.feed(xml.etree.ElementTree.tostring(node))
3563 return parser.close()
bf6427d2 3564
5b995f71
RA
3565 for k, v in LEGACY_NAMESPACES:
3566 for ns in v:
3567 dfxp_data = dfxp_data.replace(ns, k)
3568
3869028f 3569 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 3570 out = []
5b995f71 3571 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
3572
3573 if not paras:
3574 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 3575
5b995f71
RA
3576 repeat = False
3577 while True:
3578 for style in dfxp.findall(_x('.//ttml:style')):
261f4730
RA
3579 style_id = style.get('id') or style.get(_x('xml:id'))
3580 if not style_id:
3581 continue
5b995f71
RA
3582 parent_style_id = style.get('style')
3583 if parent_style_id:
3584 if parent_style_id not in styles:
3585 repeat = True
3586 continue
3587 styles[style_id] = styles[parent_style_id].copy()
3588 for prop in SUPPORTED_STYLING:
3589 prop_val = style.get(_x('tts:' + prop))
3590 if prop_val:
3591 styles.setdefault(style_id, {})[prop] = prop_val
3592 if repeat:
3593 repeat = False
3594 else:
3595 break
3596
3597 for p in ('body', 'div'):
3598 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3599 if ele is None:
3600 continue
3601 style = styles.get(ele.get('style'))
3602 if not style:
3603 continue
3604 default_style.update(style)
3605
bf6427d2 3606 for para, index in zip(paras, itertools.count(1)):
d631d5f9 3607 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 3608 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
3609 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3610 if begin_time is None:
3611 continue
7dff0363 3612 if not end_time:
d631d5f9
YCH
3613 if not dur:
3614 continue
3615 end_time = begin_time + dur
bf6427d2
YCH
3616 out.append('%d\n%s --> %s\n%s\n\n' % (
3617 index,
c1c924ab
YCH
3618 srt_subtitles_timecode(begin_time),
3619 srt_subtitles_timecode(end_time),
bf6427d2
YCH
3620 parse_node(para)))
3621
3622 return ''.join(out)
3623
3624
66e289ba
S
3625def cli_option(params, command_option, param):
3626 param = params.get(param)
98e698f1
RA
3627 if param:
3628 param = compat_str(param)
66e289ba
S
3629 return [command_option, param] if param is not None else []
3630
3631
3632def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3633 param = params.get(param)
5b232f46
S
3634 if param is None:
3635 return []
66e289ba
S
3636 assert isinstance(param, bool)
3637 if separator:
3638 return [command_option + separator + (true_value if param else false_value)]
3639 return [command_option, true_value if param else false_value]
3640
3641
3642def cli_valueless_option(params, command_option, param, expected_value=True):
3643 param = params.get(param)
3644 return [command_option] if param == expected_value else []
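# Illustrative conversions of yt-dlp params into external-binary arguments,
# covering cli_option and cli_bool_option above:
#   >>> cli_option({'proxy': 'socks5://127.0.0.1:1080'}, '--proxy', 'proxy')
#   ['--proxy', 'socks5://127.0.0.1:1080']
#   >>> cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#   ['--no-check-certificate', 'true']
#   >>> cli_bool_option({'nocheckcertificate': True}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '=')
#   ['--check-certificate=false']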
3645
3646
e92caff5 3647def cli_configuration_args(argdict, keys, default=[], use_compat=True):
eab9b2bc 3648 if isinstance(argdict, (list, tuple)): # for backward compatibility
e92caff5 3649 if use_compat:
5b1ecbb3 3650 return argdict
3651 else:
3652 argdict = None
eab9b2bc 3653 if argdict is None:
5b1ecbb3 3654 return default
eab9b2bc 3655 assert isinstance(argdict, dict)
3656
e92caff5 3657 assert isinstance(keys, (list, tuple))
3658 for key_list in keys:
e92caff5 3659 arg_list = list(filter(
3660 lambda x: x is not None,
6606817a 3661 [argdict.get(key.lower()) for key in variadic(key_list)]))
e92caff5 3662 if arg_list:
3663 return [arg for args in arg_list for arg in args]
3664 return default
66e289ba 3665
6251555f 3666
330690a2 3667def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3668 main_key, exe = main_key.lower(), exe.lower()
3669 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3670 keys = [f'{root_key}{k}' for k in (keys or [''])]
3671 if root_key in keys:
3672 if main_key != exe:
3673 keys.append((main_key, exe))
3674 keys.append('default')
3675 else:
3676 use_compat = False
3677 return cli_configuration_args(argdict, keys, default, use_compat)
3678
66e289ba 3679
39672624
YCH
3680class ISO639Utils(object):
3681 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3682 _lang_map = {
3683 'aa': 'aar',
3684 'ab': 'abk',
3685 'ae': 'ave',
3686 'af': 'afr',
3687 'ak': 'aka',
3688 'am': 'amh',
3689 'an': 'arg',
3690 'ar': 'ara',
3691 'as': 'asm',
3692 'av': 'ava',
3693 'ay': 'aym',
3694 'az': 'aze',
3695 'ba': 'bak',
3696 'be': 'bel',
3697 'bg': 'bul',
3698 'bh': 'bih',
3699 'bi': 'bis',
3700 'bm': 'bam',
3701 'bn': 'ben',
3702 'bo': 'bod',
3703 'br': 'bre',
3704 'bs': 'bos',
3705 'ca': 'cat',
3706 'ce': 'che',
3707 'ch': 'cha',
3708 'co': 'cos',
3709 'cr': 'cre',
3710 'cs': 'ces',
3711 'cu': 'chu',
3712 'cv': 'chv',
3713 'cy': 'cym',
3714 'da': 'dan',
3715 'de': 'deu',
3716 'dv': 'div',
3717 'dz': 'dzo',
3718 'ee': 'ewe',
3719 'el': 'ell',
3720 'en': 'eng',
3721 'eo': 'epo',
3722 'es': 'spa',
3723 'et': 'est',
3724 'eu': 'eus',
3725 'fa': 'fas',
3726 'ff': 'ful',
3727 'fi': 'fin',
3728 'fj': 'fij',
3729 'fo': 'fao',
3730 'fr': 'fra',
3731 'fy': 'fry',
3732 'ga': 'gle',
3733 'gd': 'gla',
3734 'gl': 'glg',
3735 'gn': 'grn',
3736 'gu': 'guj',
3737 'gv': 'glv',
3738 'ha': 'hau',
3739 'he': 'heb',
b7acc835 3740 'iw': 'heb', # Replaced by he in 1989 revision
39672624
YCH
3741 'hi': 'hin',
3742 'ho': 'hmo',
3743 'hr': 'hrv',
3744 'ht': 'hat',
3745 'hu': 'hun',
3746 'hy': 'hye',
3747 'hz': 'her',
3748 'ia': 'ina',
3749 'id': 'ind',
b7acc835 3750 'in': 'ind', # Replaced by id in 1989 revision
39672624
YCH
3751 'ie': 'ile',
3752 'ig': 'ibo',
3753 'ii': 'iii',
3754 'ik': 'ipk',
3755 'io': 'ido',
3756 'is': 'isl',
3757 'it': 'ita',
3758 'iu': 'iku',
3759 'ja': 'jpn',
3760 'jv': 'jav',
3761 'ka': 'kat',
3762 'kg': 'kon',
3763 'ki': 'kik',
3764 'kj': 'kua',
3765 'kk': 'kaz',
3766 'kl': 'kal',
3767 'km': 'khm',
3768 'kn': 'kan',
3769 'ko': 'kor',
3770 'kr': 'kau',
3771 'ks': 'kas',
3772 'ku': 'kur',
3773 'kv': 'kom',
3774 'kw': 'cor',
3775 'ky': 'kir',
3776 'la': 'lat',
3777 'lb': 'ltz',
3778 'lg': 'lug',
3779 'li': 'lim',
3780 'ln': 'lin',
3781 'lo': 'lao',
3782 'lt': 'lit',
3783 'lu': 'lub',
3784 'lv': 'lav',
3785 'mg': 'mlg',
3786 'mh': 'mah',
3787 'mi': 'mri',
3788 'mk': 'mkd',
3789 'ml': 'mal',
3790 'mn': 'mon',
3791 'mr': 'mar',
3792 'ms': 'msa',
3793 'mt': 'mlt',
3794 'my': 'mya',
3795 'na': 'nau',
3796 'nb': 'nob',
3797 'nd': 'nde',
3798 'ne': 'nep',
3799 'ng': 'ndo',
3800 'nl': 'nld',
3801 'nn': 'nno',
3802 'no': 'nor',
3803 'nr': 'nbl',
3804 'nv': 'nav',
3805 'ny': 'nya',
3806 'oc': 'oci',
3807 'oj': 'oji',
3808 'om': 'orm',
3809 'or': 'ori',
3810 'os': 'oss',
3811 'pa': 'pan',
3812 'pi': 'pli',
3813 'pl': 'pol',
3814 'ps': 'pus',
3815 'pt': 'por',
3816 'qu': 'que',
3817 'rm': 'roh',
3818 'rn': 'run',
3819 'ro': 'ron',
3820 'ru': 'rus',
3821 'rw': 'kin',
3822 'sa': 'san',
3823 'sc': 'srd',
3824 'sd': 'snd',
3825 'se': 'sme',
3826 'sg': 'sag',
3827 'si': 'sin',
3828 'sk': 'slk',
3829 'sl': 'slv',
3830 'sm': 'smo',
3831 'sn': 'sna',
3832 'so': 'som',
3833 'sq': 'sqi',
3834 'sr': 'srp',
3835 'ss': 'ssw',
3836 'st': 'sot',
3837 'su': 'sun',
3838 'sv': 'swe',
3839 'sw': 'swa',
3840 'ta': 'tam',
3841 'te': 'tel',
3842 'tg': 'tgk',
3843 'th': 'tha',
3844 'ti': 'tir',
3845 'tk': 'tuk',
3846 'tl': 'tgl',
3847 'tn': 'tsn',
3848 'to': 'ton',
3849 'tr': 'tur',
3850 'ts': 'tso',
3851 'tt': 'tat',
3852 'tw': 'twi',
3853 'ty': 'tah',
3854 'ug': 'uig',
3855 'uk': 'ukr',
3856 'ur': 'urd',
3857 'uz': 'uzb',
3858 've': 'ven',
3859 'vi': 'vie',
3860 'vo': 'vol',
3861 'wa': 'wln',
3862 'wo': 'wol',
3863 'xh': 'xho',
3864 'yi': 'yid',
e9a50fba 3865 'ji': 'yid', # Replaced by yi in 1989 revision
39672624
YCH
3866 'yo': 'yor',
3867 'za': 'zha',
3868 'zh': 'zho',
3869 'zu': 'zul',
3870 }
3871
3872 @classmethod
3873 def short2long(cls, code):
3874 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3875 return cls._lang_map.get(code[:2])
3876
3877 @classmethod
3878 def long2short(cls, code):
3879 """Convert language code from ISO 639-2/T to ISO 639-1"""
3880 for short_name, long_name in cls._lang_map.items():
3881 if long_name == code:
3882 return short_name
3883
3884
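# Illustrative ISO639Utils lookups:
#   >>> ISO639Utils.short2long('en')
#   'eng'
#   >>> ISO639Utils.long2short('fra')
#   'fr'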
4eb10f66
YCH
3885class ISO3166Utils(object):
3886 # From http://data.okfn.org/data/core/country-list
3887 _country_map = {
3888 'AF': 'Afghanistan',
3889 'AX': 'Åland Islands',
3890 'AL': 'Albania',
3891 'DZ': 'Algeria',
3892 'AS': 'American Samoa',
3893 'AD': 'Andorra',
3894 'AO': 'Angola',
3895 'AI': 'Anguilla',
3896 'AQ': 'Antarctica',
3897 'AG': 'Antigua and Barbuda',
3898 'AR': 'Argentina',
3899 'AM': 'Armenia',
3900 'AW': 'Aruba',
3901 'AU': 'Australia',
3902 'AT': 'Austria',
3903 'AZ': 'Azerbaijan',
3904 'BS': 'Bahamas',
3905 'BH': 'Bahrain',
3906 'BD': 'Bangladesh',
3907 'BB': 'Barbados',
3908 'BY': 'Belarus',
3909 'BE': 'Belgium',
3910 'BZ': 'Belize',
3911 'BJ': 'Benin',
3912 'BM': 'Bermuda',
3913 'BT': 'Bhutan',
3914 'BO': 'Bolivia, Plurinational State of',
3915 'BQ': 'Bonaire, Sint Eustatius and Saba',
3916 'BA': 'Bosnia and Herzegovina',
3917 'BW': 'Botswana',
3918 'BV': 'Bouvet Island',
3919 'BR': 'Brazil',
3920 'IO': 'British Indian Ocean Territory',
3921 'BN': 'Brunei Darussalam',
3922 'BG': 'Bulgaria',
3923 'BF': 'Burkina Faso',
3924 'BI': 'Burundi',
3925 'KH': 'Cambodia',
3926 'CM': 'Cameroon',
3927 'CA': 'Canada',
3928 'CV': 'Cape Verde',
3929 'KY': 'Cayman Islands',
3930 'CF': 'Central African Republic',
3931 'TD': 'Chad',
3932 'CL': 'Chile',
3933 'CN': 'China',
3934 'CX': 'Christmas Island',
3935 'CC': 'Cocos (Keeling) Islands',
3936 'CO': 'Colombia',
3937 'KM': 'Comoros',
3938 'CG': 'Congo',
3939 'CD': 'Congo, the Democratic Republic of the',
3940 'CK': 'Cook Islands',
3941 'CR': 'Costa Rica',
3942 'CI': 'Côte d\'Ivoire',
3943 'HR': 'Croatia',
3944 'CU': 'Cuba',
3945 'CW': 'Curaçao',
3946 'CY': 'Cyprus',
3947 'CZ': 'Czech Republic',
3948 'DK': 'Denmark',
3949 'DJ': 'Djibouti',
3950 'DM': 'Dominica',
3951 'DO': 'Dominican Republic',
3952 'EC': 'Ecuador',
3953 'EG': 'Egypt',
3954 'SV': 'El Salvador',
3955 'GQ': 'Equatorial Guinea',
3956 'ER': 'Eritrea',
3957 'EE': 'Estonia',
3958 'ET': 'Ethiopia',
3959 'FK': 'Falkland Islands (Malvinas)',
3960 'FO': 'Faroe Islands',
3961 'FJ': 'Fiji',
3962 'FI': 'Finland',
3963 'FR': 'France',
3964 'GF': 'French Guiana',
3965 'PF': 'French Polynesia',
3966 'TF': 'French Southern Territories',
3967 'GA': 'Gabon',
3968 'GM': 'Gambia',
3969 'GE': 'Georgia',
3970 'DE': 'Germany',
3971 'GH': 'Ghana',
3972 'GI': 'Gibraltar',
3973 'GR': 'Greece',
3974 'GL': 'Greenland',
3975 'GD': 'Grenada',
3976 'GP': 'Guadeloupe',
3977 'GU': 'Guam',
3978 'GT': 'Guatemala',
3979 'GG': 'Guernsey',
3980 'GN': 'Guinea',
3981 'GW': 'Guinea-Bissau',
3982 'GY': 'Guyana',
3983 'HT': 'Haiti',
3984 'HM': 'Heard Island and McDonald Islands',
3985 'VA': 'Holy See (Vatican City State)',
3986 'HN': 'Honduras',
3987 'HK': 'Hong Kong',
3988 'HU': 'Hungary',
3989 'IS': 'Iceland',
3990 'IN': 'India',
3991 'ID': 'Indonesia',
3992 'IR': 'Iran, Islamic Republic of',
3993 'IQ': 'Iraq',
3994 'IE': 'Ireland',
3995 'IM': 'Isle of Man',
3996 'IL': 'Israel',
3997 'IT': 'Italy',
3998 'JM': 'Jamaica',
3999 'JP': 'Japan',
4000 'JE': 'Jersey',
4001 'JO': 'Jordan',
4002 'KZ': 'Kazakhstan',
4003 'KE': 'Kenya',
4004 'KI': 'Kiribati',
4005 'KP': 'Korea, Democratic People\'s Republic of',
4006 'KR': 'Korea, Republic of',
4007 'KW': 'Kuwait',
4008 'KG': 'Kyrgyzstan',
4009 'LA': 'Lao People\'s Democratic Republic',
4010 'LV': 'Latvia',
4011 'LB': 'Lebanon',
4012 'LS': 'Lesotho',
4013 'LR': 'Liberia',
4014 'LY': 'Libya',
4015 'LI': 'Liechtenstein',
4016 'LT': 'Lithuania',
4017 'LU': 'Luxembourg',
4018 'MO': 'Macao',
4019 'MK': 'Macedonia, the Former Yugoslav Republic of',
4020 'MG': 'Madagascar',
4021 'MW': 'Malawi',
4022 'MY': 'Malaysia',
4023 'MV': 'Maldives',
4024 'ML': 'Mali',
4025 'MT': 'Malta',
4026 'MH': 'Marshall Islands',
4027 'MQ': 'Martinique',
4028 'MR': 'Mauritania',
4029 'MU': 'Mauritius',
4030 'YT': 'Mayotte',
4031 'MX': 'Mexico',
4032 'FM': 'Micronesia, Federated States of',
4033 'MD': 'Moldova, Republic of',
4034 'MC': 'Monaco',
4035 'MN': 'Mongolia',
4036 'ME': 'Montenegro',
4037 'MS': 'Montserrat',
4038 'MA': 'Morocco',
4039 'MZ': 'Mozambique',
4040 'MM': 'Myanmar',
4041 'NA': 'Namibia',
4042 'NR': 'Nauru',
4043 'NP': 'Nepal',
4044 'NL': 'Netherlands',
4045 'NC': 'New Caledonia',
4046 'NZ': 'New Zealand',
4047 'NI': 'Nicaragua',
4048 'NE': 'Niger',
4049 'NG': 'Nigeria',
4050 'NU': 'Niue',
4051 'NF': 'Norfolk Island',
4052 'MP': 'Northern Mariana Islands',
4053 'NO': 'Norway',
4054 'OM': 'Oman',
4055 'PK': 'Pakistan',
4056 'PW': 'Palau',
4057 'PS': 'Palestine, State of',
4058 'PA': 'Panama',
4059 'PG': 'Papua New Guinea',
4060 'PY': 'Paraguay',
4061 'PE': 'Peru',
4062 'PH': 'Philippines',
4063 'PN': 'Pitcairn',
4064 'PL': 'Poland',
4065 'PT': 'Portugal',
4066 'PR': 'Puerto Rico',
4067 'QA': 'Qatar',
4068 'RE': 'Réunion',
4069 'RO': 'Romania',
4070 'RU': 'Russian Federation',
4071 'RW': 'Rwanda',
4072 'BL': 'Saint Barthélemy',
4073 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4074 'KN': 'Saint Kitts and Nevis',
4075 'LC': 'Saint Lucia',
4076 'MF': 'Saint Martin (French part)',
4077 'PM': 'Saint Pierre and Miquelon',
4078 'VC': 'Saint Vincent and the Grenadines',
4079 'WS': 'Samoa',
4080 'SM': 'San Marino',
4081 'ST': 'Sao Tome and Principe',
4082 'SA': 'Saudi Arabia',
4083 'SN': 'Senegal',
4084 'RS': 'Serbia',
4085 'SC': 'Seychelles',
4086 'SL': 'Sierra Leone',
4087 'SG': 'Singapore',
4088 'SX': 'Sint Maarten (Dutch part)',
4089 'SK': 'Slovakia',
4090 'SI': 'Slovenia',
4091 'SB': 'Solomon Islands',
4092 'SO': 'Somalia',
4093 'ZA': 'South Africa',
4094 'GS': 'South Georgia and the South Sandwich Islands',
4095 'SS': 'South Sudan',
4096 'ES': 'Spain',
4097 'LK': 'Sri Lanka',
4098 'SD': 'Sudan',
4099 'SR': 'Suriname',
4100 'SJ': 'Svalbard and Jan Mayen',
4101 'SZ': 'Swaziland',
4102 'SE': 'Sweden',
4103 'CH': 'Switzerland',
4104 'SY': 'Syrian Arab Republic',
4105 'TW': 'Taiwan, Province of China',
4106 'TJ': 'Tajikistan',
4107 'TZ': 'Tanzania, United Republic of',
4108 'TH': 'Thailand',
4109 'TL': 'Timor-Leste',
4110 'TG': 'Togo',
4111 'TK': 'Tokelau',
4112 'TO': 'Tonga',
4113 'TT': 'Trinidad and Tobago',
4114 'TN': 'Tunisia',
4115 'TR': 'Turkey',
4116 'TM': 'Turkmenistan',
4117 'TC': 'Turks and Caicos Islands',
4118 'TV': 'Tuvalu',
4119 'UG': 'Uganda',
4120 'UA': 'Ukraine',
4121 'AE': 'United Arab Emirates',
4122 'GB': 'United Kingdom',
4123 'US': 'United States',
4124 'UM': 'United States Minor Outlying Islands',
4125 'UY': 'Uruguay',
4126 'UZ': 'Uzbekistan',
4127 'VU': 'Vanuatu',
4128 'VE': 'Venezuela, Bolivarian Republic of',
4129 'VN': 'Viet Nam',
4130 'VG': 'Virgin Islands, British',
4131 'VI': 'Virgin Islands, U.S.',
4132 'WF': 'Wallis and Futuna',
4133 'EH': 'Western Sahara',
4134 'YE': 'Yemen',
4135 'ZM': 'Zambia',
4136 'ZW': 'Zimbabwe',
4137 }
4138
4139 @classmethod
4140 def short2full(cls, code):
4141 """Convert an ISO 3166-2 country code to the corresponding full name"""
4142 return cls._country_map.get(code.upper())
4143
4144
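# Illustrative ISO3166Utils lookup:
#   >>> ISO3166Utils.short2full('DE')
#   'Germany'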
773f291d
S
4145class GeoUtils(object):
4146 # Major IPv4 address blocks per country
4147 _country_ip_map = {
53896ca5 4148 'AD': '46.172.224.0/19',
773f291d
S
4149 'AE': '94.200.0.0/13',
4150 'AF': '149.54.0.0/17',
4151 'AG': '209.59.64.0/18',
4152 'AI': '204.14.248.0/21',
4153 'AL': '46.99.0.0/16',
4154 'AM': '46.70.0.0/15',
4155 'AO': '105.168.0.0/13',
53896ca5
S
4156 'AP': '182.50.184.0/21',
4157 'AQ': '23.154.160.0/24',
773f291d
S
4158 'AR': '181.0.0.0/12',
4159 'AS': '202.70.112.0/20',
53896ca5 4160 'AT': '77.116.0.0/14',
773f291d
S
4161 'AU': '1.128.0.0/11',
4162 'AW': '181.41.0.0/18',
53896ca5
S
4163 'AX': '185.217.4.0/22',
4164 'AZ': '5.197.0.0/16',
773f291d
S
4165 'BA': '31.176.128.0/17',
4166 'BB': '65.48.128.0/17',
4167 'BD': '114.130.0.0/16',
4168 'BE': '57.0.0.0/8',
53896ca5 4169 'BF': '102.178.0.0/15',
773f291d
S
4170 'BG': '95.42.0.0/15',
4171 'BH': '37.131.0.0/17',
4172 'BI': '154.117.192.0/18',
4173 'BJ': '137.255.0.0/16',
53896ca5 4174 'BL': '185.212.72.0/23',
773f291d
S
4175 'BM': '196.12.64.0/18',
4176 'BN': '156.31.0.0/16',
4177 'BO': '161.56.0.0/16',
4178 'BQ': '161.0.80.0/20',
53896ca5 4179 'BR': '191.128.0.0/12',
773f291d
S
4180 'BS': '24.51.64.0/18',
4181 'BT': '119.2.96.0/19',
4182 'BW': '168.167.0.0/16',
4183 'BY': '178.120.0.0/13',
4184 'BZ': '179.42.192.0/18',
4185 'CA': '99.224.0.0/11',
4186 'CD': '41.243.0.0/16',
53896ca5
S
4187 'CF': '197.242.176.0/21',
4188 'CG': '160.113.0.0/16',
773f291d 4189 'CH': '85.0.0.0/13',
53896ca5 4190 'CI': '102.136.0.0/14',
773f291d
S
4191 'CK': '202.65.32.0/19',
4192 'CL': '152.172.0.0/14',
53896ca5 4193 'CM': '102.244.0.0/14',
773f291d
S
4194 'CN': '36.128.0.0/10',
4195 'CO': '181.240.0.0/12',
4196 'CR': '201.192.0.0/12',
4197 'CU': '152.206.0.0/15',
4198 'CV': '165.90.96.0/19',
4199 'CW': '190.88.128.0/17',
53896ca5 4200 'CY': '31.153.0.0/16',
773f291d
S
4201 'CZ': '88.100.0.0/14',
4202 'DE': '53.0.0.0/8',
4203 'DJ': '197.241.0.0/17',
4204 'DK': '87.48.0.0/12',
4205 'DM': '192.243.48.0/20',
4206 'DO': '152.166.0.0/15',
4207 'DZ': '41.96.0.0/12',
4208 'EC': '186.68.0.0/15',
4209 'EE': '90.190.0.0/15',
4210 'EG': '156.160.0.0/11',
4211 'ER': '196.200.96.0/20',
4212 'ES': '88.0.0.0/11',
4213 'ET': '196.188.0.0/14',
4214 'EU': '2.16.0.0/13',
4215 'FI': '91.152.0.0/13',
4216 'FJ': '144.120.0.0/16',
53896ca5 4217 'FK': '80.73.208.0/21',
773f291d
S
4218 'FM': '119.252.112.0/20',
4219 'FO': '88.85.32.0/19',
4220 'FR': '90.0.0.0/9',
4221 'GA': '41.158.0.0/15',
4222 'GB': '25.0.0.0/8',
4223 'GD': '74.122.88.0/21',
4224 'GE': '31.146.0.0/16',
4225 'GF': '161.22.64.0/18',
4226 'GG': '62.68.160.0/19',
53896ca5
S
4227 'GH': '154.160.0.0/12',
4228 'GI': '95.164.0.0/16',
773f291d
S
4229 'GL': '88.83.0.0/19',
4230 'GM': '160.182.0.0/15',
4231 'GN': '197.149.192.0/18',
4232 'GP': '104.250.0.0/19',
4233 'GQ': '105.235.224.0/20',
4234 'GR': '94.64.0.0/13',
4235 'GT': '168.234.0.0/16',
4236 'GU': '168.123.0.0/16',
4237 'GW': '197.214.80.0/20',
4238 'GY': '181.41.64.0/18',
4239 'HK': '113.252.0.0/14',
4240 'HN': '181.210.0.0/16',
4241 'HR': '93.136.0.0/13',
4242 'HT': '148.102.128.0/17',
4243 'HU': '84.0.0.0/14',
4244 'ID': '39.192.0.0/10',
4245 'IE': '87.32.0.0/12',
4246 'IL': '79.176.0.0/13',
4247 'IM': '5.62.80.0/20',
4248 'IN': '117.192.0.0/10',
4249 'IO': '203.83.48.0/21',
4250 'IQ': '37.236.0.0/14',
4251 'IR': '2.176.0.0/12',
4252 'IS': '82.221.0.0/16',
4253 'IT': '79.0.0.0/10',
4254 'JE': '87.244.64.0/18',
4255 'JM': '72.27.0.0/17',
4256 'JO': '176.29.0.0/16',
53896ca5 4257 'JP': '133.0.0.0/8',
773f291d
S
4258 'KE': '105.48.0.0/12',
4259 'KG': '158.181.128.0/17',
4260 'KH': '36.37.128.0/17',
4261 'KI': '103.25.140.0/22',
4262 'KM': '197.255.224.0/20',
53896ca5 4263 'KN': '198.167.192.0/19',
773f291d
S
4264 'KP': '175.45.176.0/22',
4265 'KR': '175.192.0.0/10',
4266 'KW': '37.36.0.0/14',
4267 'KY': '64.96.0.0/15',
4268 'KZ': '2.72.0.0/13',
4269 'LA': '115.84.64.0/18',
4270 'LB': '178.135.0.0/16',
53896ca5 4271 'LC': '24.92.144.0/20',
773f291d
S
4272 'LI': '82.117.0.0/19',
4273 'LK': '112.134.0.0/15',
53896ca5 4274 'LR': '102.183.0.0/16',
773f291d
S
4275 'LS': '129.232.0.0/17',
4276 'LT': '78.56.0.0/13',
4277 'LU': '188.42.0.0/16',
4278 'LV': '46.109.0.0/16',
4279 'LY': '41.252.0.0/14',
4280 'MA': '105.128.0.0/11',
4281 'MC': '88.209.64.0/18',
4282 'MD': '37.246.0.0/16',
4283 'ME': '178.175.0.0/17',
4284 'MF': '74.112.232.0/21',
4285 'MG': '154.126.0.0/17',
4286 'MH': '117.103.88.0/21',
4287 'MK': '77.28.0.0/15',
4288 'ML': '154.118.128.0/18',
4289 'MM': '37.111.0.0/17',
4290 'MN': '49.0.128.0/17',
4291 'MO': '60.246.0.0/16',
4292 'MP': '202.88.64.0/20',
4293 'MQ': '109.203.224.0/19',
4294 'MR': '41.188.64.0/18',
4295 'MS': '208.90.112.0/22',
4296 'MT': '46.11.0.0/16',
4297 'MU': '105.16.0.0/12',
4298 'MV': '27.114.128.0/18',
53896ca5 4299 'MW': '102.70.0.0/15',
773f291d
S
4300 'MX': '187.192.0.0/11',
4301 'MY': '175.136.0.0/13',
4302 'MZ': '197.218.0.0/15',
4303 'NA': '41.182.0.0/16',
4304 'NC': '101.101.0.0/18',
4305 'NE': '197.214.0.0/18',
4306 'NF': '203.17.240.0/22',
4307 'NG': '105.112.0.0/12',
4308 'NI': '186.76.0.0/15',
4309 'NL': '145.96.0.0/11',
4310 'NO': '84.208.0.0/13',
4311 'NP': '36.252.0.0/15',
4312 'NR': '203.98.224.0/19',
4313 'NU': '49.156.48.0/22',
4314 'NZ': '49.224.0.0/14',
4315 'OM': '5.36.0.0/15',
4316 'PA': '186.72.0.0/15',
4317 'PE': '186.160.0.0/14',
4318 'PF': '123.50.64.0/18',
4319 'PG': '124.240.192.0/19',
4320 'PH': '49.144.0.0/13',
4321 'PK': '39.32.0.0/11',
4322 'PL': '83.0.0.0/11',
4323 'PM': '70.36.0.0/20',
4324 'PR': '66.50.0.0/16',
4325 'PS': '188.161.0.0/16',
4326 'PT': '85.240.0.0/13',
4327 'PW': '202.124.224.0/20',
4328 'PY': '181.120.0.0/14',
4329 'QA': '37.210.0.0/15',
53896ca5 4330 'RE': '102.35.0.0/16',
773f291d 4331 'RO': '79.112.0.0/13',
53896ca5 4332 'RS': '93.86.0.0/15',
773f291d 4333 'RU': '5.136.0.0/13',
53896ca5 4334 'RW': '41.186.0.0/16',
773f291d
S
4335 'SA': '188.48.0.0/13',
4336 'SB': '202.1.160.0/19',
4337 'SC': '154.192.0.0/11',
53896ca5 4338 'SD': '102.120.0.0/13',
773f291d 4339 'SE': '78.64.0.0/12',
53896ca5 4340 'SG': '8.128.0.0/10',
773f291d
S
4341 'SI': '188.196.0.0/14',
4342 'SK': '78.98.0.0/15',
53896ca5 4343 'SL': '102.143.0.0/17',
773f291d
S
4344 'SM': '89.186.32.0/19',
4345 'SN': '41.82.0.0/15',
53896ca5 4346 'SO': '154.115.192.0/18',
773f291d
S
4347 'SR': '186.179.128.0/17',
4348 'SS': '105.235.208.0/21',
4349 'ST': '197.159.160.0/19',
4350 'SV': '168.243.0.0/16',
4351 'SX': '190.102.0.0/20',
4352 'SY': '5.0.0.0/16',
4353 'SZ': '41.84.224.0/19',
4354 'TC': '65.255.48.0/20',
4355 'TD': '154.68.128.0/19',
4356 'TG': '196.168.0.0/14',
4357 'TH': '171.96.0.0/13',
4358 'TJ': '85.9.128.0/18',
4359 'TK': '27.96.24.0/21',
4360 'TL': '180.189.160.0/20',
4361 'TM': '95.85.96.0/19',
4362 'TN': '197.0.0.0/11',
4363 'TO': '175.176.144.0/21',
4364 'TR': '78.160.0.0/11',
4365 'TT': '186.44.0.0/15',
4366 'TV': '202.2.96.0/19',
4367 'TW': '120.96.0.0/11',
4368 'TZ': '156.156.0.0/14',
53896ca5
S
4369 'UA': '37.52.0.0/14',
4370 'UG': '102.80.0.0/13',
4371 'US': '6.0.0.0/8',
773f291d 4372 'UY': '167.56.0.0/13',
53896ca5 4373 'UZ': '84.54.64.0/18',
773f291d 4374 'VA': '212.77.0.0/19',
53896ca5 4375 'VC': '207.191.240.0/21',
773f291d 4376 'VE': '186.88.0.0/13',
53896ca5 4377 'VG': '66.81.192.0/20',
773f291d
S
4378 'VI': '146.226.0.0/16',
4379 'VN': '14.160.0.0/11',
4380 'VU': '202.80.32.0/20',
4381 'WF': '117.20.32.0/21',
4382 'WS': '202.4.32.0/19',
4383 'YE': '134.35.0.0/16',
4384 'YT': '41.242.116.0/22',
4385 'ZA': '41.0.0.0/11',
53896ca5
S
4386 'ZM': '102.144.0.0/13',
4387 'ZW': '102.177.192.0/18',
773f291d
S
4388 }
4389
4390 @classmethod
5f95927a
S
4391 def random_ipv4(cls, code_or_block):
4392 if len(code_or_block) == 2:
4393 block = cls._country_ip_map.get(code_or_block.upper())
4394 if not block:
4395 return None
4396 else:
4397 block = code_or_block
773f291d
S
4398 addr, preflen = block.split('/')
4399 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4400 addr_max = addr_min | (0xffffffff >> int(preflen))
18a0defa 4401 return compat_str(socket.inet_ntoa(
4248dad9 4402 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
4403
4404
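# Illustrative GeoUtils use: pass either a two-letter country code or an explicit
# CIDR block; the result is a random address within that block (e.g. somewhere
# in 53.0.0.0/8 for 'DE'):
#   >>> GeoUtils.random_ipv4('DE')              # random address in 53.0.0.0/8
#   >>> GeoUtils.random_ipv4('192.168.0.0/16')  # random address in the given block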
91410c9b 4405class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2461f79d
PH
4406 def __init__(self, proxies=None):
4407 # Set default handlers
4408 for type in ('http', 'https'):
4409 setattr(self, '%s_open' % type,
4410 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4411 meth(r, proxy, type))
38e87f6c 4412 compat_urllib_request.ProxyHandler.__init__(self, proxies)
2461f79d 4413
91410c9b 4414 def proxy_open(self, req, proxy, type):
2461f79d 4415 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
4416 if req_proxy is not None:
4417 proxy = req_proxy
2461f79d
PH
4418 del req.headers['Ytdl-request-proxy']
4419
4420 if proxy == '__noproxy__':
4421 return None # No Proxy
51fb4995 4422 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188 4423 req.add_header('Ytdl-socks-proxy', proxy)
7a5c1cfe 4424 # yt-dlp's http/https handlers do the wrapping of the socket with SOCKS
71aff188 4425 return None
91410c9b
PH
4426 return compat_urllib_request.ProxyHandler.proxy_open(
4427 self, req, proxy, type)
5bc880b9
YCH
4428
4429
0a5445dd
YCH
4430# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4431# released into Public Domain
4432# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4433
4434def long_to_bytes(n, blocksize=0):
4435 """long_to_bytes(n:long, blocksize:int) : string
4436 Convert a long integer to a byte string.
4437
4438 If optional blocksize is given and greater than zero, pad the front of the
4439 byte string with binary zeros so that the length is a multiple of
4440 blocksize.
4441 """
4442 # after much testing, this algorithm was deemed to be the fastest
4443 s = b''
4444 n = int(n)
4445 while n > 0:
4446 s = compat_struct_pack('>I', n & 0xffffffff) + s
4447 n = n >> 32
4448 # strip off leading zeros
4449 for i in range(len(s)):
4450 if s[i] != b'\000'[0]:
4451 break
4452 else:
4453 # only happens when n == 0
4454 s = b'\000'
4455 i = 0
4456 s = s[i:]
4457 # add back some pad bytes. this could be done more efficiently w.r.t. the
4458 # de-padding being done above, but sigh...
4459 if blocksize > 0 and len(s) % blocksize:
4460 s = (blocksize - len(s) % blocksize) * b'\000' + s
4461 return s
4462
4463
4464def bytes_to_long(s):
4465 """bytes_to_long(string) : long
4466 Convert a byte string to a long integer.
4467
4468 This is (essentially) the inverse of long_to_bytes().
4469 """
4470 acc = 0
4471 length = len(s)
4472 if length % 4:
4473 extra = (4 - length % 4)
4474 s = b'\000' * extra + s
4475 length = length + extra
4476 for i in range(0, length, 4):
4477 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4478 return acc
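# Illustrative round trip between the two helpers:
#   >>> long_to_bytes(65537)
#   b'\x01\x00\x01'
#   >>> bytes_to_long(b'\x01\x00\x01')
#   65537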
4479
4480
5bc880b9
YCH
4481def ohdave_rsa_encrypt(data, exponent, modulus):
4482 '''
4483 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4484
4485 Input:
4486 data: data to encrypt, bytes-like object
4487 exponent, modulus: parameter e and N of RSA algorithm, both integer
4488 Output: hex string of encrypted data
4489
4490 Limitation: supports one block encryption only
4491 '''
4492
4493 payload = int(binascii.hexlify(data[::-1]), 16)
4494 encrypted = pow(payload, exponent, modulus)
4495 return '%x' % encrypted
81bdc8fd
YCH
4496
4497
f48409c7
YCH
4498def pkcs1pad(data, length):
4499 """
4500 Padding input data with PKCS#1 scheme
4501
4502 @param {int[]} data input data
4503 @param {int} length target length
4504 @returns {int[]} padded data
4505 """
4506 if len(data) > length - 11:
4507 raise ValueError('Input data too long for PKCS#1 padding')
4508
4509 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4510 return [0, 2] + pseudo_random + [0] + data
4511
4512
5eb6bdce 4513def encode_base_n(num, n, table=None):
59f898b7 4514 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
59f898b7
YCH
4515 if not table:
4516 table = FULL_TABLE[:n]
4517
5eb6bdce
YCH
4518 if n > len(table):
4519 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4520
4521 if num == 0:
4522 return table[0]
4523
81bdc8fd
YCH
4524 ret = ''
4525 while num:
4526 ret = table[num % n] + ret
4527 num = num // n
4528 return ret
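# Illustrative conversions with the default table:
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> encode_base_n(5, 2)
#   '101'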
f52354a8
YCH
4529
4530
4531def decode_packed_codes(code):
06b3fe29 4532 mobj = re.search(PACKED_CODES_RE, code)
a0566bbf 4533 obfuscated_code, base, count, symbols = mobj.groups()
f52354a8
YCH
4534 base = int(base)
4535 count = int(count)
4536 symbols = symbols.split('|')
4537 symbol_table = {}
4538
4539 while count:
4540 count -= 1
5eb6bdce 4541 base_n_count = encode_base_n(count, base)
f52354a8
YCH
4542 symbol_table[base_n_count] = symbols[count] or base_n_count
4543
4544 return re.sub(
4545 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
a0566bbf 4546 obfuscated_code)
e154c651 4547
4548
1ced2221
S
4549def caesar(s, alphabet, shift):
4550 if shift == 0:
4551 return s
4552 l = len(alphabet)
4553 return ''.join(
4554 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4555 for c in s)
4556
4557
4558def rot47(s):
4559 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
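# Illustrative calls; rot47 shifts within the printable ASCII range and, since
# the alphabet has 94 characters, applying it twice restores the input:
#   >>> caesar('abc', 'abcdefghijklmnopqrstuvwxyz', 1)
#   'bcd'
#   >>> rot47(rot47('secret')) == 'secret'
#   True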
4560
4561
e154c651 4562def parse_m3u8_attributes(attrib):
4563 info = {}
4564 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4565 if val.startswith('"'):
4566 val = val[1:-1]
4567 info[key] = val
4568 return info
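# Illustrative attribute-list parsing (quoted values keep their inner commas):
#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.64001f"')
#   {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.64001f'}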
1143535d
YCH
4569
4570
4571def urshift(val, n):
4572 return val >> n if val >= 0 else (val + 0x100000000) >> n
d3f8e038
YCH
4573
4574
4575# Based on png2str() written by @gdkchan and improved by @yokrysty
067aa17e 4576# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
d3f8e038
YCH
4577def decode_png(png_data):
4578 # Reference: https://www.w3.org/TR/PNG/
4579 header = png_data[8:]
4580
4581 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4582 raise IOError('Not a valid PNG file.')
4583
4584 int_map = {1: '>B', 2: '>H', 4: '>I'}
4585 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4586
4587 chunks = []
4588
4589 while header:
4590 length = unpack_integer(header[:4])
4591 header = header[4:]
4592
4593 chunk_type = header[:4]
4594 header = header[4:]
4595
4596 chunk_data = header[:length]
4597 header = header[length:]
4598
4599 header = header[4:] # Skip CRC
4600
4601 chunks.append({
4602 'type': chunk_type,
4603 'length': length,
4604 'data': chunk_data
4605 })
4606
4607 ihdr = chunks[0]['data']
4608
4609 width = unpack_integer(ihdr[:4])
4610 height = unpack_integer(ihdr[4:8])
4611
4612 idat = b''
4613
4614 for chunk in chunks:
4615 if chunk['type'] == b'IDAT':
4616 idat += chunk['data']
4617
4618 if not idat:
4619 raise IOError('Unable to read PNG data.')
4620
4621 decompressed_data = bytearray(zlib.decompress(idat))
4622
4623 stride = width * 3
4624 pixels = []
4625
4626 def _get_pixel(idx):
4627 x = idx % stride
4628 y = idx // stride
4629 return pixels[y][x]
4630
4631 for y in range(height):
4632 basePos = y * (1 + stride)
4633 filter_type = decompressed_data[basePos]
4634
4635 current_row = []
4636
4637 pixels.append(current_row)
4638
4639 for x in range(stride):
4640 color = decompressed_data[1 + basePos + x]
4641 basex = y * stride + x
4642 left = 0
4643 up = 0
4644
4645 if x > 2:
4646 left = _get_pixel(basex - 3)
4647 if y > 0:
4648 up = _get_pixel(basex - stride)
4649
4650 if filter_type == 1: # Sub
4651 color = (color + left) & 0xff
4652 elif filter_type == 2: # Up
4653 color = (color + up) & 0xff
4654 elif filter_type == 3: # Average
4655 color = (color + ((left + up) >> 1)) & 0xff
4656 elif filter_type == 4: # Paeth
4657 a = left
4658 b = up
4659 c = 0
4660
4661 if x > 2 and y > 0:
4662 c = _get_pixel(basex - stride - 3)
4663
4664 p = a + b - c
4665
4666 pa = abs(p - a)
4667 pb = abs(p - b)
4668 pc = abs(p - c)
4669
4670 if pa <= pb and pa <= pc:
4671 color = (color + a) & 0xff
4672 elif pb <= pc:
4673 color = (color + b) & 0xff
4674 else:
4675 color = (color + c) & 0xff
4676
4677 current_row.append(color)
4678
4679 return width, height, pixels
efa97bdc
YCH
4680
4681
4682def write_xattr(path, key, value):
4683 # This mess below finds the best xattr tool for the job
4684 try:
4685 # try the pyxattr module...
4686 import xattr
4687
53a7e3d2
YCH
4688 if hasattr(xattr, 'set'): # pyxattr
4689 # Unicode arguments are not supported in python-pyxattr until
4690 # version 0.5.0
067aa17e 4691 # See https://github.com/ytdl-org/youtube-dl/issues/5498
53a7e3d2
YCH
4692 pyxattr_required_version = '0.5.0'
4693 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
4694 # TODO: fallback to CLI tools
4695 raise XAttrUnavailableError(
4696 'python-pyxattr is detected but is too old. '
7a5c1cfe 4697 'yt-dlp requires %s or above while your version is %s. '
53a7e3d2
YCH
4698 'Falling back to other xattr implementations' % (
4699 pyxattr_required_version, xattr.__version__))
4700
4701 setxattr = xattr.set
4702 else: # xattr
4703 setxattr = xattr.setxattr
efa97bdc
YCH
4704
4705 try:
53a7e3d2 4706 setxattr(path, key, value)
efa97bdc
YCH
4707 except EnvironmentError as e:
4708 raise XAttrMetadataError(e.errno, e.strerror)
4709
4710 except ImportError:
4711 if compat_os_name == 'nt':
4712 # Write xattrs to NTFS Alternate Data Streams:
4713 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4714 assert ':' not in key
4715 assert os.path.exists(path)
4716
4717 ads_fn = path + ':' + key
4718 try:
4719 with open(ads_fn, 'wb') as f:
4720 f.write(value)
4721 except EnvironmentError as e:
4722 raise XAttrMetadataError(e.errno, e.strerror)
4723 else:
4724 user_has_setfattr = check_executable('setfattr', ['--version'])
4725 user_has_xattr = check_executable('xattr', ['-h'])
4726
4727 if user_has_setfattr or user_has_xattr:
4728
4729 value = value.decode('utf-8')
4730 if user_has_setfattr:
4731 executable = 'setfattr'
4732 opts = ['-n', key, '-v', value]
4733 elif user_has_xattr:
4734 executable = 'xattr'
4735 opts = ['-w', key, value]
4736
3089bc74
S
4737 cmd = ([encodeFilename(executable, True)]
4738 + [encodeArgument(o) for o in opts]
4739 + [encodeFilename(path, True)])
efa97bdc
YCH
4740
4741 try:
d3c93ec2 4742 p = Popen(
efa97bdc
YCH
4743 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4744 except EnvironmentError as e:
4745 raise XAttrMetadataError(e.errno, e.strerror)
d3c93ec2 4746 stdout, stderr = p.communicate_or_kill()
efa97bdc
YCH
4747 stderr = stderr.decode('utf-8', 'replace')
4748 if p.returncode != 0:
4749 raise XAttrMetadataError(p.returncode, stderr)
4750
4751 else:
4752 # On Unix, but couldn't find pyxattr, setfattr, or xattr.
4753 if sys.platform.startswith('linux'):
4754 raise XAttrUnavailableError(
4755 "Couldn't find a tool to set the xattrs. "
4756 "Install either the python 'pyxattr' or 'xattr' "
4757 "modules, or the GNU 'attr' package "
4758 "(which contains the 'setfattr' tool).")
4759 else:
4760 raise XAttrUnavailableError(
4761 "Couldn't find a tool to set the xattrs. "
4762 "Install either the python 'xattr' module, "
4763 "or the 'xattr' binary.")
0c265486
YCH
4764
4765
4766def random_birthday(year_field, month_field, day_field):
aa374bc7
AS
4767 start_date = datetime.date(1950, 1, 1)
4768 end_date = datetime.date(1995, 12, 31)
4769 offset = random.randint(0, (end_date - start_date).days)
4770 random_date = start_date + datetime.timedelta(offset)
0c265486 4771 return {
aa374bc7
AS
4772 year_field: str(random_date.year),
4773 month_field: str(random_date.month),
4774 day_field: str(random_date.day),
0c265486 4775 }
732044af 4776
c76eb41b 4777
732044af 4778# Templates for internet shortcut files, which are plain text files.
4779DOT_URL_LINK_TEMPLATE = '''
4780[InternetShortcut]
4781URL=%(url)s
4782'''.lstrip()
4783
4784DOT_WEBLOC_LINK_TEMPLATE = '''
4785<?xml version="1.0" encoding="UTF-8"?>
4786<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4787<plist version="1.0">
4788<dict>
4789\t<key>URL</key>
4790\t<string>%(url)s</string>
4791</dict>
4792</plist>
4793'''.lstrip()
4794
4795DOT_DESKTOP_LINK_TEMPLATE = '''
4796[Desktop Entry]
4797Encoding=UTF-8
4798Name=%(filename)s
4799Type=Link
4800URL=%(url)s
4801Icon=text-html
4802'''.lstrip()
4803
08438d2c 4804LINK_TEMPLATES = {
4805 'url': DOT_URL_LINK_TEMPLATE,
4806 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4807 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4808}
4809
732044af 4810
4811def iri_to_uri(iri):
4812 """
4813 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4814
4815 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4816 """
4817
4818 iri_parts = compat_urllib_parse_urlparse(iri)
4819
4820 if '[' in iri_parts.netloc:
4821 raise ValueError('IPv6 URIs are not yet supported.')
4822 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4823
4824 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4825
4826 net_location = ''
4827 if iri_parts.username:
4828 net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
4829 if iri_parts.password is not None:
4830 net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
4831 net_location += '@'
4832
4833 net_location += iri_parts.hostname.encode('idna').decode('utf-8') # Punycode for Unicode hostnames.
4834 # The 'idna' encoding produces ASCII text.
4835 if iri_parts.port is not None and iri_parts.port != 80:
4836 net_location += ':' + str(iri_parts.port)
4837
4838 return compat_urllib_parse_urlunparse(
4839 (iri_parts.scheme,
4840 net_location,
4841
4842 compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
4843
4844 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
4845 compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
4846
4847 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
4848 compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
4849
4850 compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4851
4852 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
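# Illustrative conversion (hostname via IDNA, path percent-encoded as UTF-8):
#   >>> iri_to_uri('http://example.com/ä')
#   'http://example.com/%C3%A4'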
4853
4854
4855def to_high_limit_path(path):
4856 if sys.platform in ['win32', 'cygwin']:
4857 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
4858 return r'\\?\ '.rstrip() + os.path.abspath(path)
4859
4860 return path
76d321f6 4861
c76eb41b 4862
b868936c 4863def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
4864 if field is None:
4865 val = obj if obj is not None else default
4866 else:
4867 val = obj.get(field, default)
76d321f6 4868 if func and val not in ignore:
4869 val = func(val)
4870 return template % val if val not in ignore else default
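# Illustrative formatting; values in `ignore` (None and '' by default) fall back
# to `default` without going through the template:
#   >>> format_field({'like_count': 100}, 'like_count', '%s likes')
#   '100 likes'
#   >>> format_field({}, 'like_count', '%s likes')
#   ''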
00dd0cd5 4871
4872
4873def clean_podcast_url(url):
4874 return re.sub(r'''(?x)
4875 (?:
4876 (?:
4877 chtbl\.com/track|
4878 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
4879 play\.podtrac\.com
4880 )/[^/]+|
4881 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4882 flex\.acast\.com|
4883 pd(?:
4884 cn\.co| # https://podcorn.com/analytics-prefix/
4885 st\.fm # https://podsights.com/docs/
4886 )/e
4887 )/''', '', url)
ffcb8191
THD
4888
4889
4890_HEX_TABLE = '0123456789abcdef'
4891
4892
4893def random_uuidv4():
4894 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 4895
4896
4897def make_dir(path, to_screen=None):
4898 try:
4899 dn = os.path.dirname(path)
4900 if dn and not os.path.exists(dn):
4901 os.makedirs(dn)
4902 return True
4903 except (OSError, IOError) as err:
4904 if callable(to_screen):
4905 to_screen('unable to create directory ' + error_to_compat_str(err))
4906 return False
f74980cb 4907
4908
4909def get_executable_path():
c552ae88 4910 from zipimport import zipimporter
4911 if hasattr(sys, 'frozen'): # Running from PyInstaller
4912 path = os.path.dirname(sys.executable)
4913 elif isinstance(globals().get('__loader__'), zipimporter): # Running from ZIP
4914 path = os.path.join(os.path.dirname(__file__), '../..')
4915 else:
4916 path = os.path.join(os.path.dirname(__file__), '..')
f74980cb 4917 return os.path.abspath(path)
4918
4919
2f567473 4920def load_plugins(name, suffix, namespace):
3ae5e797 4921 classes = {}
f74980cb 4922 try:
019a94f7
ÁS
4923 plugins_spec = importlib.util.spec_from_file_location(
4924 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
4925 plugins = importlib.util.module_from_spec(plugins_spec)
4926 sys.modules[plugins_spec.name] = plugins
4927 plugins_spec.loader.exec_module(plugins)
f74980cb 4928 for name in dir(plugins):
2f567473 4929 if name in namespace:
4930 continue
4931 if not name.endswith(suffix):
f74980cb 4932 continue
4933 klass = getattr(plugins, name)
3ae5e797 4934 classes[name] = namespace[name] = klass
019a94f7 4935 except FileNotFoundError:
f74980cb 4936 pass
f74980cb 4937 return classes
06167fbb 4938
4939
325ebc17 4940def traverse_obj(
352d63fd 4941 obj, *path_list, default=None, expected_type=None, get_all=True,
325ebc17 4942 casesense=True, is_user_input=False, traverse_string=False):
324ad820 4943 ''' Traverse nested list/dict/tuple
8f334380 4944 @param path_list A list of paths which are checked one by one.
4945 Each path is a list of keys where each key is a string,
1797b073 4946 a function, a tuple of strings/None or "...".
2614f646 4947 When a fuction is given, it takes the key as argument and
4948 returns whether the key matches or not. When a tuple is given,
8f334380 4949 all the keys given in the tuple are traversed, and
4950 "..." traverses all the keys in the object
1797b073 4951 "None" returns the object without traversal
325ebc17 4952 @param default Default value to return
352d63fd 4953 @param expected_type Only accept final value of this type (Can also be any callable)
4954 @param get_all Return all the values obtained from a path or only the first one
324ad820 4955 @param casesense Whether to consider dictionary keys as case sensitive
4956 @param is_user_input Whether the keys are generated from user input. If True,
4957 strings are converted to int/slice if necessary
4958 @param traverse_string Whether to traverse inside strings. If True, any
4959 non-compatible object will also be converted into a string
8f334380 4960 # TODO: Write tests
324ad820 4961 '''
325ebc17 4962 if not casesense:
dbf5416a 4963 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
8f334380 4964 path_list = (map(_lower, variadic(path)) for path in path_list)
4965
4966 def _traverse_obj(obj, path, _current_depth=0):
4967 nonlocal depth
4968 path = tuple(variadic(path))
4969 for i, key in enumerate(path):
1797b073 4970 if None in (key, obj):
4971 return obj
8f334380 4972 if isinstance(key, (list, tuple)):
4973 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
4974 key = ...
4975 if key is ...:
4976 obj = (obj.values() if isinstance(obj, dict)
4977 else obj if isinstance(obj, (list, tuple, LazyList))
4978 else str(obj) if traverse_string else [])
4979 _current_depth += 1
4980 depth = max(depth, _current_depth)
4981 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
2614f646 4982 elif callable(key):
4983 if isinstance(obj, (list, tuple, LazyList)):
4984 obj = enumerate(obj)
4985 elif isinstance(obj, dict):
4986 obj = obj.items()
4987 else:
4988 if not traverse_string:
4989 return None
4990 obj = str(obj)
4991 _current_depth += 1
4992 depth = max(depth, _current_depth)
4993 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
575e17a1 4994 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
325ebc17 4995 obj = (obj.get(key) if casesense or (key in obj)
4996 else next((v for k, v in obj.items() if _lower(k) == key), None))
4997 else:
4998 if is_user_input:
4999 key = (int_or_none(key) if ':' not in key
5000 else slice(*map(int_or_none, key.split(':'))))
8f334380 5001 if key == slice(None):
575e17a1 5002 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
325ebc17 5003 if not isinstance(key, (int, slice)):
9fea350f 5004 return None
8f334380 5005 if not isinstance(obj, (list, tuple, LazyList)):
325ebc17 5006 if not traverse_string:
5007 return None
5008 obj = str(obj)
5009 try:
5010 obj = obj[key]
5011 except IndexError:
324ad820 5012 return None
325ebc17 5013 return obj
5014
352d63fd 5015 if isinstance(expected_type, type):
5016 type_test = lambda val: val if isinstance(val, expected_type) else None
5017 elif expected_type is not None:
5018 type_test = expected_type
5019 else:
5020 type_test = lambda val: val
5021
8f334380 5022 for path in path_list:
5023 depth = 0
5024 val = _traverse_obj(obj, path)
325ebc17 5025 if val is not None:
8f334380 5026 if depth:
5027 for _ in range(depth - 1):
6586bca9 5028 val = itertools.chain.from_iterable(v for v in val if v is not None)
352d63fd 5029 val = [v for v in map(type_test, val) if v is not None]
8f334380 5030 if val:
352d63fd 5031 return val if get_all else val[0]
5032 else:
5033 val = type_test(val)
5034 if val is not None:
8f334380 5035 return val
325ebc17 5036 return default
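# Illustrative traversals of a nested structure (see the docstring above):
#   >>> obj = {'a': [{'b': 1}, {'b': 2}]}
#   >>> traverse_obj(obj, ('a', 0, 'b'))
#   1
#   >>> traverse_obj(obj, ('a', ..., 'b'))
#   [1, 2]
#   >>> traverse_obj(obj, ('a', 5, 'b'), default='missing')
#   'missing'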
324ad820 5037
5038
ee8dd27a 5039# Deprecated
324ad820 5040def traverse_dict(dictn, keys, casesense=True):
ee8dd27a 5041 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5042 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5043 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
6606817a 5044
5045
4b4b7f74 5046def variadic(x, allowed_types=(str, bytes, dict)):
cb89cfc1 5047 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
bd50a52b
THD
5048
5049
49fa4d9a
N
5050# create a JSON Web Signature (jws) with HS256 algorithm
5051# the resulting format is in JWS Compact Serialization
5052# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5053# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5054def jwt_encode_hs256(payload_data, key, headers={}):
5055 header_data = {
5056 'alg': 'HS256',
5057 'typ': 'JWT',
5058 }
5059 if headers:
5060 header_data.update(headers)
5061 header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
5062 payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
5063 h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
5064 signature_b64 = base64.b64encode(h.digest())
5065 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5066 return token
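# Illustrative call; the result is the three dot-separated base64 segments of a
# JWS Compact Serialization, as bytes (the exact value depends on the payload and key):
#   >>> jwt_encode_hs256({'uid': 1}, 'secret-key')  # b'<header>.<payload>.<signature>'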
819e0531 5067
5068
16b0d7e6 5069# Can be extended in the future to verify the signature and parse the header, returning the algorithm used if it's not HS256
5070def jwt_decode_hs256(jwt):
5071 header_b64, payload_b64, signature_b64 = jwt.split('.')
5072 payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))  # re-add padding that may have been stripped; superfluous '='s are ignored
5073 return payload_data
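# For instance, round-tripping with the encoder above (invented values):
#   jwt_decode_hs256(jwt_encode_hs256({'a': 1}, 'secret').decode('utf-8'))  # {'a': 1}
# Only the payload is returned; the signature is not verified.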
5074
5075
819e0531 5076def supports_terminal_sequences(stream):
5077 if compat_os_name == 'nt':
e3c7d495 5078 from .compat import WINDOWS_VT_MODE # Must be imported locally
5079 if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
819e0531 5080 return False
5081 elif not os.getenv('TERM'):
5082 return False
5083 try:
5084 return stream.isatty()
5085 except BaseException:
5086 return False
5087
5088
ec11a9f4 5089_terminal_sequences_re = re.compile('\033\\[[^m]+m')
5090
5091
5092def remove_terminal_sequences(string):
5093 return _terminal_sequences_re.sub('', string)
5094
5095
5096def number_of_digits(number):
5097 return len('%d' % number)
34921b43 5098
5099
5100def join_nonempty(*values, delim='-', from_dict=None):
5101 if from_dict is not None:
c586f9e8 5102 values = map(from_dict.get, values)
34921b43 5103 return delim.join(map(str, filter(None, values)))
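# Falsy values (None, '', 0) are dropped and everything else is stringified, e.g. (invented values):
#   join_nonempty('mp4', None, '', 1080)                              # 'mp4-1080'
#   join_nonempty('acodec', 'vcodec', from_dict={'acodec': 'opus'})   # 'opus'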
06e57990 5104
5105
5106class Config:
5107 own_args = None
5108 filename = None
5109 __initialized = False
5110
5111 def __init__(self, parser, label=None):
5112 self._parser, self.label = parser, label
5113 self._loaded_paths, self.configs = set(), []
5114
5115 def init(self, args=None, filename=None):
5116 assert not self.__initialized
5117 if filename:
5118 location = os.path.realpath(filename)
5119 if location in self._loaded_paths:
5120 return False
5121 self._loaded_paths.add(location)
5122
5123 self.__initialized = True
5124 self.own_args, self.filename = args, filename
5125 for location in self._parser.parse_args(args)[0].config_locations or []:
5126 location = compat_expanduser(location)
5127 if os.path.isdir(location):
5128 location = os.path.join(location, 'yt-dlp.conf')
5129 if not os.path.exists(location):
5130 self._parser.error(f'config location {location} does not exist')
5131 self.append_config(self.read_file(location), location)
5132 return True
5133
5134 def __str__(self):
5135 label = join_nonempty(
5136 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5137 delim=' ')
5138 return join_nonempty(
5139 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5140 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5141 delim='\n')
5142
5143 @staticmethod
5144 def read_file(filename, default=[]):
5145 try:
5146 optionf = open(filename)
5147 except IOError:
5148 return default # silently skip if file is not present
5149 try:
5150 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5151 contents = optionf.read()
5152 if sys.version_info < (3,):
5153 contents = contents.decode(preferredencoding())
5154 res = compat_shlex_split(contents, comments=True)
5155 finally:
5156 optionf.close()
5157 return res
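# A config file is plain shell-like tokens with '#' comments; invented example contents:
#   # always write metadata
#   --add-metadata
#   -o "%(title)s.%(ext)s"
# read_file() shlex-splits that into ['--add-metadata', '-o', '%(title)s.%(ext)s'].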
5158
5159 @staticmethod
5160 def hide_login_info(opts):
5161 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5162 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5163
5164 def _scrub_eq(o):
5165 m = eqre.match(o)
5166 if m:
5167 return m.group('key') + '=PRIVATE'
5168 else:
5169 return o
5170
5171 opts = list(map(_scrub_eq, opts))
5172 for idx, opt in enumerate(opts):
5173 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5174 opts[idx + 1] = 'PRIVATE'
5175 return opts
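# e.g. (invented argv): hide_login_info(['-u', 'me@example.com', '--password=hunter2'])
#   -> ['-u', 'PRIVATE', '--password=PRIVATE']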
5176
5177 def append_config(self, *args, label=None):
5178 config = type(self)(self._parser, label)
5179 config._loaded_paths = self._loaded_paths
5180 if config.init(*args):
5181 self.configs.append(config)
5182
5183 @property
5184 def all_args(self):
5185 for config in reversed(self.configs):
5186 yield from config.all_args
5187 yield from self.own_args or []
5188
5189 def parse_args(self):
5190 return self._parser.parse_args(list(self.all_args))
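# Rough sketch of the intended flow; `parser` stands for the optparse parser built in
# yt_dlp/options.py and the file path is invented:
#   main = Config(parser, label='Main')
#   main.init(sys.argv[1:])
#   main.append_config(Config.read_file('/etc/yt-dlp.conf'), '/etc/yt-dlp.conf', label='System')
#   opts, args = main.parse_args()  # own_args come last in all_args, so CLI options generally win
# Config locations listed inside a file are picked up recursively by init(), and the
# shared _loaded_paths set keeps the same file from being loaded twice.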