]> jfr.im git - yt-dlp.git/blame - yt_dlp/utils.py
[MainStreaming] Add extractor (#2180)
[yt-dlp.git] / yt_dlp / utils.py
CommitLineData
cc52de43 1#!/usr/bin/env python3
dcdb292f 2# coding: utf-8
d77c3dfd 3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
1e399778 6import base64
5bc880b9 7import binascii
912b38b4 8import calendar
676eb3f2 9import codecs
c380cc28 10import collections
62e609ab 11import contextlib
e3946f98 12import ctypes
c496ca96
PH
13import datetime
14import email.utils
0c265486 15import email.header
f45c185f 16import errno
be4a824d 17import functools
d77c3dfd 18import gzip
49fa4d9a
N
19import hashlib
20import hmac
019a94f7 21import importlib.util
03f9daab 22import io
79a2e94e 23import itertools
f4bfd65f 24import json
d77c3dfd 25import locale
02dbf93f 26import math
347de493 27import operator
d77c3dfd 28import os
c496ca96 29import platform
773f291d 30import random
d77c3dfd 31import re
c496ca96 32import socket
79a2e94e 33import ssl
1c088fa8 34import subprocess
d77c3dfd 35import sys
181c8655 36import tempfile
c380cc28 37import time
01951dda 38import traceback
bcf89ce6 39import xml.etree.ElementTree
d77c3dfd 40import zlib
2814f12b 41import mimetypes
d77c3dfd 42
8c25f81b 43from .compat import (
b4a3d461 44 compat_HTMLParseError,
8bb56eee 45 compat_HTMLParser,
201c1459 46 compat_HTTPError,
8f9312c3 47 compat_basestring,
8c25f81b 48 compat_chr,
1bab3437 49 compat_cookiejar,
d7cd9a9e 50 compat_ctypes_WINFUNCTYPE,
36e6f62c 51 compat_etree_fromstring,
51098426 52 compat_expanduser,
8c25f81b 53 compat_html_entities,
55b2f099 54 compat_html_entities_html5,
be4a824d 55 compat_http_client,
42db58ec 56 compat_integer_types,
e29663c6 57 compat_numeric_types,
c86b6142 58 compat_kwargs,
efa97bdc 59 compat_os_name,
8c25f81b 60 compat_parse_qs,
06e57990 61 compat_shlex_split,
702ccf2d 62 compat_shlex_quote,
8c25f81b 63 compat_str,
edaa23f8 64 compat_struct_pack,
d3f8e038 65 compat_struct_unpack,
8c25f81b
PH
66 compat_urllib_error,
67 compat_urllib_parse,
15707c7e 68 compat_urllib_parse_urlencode,
8c25f81b 69 compat_urllib_parse_urlparse,
732044af 70 compat_urllib_parse_urlunparse,
71 compat_urllib_parse_quote,
72 compat_urllib_parse_quote_plus,
7581bfc9 73 compat_urllib_parse_unquote_plus,
8c25f81b
PH
74 compat_urllib_request,
75 compat_urlparse,
810c10ba 76 compat_xpath,
8c25f81b 77)
4644ac55 78
71aff188
YCH
79from .socks import (
80 ProxyType,
81 sockssocket,
82)
83
4644ac55 84
51fb4995
YCH
def register_socks_protocols():
    """Register the SOCKS proxy schemes with the URL parser.

    URLs whose scheme is missing from ``uses_netloc`` are not split
    correctly on old interpreters (https://bugs.python.org/issue7904), so
    every SOCKS scheme that is not listed yet gets appended.
    """
    netloc_schemes = compat_urlparse.uses_netloc
    netloc_schemes.extend([
        scheme for scheme in ('socks', 'socks4', 'socks4a', 'socks5')
        if scheme not in netloc_schemes])
92
93
468e2e92
FV
94# This is not clearly defined otherwise
95compiled_regex_type = type(re.compile(''))
96
f7a147e3
S
97
def random_user_agent():
    """Return a Chrome-on-Windows User-Agent string with a randomly chosen Chrome version."""
    chrome_versions = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    template = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    # Exactly one random.choice() call, so the consumption of the global
    # random state is unchanged.
    picked_version = random.choice(chrome_versions)
    return template % picked_version
141
142
3e669f36 143std_headers = {
f7a147e3 144 'User-Agent': random_user_agent(),
59ae15a5
PH
145 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
146 'Accept-Encoding': 'gzip, deflate',
147 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 148}
f427df17 149
5f6a1245 150
fb37eb25
S
151USER_AGENTS = {
152 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
153}
154
155
bf42a990
S
156NO_DEFAULT = object()
157
7105440c
YCH
158ENGLISH_MONTH_NAMES = [
159 'January', 'February', 'March', 'April', 'May', 'June',
160 'July', 'August', 'September', 'October', 'November', 'December']
161
f6717dec
S
162MONTH_NAMES = {
163 'en': ENGLISH_MONTH_NAMES,
164 'fr': [
3e4185c3
S
165 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
166 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
f6717dec 167}
a942d6cb 168
a7aaa398
S
169KNOWN_EXTENSIONS = (
170 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
171 'flv', 'f4v', 'f4a', 'f4b',
172 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
173 'mkv', 'mka', 'mk3d',
174 'avi', 'divx',
175 'mov',
176 'asf', 'wmv', 'wma',
177 '3gp', '3g2',
178 'mp3',
179 'flac',
180 'ape',
181 'wav',
182 'f4f', 'f4m', 'm3u8', 'smil')
183
c587cbb7 184# needed for sanitizing filenames in restricted mode
c8827027 185ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
fd35d8cd
JW
186 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
187 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))
c587cbb7 188
46f59e89
S
189DATE_FORMATS = (
190 '%d %B %Y',
191 '%d %b %Y',
192 '%B %d %Y',
cb655f34
S
193 '%B %dst %Y',
194 '%B %dnd %Y',
9d30c213 195 '%B %drd %Y',
cb655f34 196 '%B %dth %Y',
46f59e89 197 '%b %d %Y',
cb655f34
S
198 '%b %dst %Y',
199 '%b %dnd %Y',
9d30c213 200 '%b %drd %Y',
cb655f34 201 '%b %dth %Y',
46f59e89
S
202 '%b %dst %Y %I:%M',
203 '%b %dnd %Y %I:%M',
9d30c213 204 '%b %drd %Y %I:%M',
46f59e89
S
205 '%b %dth %Y %I:%M',
206 '%Y %m %d',
207 '%Y-%m-%d',
bccdbd22 208 '%Y.%m.%d.',
46f59e89 209 '%Y/%m/%d',
81c13222 210 '%Y/%m/%d %H:%M',
46f59e89 211 '%Y/%m/%d %H:%M:%S',
1931a55e
THD
212 '%Y%m%d%H%M',
213 '%Y%m%d%H%M%S',
4f3fa23e 214 '%Y%m%d',
0c1c6f4b 215 '%Y-%m-%d %H:%M',
46f59e89
S
216 '%Y-%m-%d %H:%M:%S',
217 '%Y-%m-%d %H:%M:%S.%f',
5014558a 218 '%Y-%m-%d %H:%M:%S:%f',
46f59e89
S
219 '%d.%m.%Y %H:%M',
220 '%d.%m.%Y %H.%M',
221 '%Y-%m-%dT%H:%M:%SZ',
222 '%Y-%m-%dT%H:%M:%S.%fZ',
223 '%Y-%m-%dT%H:%M:%S.%f0Z',
224 '%Y-%m-%dT%H:%M:%S',
225 '%Y-%m-%dT%H:%M:%S.%f',
226 '%Y-%m-%dT%H:%M',
c6eed6b8
S
227 '%b %d %Y at %H:%M',
228 '%b %d %Y at %H:%M:%S',
b555ae9b
S
229 '%B %d %Y at %H:%M',
230 '%B %d %Y at %H:%M:%S',
a63d9bd0 231 '%H:%M %d-%b-%Y',
46f59e89
S
232)
233
234DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
235DATE_FORMATS_DAY_FIRST.extend([
236 '%d-%m-%Y',
237 '%d.%m.%Y',
238 '%d.%m.%y',
239 '%d/%m/%Y',
240 '%d/%m/%y',
241 '%d/%m/%Y %H:%M:%S',
242])
243
244DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
245DATE_FORMATS_MONTH_FIRST.extend([
246 '%m-%d-%Y',
247 '%m.%d.%Y',
248 '%m/%d/%Y',
249 '%m/%d/%y',
250 '%m/%d/%Y %H:%M:%S',
251])
252
06b3fe29 253PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
22f5f5c6 254JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
06b3fe29 255
7105440c 256
def preferredencoding():
    """Get preferred encoding.

    Returns locale.getpreferredencoding() if that codec is able to encode
    a test string, and 'UTF-8' if looking it up or using it fails.
    """
    try:
        candidate = locale.getpreferredencoding()
        # Probe the codec: an unknown or broken name raises here
        'TEST'.encode(candidate)
    except Exception:
        return 'UTF-8'
    return candidate
d77c3dfd 270
f4bfd65f 271
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is written to a temporary '<basename>.*.tmp' file in the same
    directory as fn, which is then renamed over fn. If anything fails the
    temporary file is removed and the exception is re-raised.
    """

    fn = encodeFilename(fn)
    legacy_python = sys.version_info < (3, 0)

    if legacy_python and sys.platform != 'win32':
        fs_encoding = get_filesystem_encoding()

        # os.path.basename/dirname return bytes objects here, but
        # NamedTemporaryFile will fail if the filename contains non ascii
        # characters unless it is given unicode objects
        def path_basename(f):
            return os.path.basename(fn).decode(fs_encoding)

        def path_dirname(f):
            return os.path.dirname(fn).decode(fs_encoding)
    else:
        path_basename, path_dirname = os.path.basename, os.path.dirname

    tmp_options = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if legacy_python:
        tmp_options['mode'] = 'wb'
    else:
        tmp_options['mode'] = 'w'
        tmp_options['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(tmp_options))

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        # Best effort: give the result the permissions a regular file
        # would get under the current umask (umask can only be read by
        # setting it, hence the set-and-restore pair)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise
330
331
def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val]

    Returns the first element below node that matches xpath and carries the
    attribute key (with value val, unless val is None), or None if there is
    no such element.

    The former ``sys.version_info >= (2, 7)`` switch with its Python 2.6
    fallback was removed: this module already relies on Python 3 syntax,
    so the fallback could never be selected.
    """
    # key is interpolated into the XPath expression, so keep it to plain names
    assert re.match(r'^[a-zA-Z_-]+$', key)
    if val is None:
        predicate = '[@%s]' % key
    else:
        predicate = "[@%s='%s']" % (key, val)
    return node.find(xpath + predicate)
346
d7e66d39
JMF
347# On python2.6 the xml.etree.ElementTree.Element methods don't support
348# the namespace parameter
5f6a1245
JW
349
350
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path into '{namespace-uri}tag' form.

    ns_map maps each prefix to its namespace URI; steps without a prefix
    are kept unchanged.
    """
    def _expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(_expand(step) for step in path.split('/'))
361
d77c3dfd 362
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element below node matching xpath.

    xpath is either a single expression or an iterable of expressions that
    are tried in order. When nothing matches: default is returned if one
    was given; otherwise ExtractorError is raised if fatal is set;
    otherwise None is returned.

    An empty iterable of expressions is treated as "nothing matched"
    (previously it raised UnboundLocalError).
    """
    candidates = [xpath] if isinstance(xpath, (str, compat_str)) else xpath

    n = None
    for xp in candidates:
        n = node.find(compat_xpath(xp))
        if n is not None:
            break

    if n is not None:
        return n
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element %s' % name)
    return None
384
385
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the element found by xpath_element().

    A missing element is handled by xpath_element(). An element without
    text yields default if one was given, raises ExtractorError if fatal
    is set, and yields None otherwise.
    """
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    text = n.text
    if text is not None:
        return text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
a41fb80c
S
399
400
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath that has it.

    When no such element exists: default is returned if one was given;
    otherwise ExtractorError is raised if fatal is set; otherwise None.
    """
    n = find_xpath_attr(node, xpath, key)
    if n is not None:
        return n.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
bf0ff932
PH
412
413
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An ID lookup is simply an attribute lookup on "id"
    content = get_element_by_attribute('id', id, html)
    return content
43e8fafd 417
12ea2f30 418
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    matches = get_elements_by_class(class_name, html)
    if not matches:
        return None
    return matches[0]
423
424
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag whose attribute matches value, or None"""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    if not matches:
        return None
    return matches[0]
428
429
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # Match class_name as a whole word anywhere inside the class attribute
    class_value_re = r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name)
    return get_elements_by_attribute('class', class_value_re, html, escape_value=False)
435
436
def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of all tags with the specified attribute in the passed HTML document as a list

    value is matched literally unless escape_value is false, in which case
    it is used as a regular expression fragment.
    """

    if escape_value:
        value = re.escape(value)

    element_re = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value)

    def _content_of(mobj):
        content = mobj.group('content')
        # Content starting with a quote has its first and last character dropped
        if content.startswith(('"', "'")):
            content = content[1:-1]
        return unescapeHTML(content)

    return [_content_of(mobj) for mobj in re.finditer(element_re, html)]
a921f407 460
c5229f39 461
8bb56eee
BF
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        # Attributes of the most recently seen start tag
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Remember the attributes of this start tag, replacing any earlier ones."""
        self.attrs = dict(attrs)
471
c5229f39 472
73673ccf
FF
class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        # One attribute dict per <li> seen at nesting level 0
        self.items = []
        # Number of start tags seen minus number of end tags seen
        self._level = 0

    def handle_starttag(self, tag, attrs):
        """Collect the attributes of top-level <li> tags and track nesting depth."""
        is_top_level_item = tag == 'li' and self._level == 0
        if is_top_level_item:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        """Leave one nesting level."""
        self._level -= 1
488
489
8bb56eee
BF
def extract_attributes(html_element):
    """Decode the attributes of a single HTML element into a dictionary.

    Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    the result is
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    # Older Python may throw HTMLParseError in case of malformed HTML;
    # whatever was parsed up to that point is still returned
    with contextlib.suppress(compat_HTMLParseError):
        parser.feed(html_element)
        parser.close()
    return parser.attrs
9e6dd238 514
c5229f39 515
73673ccf
FF
def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list with one dictionary of attributes per element"""
    li_parser = HTMLListAttrsParser()
    li_parser.feed(webpage)
    li_parser.close()
    items = li_parser.items
    return items
523
524
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Literal newlines carry no meaning in HTML; <br> and paragraph
    # boundaries become the real line breaks
    text = html.replace('\n', ' ')
    substitutions = (
        (r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip the remaining html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
540
541
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    '-' selects standard output (its binary buffer when available). If
    opening any other name fails for a reason other than EACCES, the name
    is passed through sanitize_path() and opened once more; if sanitizing
    does not change the name, the original error is re-raised.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            stdout_stream = getattr(sys.stdout, 'buffer', sys.stdout)
            return (stdout_stream, filename)
        return (open(encodeFilename(filename), open_mode), filename)
    except (IOError, OSError) as err:
        if err.errno == errno.EACCES:
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller
        return (open(encodeFilename(alt_filename), open_mode), alt_filename)
d77c3dfd
FV
572
573
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 581
5f6a1245 582
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible (the cosmetic clean-up of the result is then skipped).
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '\n' and not restricted:
            return ' '
        codepoint = ord(char)
        if char == '?' or codepoint < 32 or codepoint == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or codepoint > 127):
            return '_'
        return char

    if s == '':
        return ''
    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(char) for char in s)
    if is_id:
        return result

    # Collapse runs of underscores and drop them from both ends
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
    return result or '_'
d77c3dfd 626
5f6a1245 627
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows

    On other platforms s is returned unchanged unless force is set, in
    which case the same sanitization is applied without a drive/UNC part.
    Within each path component (except '.' and '..') the characters
    / < > : " | \\ ? * and a trailing whitespace character or dot are
    replaced with '#'.

    Changes from the previous version:
    - an empty path with force=True no longer raises IndexError,
    - the Python < 2.7 os.path.splitunc() fallback is gone (this module
      already relies on Python 3 syntax, so it could never run),
    - the drive prefix is stripped by slicing instead of a helper call;
      splitdrive() always returns a prefix of s, so the result is the same.
    """
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(s[len(drive_or_unc):]).split(os.path.sep)
    if drive_or_unc:
        # Drop the empty component left by the separator after the drive
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s.startswith(os.path.sep):
        # Keep absolute paths absolute
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
651
652
def sanitize_url(url):
    """Repair protocol-less URLs and a few known scheme typos; other URLs pass through."""
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    known_typos = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in known_typos:
        # The patterns are anchored at the start, so at most one
        # substitution can happen; only the first matching typo is fixed
        fixed, replaced = re.subn(mistake, fixup, url)
        if replaced:
            return fixed
    return url
17bcc626
S
669
670
5435dcf9
HH
def extract_basic_auth(url):
    """Split user:password credentials off a URL.

    Returns (url, None) when the URL carries no username; otherwise the
    URL without credentials and the matching 'Basic ...' Authorization
    header value.
    """
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None

    if parts.port is None:
        bare_netloc = parts.hostname
    else:
        bare_netloc = '%s:%d' % (parts.hostname, parts.port)
    stripped_url = compat_urlparse.urlunsplit(parts._replace(netloc=bare_netloc))

    credentials = '%s:%s' % (parts.username, parts.password or '')
    token = base64.b64encode(credentials.encode('utf-8'))
    return stripped_url, 'Basic ' + token.decode('utf-8')
681
682
def sanitized_Request(url, *args, **kwargs):
    """Build a Request for url after sanitizing and escaping it.

    Credentials embedded in the URL are moved into an Authorization
    header, which is stored in the headers mapping passed by the caller
    (second positional argument or 'headers' keyword), or in a new one.
    """
    cleaned_url = escape_url(sanitize_url(url))
    url, auth_header = extract_basic_auth(cleaned_url)
    if auth_header is not None:
        if len(args) >= 2:
            headers = args[1]
        else:
            headers = kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)
67dda517
S
689
690
51098426
S
def expand_path(s):
    """Expand shell variables and ~"""
    # The user directory is expanded first, then the variables in the result
    with_home = compat_expanduser(s)
    return os.path.expandvars(with_home)
694
695
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    The result is a list that keeps the order of first occurrence. Only
    equality is used, so unhashable elements work as well.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 703
912b38b4 704
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character.

    The argument is the entity without the leading '&' but including the
    trailing ';'. Unknown entities are returned in their literal form.
    """
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    known_names = compat_html_entities.name2codepoint
    if entity in known_names:
        return compat_chr(known_names[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    numeric = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if numeric is not None:
        numstr = numeric.group(1)
        base = 16 if numstr.startswith('x') else 10
        if base == 16:
            # int() wants the hexadecimal form spelled '0x...'
            numstr = '0%s' % numstr
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
4e408e47
PH
734
735
def unescapeHTML(s):
    """Replace the HTML entities in s with their characters; None passes through."""
    if s is None:
        return None
    assert type(s) == compat_str

    def _decode(mobj):
        # Group 1 is the entity name together with its trailing ';'
        return _htmlentity_transform(mobj.group(1))

    return re.sub(r'&([^&;]+;)', _decode, s)
d77c3dfd 743
8bf48f23 744
def escapeHTML(text):
    """Escape the characters & < > " ' in text as HTML entities."""
    # '&' has to come first, otherwise the ampersands introduced by the
    # other replacements would be escaped a second time
    replacements = (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&#39;'),
    )
    for char, entity in replacements:
        text = text.replace(char, entity)
    return text
754
755
def process_communicate_or_kill(p, *args, **kwargs):
    """Run p.communicate(); if it is interrupted in any way, kill and reap p first.

    All arguments are forwarded to communicate() and its result is
    returned. The interrupting exception is always re-raised.
    """
    try:
        result = p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise
    return result
763
764
class Popen(subprocess.Popen):
    """subprocess.Popen with a default startupinfo and a communicate_or_kill() helper."""

    # Default value for the startupinfo argument. On Windows it has the
    # STARTF_USESHOWWINDOW flag set (presumably so that child processes do
    # not open a console window - TODO confirm); elsewhere it is None.
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as subprocess.Popen.

        startupinfo is only filled in when the caller did not pass it by
        keyword; previously an explicit startupinfo= raised TypeError
        because the keyword was supplied twice.
        """
        kwargs.setdefault('startupinfo', self._startupinfo)
        super(Popen, self).__init__(*args, **kwargs)

    def communicate_or_kill(self, *args, **kwargs):
        """communicate() that kills the process if it gets interrupted."""
        return process_communicate_or_kill(self, *args, **kwargs)
777
778
aa49acd1
S
def get_subprocess_encoding():
    """Return the encoding to use for data exchanged with subprocesses.

    This is the locale encoding on Windows (major version 5 and up) and
    the filesystem encoding elsewhere, with 'utf-8' as a last resort.
    """
    on_modern_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if on_modern_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
789
790
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Whether the result is meant for a subprocess call

    The name is returned unchanged wherever unicode filenames can be used
    directly; otherwise it is encoded with the subprocess encoding,
    dropping characters that cannot be represented.
    """

    assert type(s) == compat_str

    unicode_api_available = (
        # Python 3 has a Unicode API
        sys.version_info >= (3, 0)
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        or (not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)
        # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
        or sys.platform.startswith('java'))
    if unicode_api_available:
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
813
814
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename(): decode a bytes filename with the subprocess encoding.

    On Python 3, and for anything that is not a bytes object, the
    argument is returned unchanged.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 824
f07b74fc
PH
825
def encodeArgument(s):
    """Encode a command line argument like a filename meant for a subprocess."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
833
834
aa49acd1
S
def decodeArgument(b):
    """Decode a command line argument like a filename coming from a subprocess."""
    return decodeFilename(b, for_subprocess=True)
837
838
8271226a
PH
def decodeOption(optval):
    """Return an option value as text, decoding bytes with the preferred encoding.

    None is passed through unchanged.
    """
    if optval is None:
        return None
    decoded = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 847
5f6a1245 848
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a duration in milliseconds into a (hours, minutes, seconds, milliseconds) tuple."""
    total_seconds, milliseconds = divmod(msec, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return _timetuple(
        hours=hours, minutes=minutes, seconds=seconds, milliseconds=milliseconds)
857
858
def formatSeconds(secs, delim=':', msec=False):
    """Format a duration in seconds as [H<delim>]MM<delim>SS, S or M<delim>SS.

    Leading zero-valued units are omitted. With msec set, '.mmm'
    (milliseconds) is appended.
    """
    hours, minutes, seconds, milliseconds = timetuple_from_msec(secs * 1000)
    if hours:
        ret = '%d%s%02d%s%02d' % (hours, delim, minutes, delim, seconds)
    elif minutes:
        ret = '%d%s%02d' % (minutes, delim, seconds)
    else:
        ret = '%d' % seconds
    if not msec:
        return ret
    return '%s.%03d' % (ret, milliseconds)
4539dd30 868
a0ddb8a2 869
def _ssl_load_windows_store_certs(ssl_context, storename):
    """Load the usable certificates of a Windows certificate store into ssl_context.

    Only x509_asn encoded certificates that are trusted for everything or
    for server authentication are used. Certificates that fail to load are
    skipped, and a store that may not be read is ignored altogether.
    """
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    def _is_usable(encoding, trust):
        return encoding == 'x509_asn' and (
            trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)

    try:
        certs = [
            cert for cert, encoding, trust in ssl.enum_certificates(storename)
            if _is_usable(encoding, trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)
883
77562778 884
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler with an SSL context matching params.

    Certificates and host names are verified unless params has a true
    'nocheckcertificate' entry. Extra keyword arguments are forwarded to
    the handler.
    """
    verify = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    # check_hostname is assigned before verify_mode, as in the original code
    context.check_hostname = verify
    context.verify_mode = ssl.CERT_REQUIRED if verify else ssl.CERT_NONE
    if verify:
        try:
            context.load_default_certs()
        # Work around the issue in load_default_certs when there are bad certificates. See:
        # https://github.com/yt-dlp/yt-dlp/issues/1060,
        # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            can_enumerate_stores = sys.platform == 'win32' and hasattr(ssl, 'enum_certificates')
            if can_enumerate_stores:
                # Create a new context to discard any certificates that were already loaded
                context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
                context.check_hostname = True
                context.verify_mode = ssl.CERT_REQUIRED
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 906
732ea2f0 907
def bug_reports_message(before=';'):
    """Return the standard "please report this issue" text.

    before is the text the message gets appended to (trailing whitespace is
    ignored). If it is empty or ends a sentence, the message starts with a
    capital letter.
    """
    if ytdl_is_updateable():
        update_cmd = 'type yt-dlp -U to update'
    else:
        update_cmd = 'see https://github.com/yt-dlp/yt-dlp on how to update'
    msg = ''.join((
        'please report this issue on https://github.com/yt-dlp/yt-dlp .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call yt-dlp with the --verbose flag and include its complete output.',
    ))

    before = before.rstrip()
    starts_new_sentence = not before or before.endswith(('.', '!', '?'))
    if starts_new_sentence:
        msg = msg[0].title() + msg[1:]

    prefix = before + ' ' if before else ''
    return prefix + msg
08f2a92c
JMF
922
923
bf5b9d85
PM
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""

    # Default message; subclasses may override it. When neither a subclass
    # nor the caller provides a message, the class name is used.
    msg = None

    def __init__(self, msg=None):
        if msg is None:
            if self.msg is None:
                self.msg = type(self).__name__
        else:
            self.msg = msg
        super().__init__(self.msg)
bf5b9d85
PM
934
935
# Exception types that are treated as network errors
network_exceptions = (
    compat_urllib_error.URLError,
    compat_http_client.HTTPException,
    socket.error,
)
if hasattr(ssl, 'CertificateError'):
    network_exceptions += (ssl.CertificateError,)
940
941
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.

        An error created while one of network_exceptions is being handled
        is always flagged as expected.
        """
        currently_handled = sys.exc_info()
        if currently_handled[0] in network_exceptions:
            expected = True

        self.msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        # Full message: "[ie] video_id: msg (caused by ...)" followed by the
        # bug report hint for unexpected errors.
        # NOTE(review): YoutubeDLError.__init__ stores its argument in
        # self.msg, so self.msg ends up holding this full message - confirm
        # that this is intended.
        message_parts = (
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            self.msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message(),
        )
        super(ExtractorError, self).__init__(''.join(message_parts))

    def format_traceback(self):
        """Return the stored traceback as a string, or None if there is none."""
        if self.traceback is not None:
            return ''.join(traceback.format_tb(self.traceback))
        return None
01951dda 971
1c256f70 972
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected extractor error carrying an 'Unsupported URL' message; the URL is kept in `url`."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
978
979
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    # No behaviour of its own; exists so callers can tell this failure apart by type
983
984
773f291d
S
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        """Always marks the error as expected; `countries` is stored unchanged."""
        # Any caller-supplied `expected` is overridden
        forwarded = dict(kwargs, expected=True)
        super(GeoRestrictedError, self).__init__(msg, **forwarded)
        self.countries = countries
996
997
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        YoutubeDLError.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
1010
1011
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    Thrown by YoutubeDL when a requested entry cannot be found in the
    playlist info_dict.
    """
    # Default message, used when the exception is created without one
    msg = 'Entry not found in info'
498f5606 1019
1020
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        """Append ': <filename>' to the default message when a filename is given."""
        message = self.msg if filename is None else f'{self.msg}: {filename}'
        super().__init__(message)
d77c3dfd
FV
1033
1034
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    May be raised by a PostProcessor's .run() method to signal that the
    postprocessing task failed.
    """
5f6a1245 1041
5f6a1245 1042
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    # Default message; subclasses replace it with the reason for stopping
    msg = 'The download was cancelled'
8b0d7497 1046
8b0d7497 1047
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    # Default message shown for this cancellation reason
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1051
48f79687 1052
class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    # Default message shown for this cancellation reason
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
51d9739f 1056
1057
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    # Default message shown for this cancellation reason
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1061
1062
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        """Store `expected` on the instance next to the message."""
        YoutubeDLError.__init__(self, msg)
        self.expected = expected
1069
1070
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        """Takes no arguments: always uses the class message with expected=False."""
        super(ThrottledDownload, self).__init__(self.msg, expected=False)
f2ebc5c7 1077
d77c3dfd 1078
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        """Append ': <err>' to the default message when `err` is given."""
        message = self.msg if err is None else f'{self.msg}: {err}'
        super().__init__(message)
d77c3dfd
FV
1091
1092
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        """Build the message from both byte counts and keep them as attributes."""
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 1108
5f6a1245 1109
class XAttrMetadataError(YoutubeDLError):
    """Error with an errno-style `code`, a `msg`, and a derived `reason`.

    `reason` is one of 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED'.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Classify using the numeric code first, then well-known message fragments
        out_of_space = (
            code in (errno.ENOSPC, errno.EDQUOT)
            or 'No space left' in msg
            or 'Disk quota exceeded' in msg)
        if out_of_space:
            self.reason = 'NO_SPACE'
        elif code == errno.E2BIG or 'Argument list too long' in msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1124
1125
class XAttrUnavailableError(YoutubeDLError):
    # No behaviour of its own; exists so callers can tell this failure apart by type
    pass
1128
1129
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate `http_class` and, if configured, pin it to a source address.

    The connection is built from *args/**kwargs. When the handler's params
    contain 'source_address', the connection is patched so that it binds to
    that address and only tries remote addresses of the matching IP family.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))

    source_address = ydl_handler._params.get('source_address')
    if source_address is None:
        return hc

    # This is to workaround _create_connection() from socket where it will try all
    # address data from getaddrinfo() including IPv6. This filters the result from
    # getaddrinfo() based on the source_address value.
    # This is based on the cpython socket.create_connection() function.
    # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
    def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
        host, port = address
        candidates = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
        # The family is guessed from the textual form of the source IP
        family = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
        usable = [info for info in candidates if info[0] == family]
        if candidates and not usable:
            ip_version = 'v4' if family == socket.AF_INET else 'v6'
            raise socket.error(
                "No remote IP%s addresses available for connect, can't use '%s' as source address"
                % (ip_version, source_address[0]))
        last_error = None
        for af, socktype, proto, _canonname, remote in usable:
            sock = None
            try:
                sock = socket.socket(af, socktype, proto)
                if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                    sock.settimeout(timeout)
                sock.bind(source_address)
                sock.connect(remote)
                last_error = None  # Explicitly break reference cycle
                return sock
            except socket.error as exc:
                last_error = exc
                if sock is not None:
                    sock.close()
        if last_error is None:
            raise socket.error('getaddrinfo returns an empty list')
        raise last_error

    if hasattr(hc, '_create_connection'):
        hc._create_connection = _create_connection
    bind_address = (source_address, 0)  # port 0 is passed as the local port
    if hasattr(hc, 'source_address'):  # Python 2.7+
        hc.source_address = bind_address
    else:  # Python 2.6
        def _hc_connect(self, *args, **kwargs):
            sock = _create_connection(
                (self.host, self.port), self.timeout, bind_address)
            if is_https:
                self.sock = ssl.wrap_socket(
                    sock, self.key_file, self.cert_file,
                    ssl_version=ssl.PROTOCOL_TLSv1)
            else:
                self.sock = sock
        hc.connect = functools.partial(_hc_connect, hc)

    return hc
1193
1194
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' marker header.

    Without the marker, `headers` is returned as-is. With it, a new dict is
    returned that omits both the marker and any Accept-Encoding header
    (matched case-insensitively); the input mapping is not modified.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    stripped = {
        name: value for name, value in headers.items()
        if name.lower() != 'accept-encoding'
    }
    del stripped['Youtubedl-no-compression']
    return stripped
87f0e62d
YCH
1203
1204
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Keep `params` on the handler; remaining arguments go to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open `req`, going through a SOCKS connection class when requested."""
        conn_class = compat_http_client.HTTPConnection

        # 'Ytdl-socks-proxy' is an internal header: consume it before sending
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, conn_class, False)
        return self.do_open(connection_factory, req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data: raw stream first, zlib-wrapped as fallback."""
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        """Escape the URL, add missing standard headers and apply internal headers."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        original_url = req.get_full_url()
        escaped_url = escape_url(original_url)

        # Substitute URL if any change after escaping
        if original_url != escaped_url:
            req = update_Request(req, url=escaped_url)

        for name, value in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if name.capitalize() not in req.headers:
                req.add_header(name, value)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Decode gzip/deflate bodies and percent-encode redirect Location headers."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; give up with the first error
                for trim in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-trim]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    # HTTPS traffic gets the same request/response processing
    https_request = http_request
    https_response = http_response
bf50b038 1328
5de90176 1329
71aff188
YCH
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of `base_class` that connects through a SOCKS proxy.

    base_class: HTTPConnection or HTTPSConnection (or a subclass of either).
    socks_proxy: proxy URL; the scheme selects the protocol ('socks5',
        'socks4a', or 'socks'/'socks4'), the port defaults to 1080, and any
        username/password in the URL is percent-decoded.

    Raises ValueError for any other scheme.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    scheme = url_components.scheme.lower()
    if scheme == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif scheme in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif scheme == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    else:
        # Previously socks_type stayed unbound here and the function failed
        # further down with an UnboundLocalError
        raise ValueError('Unsupported SOCKS proxy scheme: %r' % url_components.scheme)

    def unquote_if_non_empty(s):
        # Leave None / '' untouched so missing credentials stay missing
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            # Only plain numeric timeouts are forwarded to the socket
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1371
1372
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that builds its connections via _create_http_connection()."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """Keep `params` and the connection class (HTTPSConnection if not given)."""
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open `req`, going through a SOCKS connection class when requested."""
        conn_class = self._https_conn_class

        # Forward whichever TLS settings the base handler exposes on this Python
        open_kwargs = {}
        for key, attr in (('context', '_context'), ('check_hostname', '_check_hostname')):
            if hasattr(self, attr):
                open_kwargs[key] = getattr(self, attr)

        # 'Ytdl-socks-proxy' is an internal header: consume it before sending
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        connection_factory = functools.partial(
            _create_http_connection, self, conn_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
be4a824d
PH
1396
1397
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            filename = self.filename
            if filename is None:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                secure = 'TRUE' if cookie.secure else 'FALSE'
                initial_dot = 'TRUE' if cookie.domain.startswith('.') else 'FALSE'
                expires = '' if cookie.expires is None else compat_str(cookie.expires)
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name, value = '', cookie.name
                else:
                    name, value = cookie.name, cookie.value
                fields = [cookie.domain, initial_dot, cookie.path,
                          secure, expires, name, value]
                f.write('\t'.join(fields) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            filename = self.filename
            if filename is None:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            # Drop the '#HttpOnly_' marker so the entry is not taken for a comment
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        # Copy the valid lines into an in-memory file and let the base class parse that
        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1514
1515
a6420bf5
S
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that also covers HTTPS requests and responses."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        """Delegate to HTTPCookieProcessor.http_response (the workaround below is disabled)."""
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        base = compat_urllib_request.HTTPCookieProcessor
        return base.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1538
1539
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    # All handled redirect codes share the stock 302 implementation
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received.  If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect.  Otherwise, raise HTTPError if no-one
        else should try to handle this url.  Return None if you can't
        but another Handler might.
        """
        method = req.get_method()
        redirectable = (
            (code in (301, 302, 303, 307, 308) and method in ("GET", "HEAD"))
            or (code in (301, 302, 303) and method == "POST"))
        if not redirectable:
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case).  In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # On python 2 urlh.geturl() may sometimes return redirect URL
        # as byte string instead of unicode. This workaround allows
        # to force it always return unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Be conciliant with URIs containing a space.  This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        # Headers describing the old request body are not carried over
        CONTENT_HEADERS = ("content-length", "content-type")
        newheaders = {
            name: value for name, value in req.headers.items()
            if name.lower() not in CONTENT_HEADERS
        }
        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True)
fca6dba8
S
1595
1596
46f59e89
S
def extract_timezone(date_str):
    """Split a trailing UTC designator off `date_str`.

    Returns a `(timedelta, date_str)` pair. A trailing 'Z' or '+hh[:]mm' /
    '-hh[:]mm' offset is removed from the returned string; the timedelta is
    the signed offset (zero for 'Z'). When nothing matches, the result is a
    zero timedelta and the unchanged string.
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                   [ ]?                                          # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if m is None:
        return datetime.timedelta(), date_str

    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):
        # Bare 'Z': UTC
        return datetime.timedelta(), date_str

    sign = 1 if m.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=sign * int(m.group('hours')),
        minutes=sign * int(m.group('minutes')))
    return offset, date_str
1621
1622
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    delimiter: the separator expected between the date and the time part.
    timezone: UTC offset as a timedelta; when None it is taken from the
        string itself via extract_timezone().
    Returns None for a None input or a string that does not parse.
    """
    if date_str is None:
        return None

    # Fractional parts ('.123') are dropped
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
    try:
        utc_time = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(utc_time.timetuple())
    except ValueError:
        return None
912b38b4
PH
1640
1641
46f59e89
S
def date_formats(day_first=True):
    """Return DATE_FORMATS_DAY_FIRST if `day_first` is truthy, else DATE_FORMATS_MONTH_FIRST."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1644
1645
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    Returns None when `date_str` is None or cannot be parsed.
    """
    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    date_str = extract_timezone(date_str)[1]

    upload_date = None
    # Every format is tried; when several match, the last match wins
    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')

    if upload_date is None:
        # Fall back to the RFC 2822 date parser
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')

    if upload_date is None:
        return None
    return compat_str(upload_date)
bf50b038 1672
5f6a1245 1673
46f59e89
S
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    date_str: the date text; None yields None.
    day_first: selects which list of strptime formats date_formats() returns.

    The string is tried against each format from date_formats(), then
    against email.utils.parsedate_tz(). Returns None if nothing parses.
    """
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    # NOTE(review): any 'PM' adds 12 hours, which looks wrong for 12:xx PM
    # inputs - confirm before relying on it
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass

    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        # The UTC offset was already stripped from date_str by
        # extract_timezone(), so it has to be applied here as well; without
        # this the fallback returned local time as if it were UTC.
        return calendar.timegm(timetuple) + pm_delta * 3600 - int(timezone.total_seconds())
46f59e89
S
1705
1706
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from `url`, or return `default_ext`.

    The guess is the text after the last '.' in the part of the URL before
    the first '?'. It is accepted if purely alphanumeric, or if it is a
    known extension once trailing slashes are removed.
    """
    if url is None or '.' not in url:
        return default_ext

    before_query = url.partition('?')[0]
    guess = before_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess

    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = guess.rstrip('/')
    return trimmed if trimmed in KNOWN_EXTENSIONS else default_ext
73e79f2a 1718
5f6a1245 1719
824fa511
S
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Return `filename` with its extension replaced by '<sub_lang>.<sub_format>'."""
    subtitle_ext = sub_lang + '.' + sub_format
    return replace_extension(filename, subtitle_ext, expected_real_ext)
d4051a8e 1722
5f6a1245 1723
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: string date format used to return datetime object from
    precision: round the time portion of a datetime object.
                auto|microsecond|second|minute|hour|day.
                auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = precision == 'auto'
    if auto_precision:
        precision = 'microsecond'

    today = datetime_round(datetime.datetime.now(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
        date_str)
    if match is None:
        return datetime_round(datetime.datetime.strptime(date_str, format), precision)

    # Relative expression: resolve the base recursively, then apply the offset
    start_time = datetime_from_str(match.group('start'), precision, format)
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')

    if unit in ('month', 'year'):
        # Calendar-aware arithmetic; the result is rounded to days under 'auto'
        months = amount * 12 if unit == 'year' else amount
        new_date = datetime_add_months(start_time, months)
        unit = 'day'
    else:
        if unit == 'week':
            unit = 'day'
            amount *= 7
        new_date = start_time + datetime.timedelta(**{unit + 's': amount})

    if auto_precision:
        return datetime_round(new_date, unit)
    return new_date
1764
1765
def date_from_str(date_str, format='%Y%m%d'):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: string date format used to parse absolute dates
    """
    parsed = datetime_from_str(date_str, precision='microsecond', format=format)
    return parsed.date()
1774
1775
def datetime_add_months(dt, months):
    """Increment/Decrement a datetime object by months.

    The day of month is clamped to the last day of the target month.
    """
    year_offset, month_index = divmod(dt.month + months - 1, 12)
    year = dt.year + year_offset
    month = month_index + 1
    last_day = calendar.monthrange(year, month)[1]
    return dt.replace(year, month, min(dt.day, last_day))
1783
1784
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision

    With precision 'microsecond' the object is returned untouched. Otherwise
    the value is converted through calendar.timegm(), rounded to the nearest
    multiple of the unit, and rebuilt with utcfromtimestamp().
    """
    if precision == 'microsecond':
        return dt

    seconds_per_unit = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }
    step = seconds_per_unit[precision]
    timestamp = calendar.timegm(dt.timetuple())
    rounded = ((timestamp + step / 2) // step) * step
    return datetime.datetime.utcfromtimestamp(rounded)
5f6a1245
JW
1801
1802
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged.
    """
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    return '-'.join(match.groups())
1811
5f6a1245 1812
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str.

        A missing bound defaults to the earliest/latest representable date.
        Raises ValueError if start is later than end.
        """
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range.

        Accepts a date, a datetime (its date part is used) or a string in
        the format accepted by date_from_str.
        """
        if isinstance(date, datetime.datetime):
            # datetime is a subclass of date, but ordering a datetime against
            # a plain date raises TypeError, so compare on the date part only
            date = date.date()
        elif not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
1842
1843
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Decode byte strings using the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
1852
1853
49fa4d9a
N
def get_windows_version():
    ''' Get Windows version. None if it's not running on Windows '''
    if compat_os_name != 'nt':
        return None
    return version_tuple(platform.win32_ver()[1])
1860
1861
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The "special method" is the Win32 WriteConsoleW call, used only when
    `out` is file descriptor 1 or 2 and the matching standard handle passes
    the console checks below. Raises OSError if WriteConsoleW reports failure.
    """
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

    # File descriptor -> argument passed to GetStdHandle
    # (-11 / -12 are presumably STD_OUTPUT_HANDLE / STD_ERROR_HANDLE - confirm against the Win32 docs)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    # Receives the number of units WriteConsoleW reports as written
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A handle counts as a console only if it is valid, its file type
        # (ignoring the REMOTE bit) is CHAR, and GetConsoleMode succeeds on it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters at a time, stopping before any non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; it is written
        # on its own with a length of 2 units
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1934
1935
def write_string(s, out=None, encoding=None):
    """Write the text `s` to `out` (default: sys.stderr) and flush it.

    On win32, when no encoding is given and `out` has a fileno, the console
    writer is tried first. Otherwise the text is encoded (dropping
    unencodable characters) for binary streams and streams exposing
    `.buffer`, or written as-is.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    use_console = sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno')
    if use_console and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    binary_stream = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if binary_stream:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        target_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1956
1957
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a byte sequence into a list of integer byte values."""
    if not bs:
        return []
    # Indexing bytes yields ints on Python 3; otherwise convert each character
    return list(bs) if isinstance(bs[0], int) else [ord(char) for char in bs]
1965
c257baff 1966
def intlist_to_bytes(xs):
    """Pack a sequence of integer byte values into a bytes object."""
    if xs:
        return compat_struct_pack('%dB' % len(xs), *xs)
    return b''
c38b1e77
PH
1971
1972
c1c9a79c
PH
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) using LockFileEx on
# win32, fcntl.flock where fcntl is importable, and raising stubs otherwise.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure passed by pointer as the last argument of LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to LockFileEx/UnlockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock `f` via LockFileEx; raises OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so _unlock_file can pass the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags: 0x2 for an exclusive lock, 0x0 otherwise
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            """Lock `f` with fcntl.flock (LOCK_EX if exclusive, else LOCK_SH)."""
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            """Release the flock held on `f`."""
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
2046
2047
class locked_file(object):
    """Context manager that opens a file and holds a lock on it while active.

    Mode 'r' takes a shared lock; 'a' and 'w' take an exclusive one.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the open file if the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
2077
2078
4644ac55
S
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when it reports None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
2082
2083
def shell_quote(args):
    """Quote each argument for the shell and join them with spaces."""
    encoding = get_filesystem_encoding()
    return ' '.join(
        # We may get a filename encoded with 'encodeFilename'
        compat_shlex_quote(arg.decode(encoding) if isinstance(arg, bytes) else arg)
        for arg in args)
9d4660ca
PH
2093
2094
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    `data` is JSON-encoded into a '__youtubedl_smuggle' fragment appended to
    `url`. If `url` already carries smuggled data, it is merged in and its
    values take precedence on key conflicts. `data` itself is not modified.
    """

    url, idata = unsmuggle_url(url, {})
    # Merge into a copy so the caller's dict is left untouched
    merged = dict(data)
    merged.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(merged)})
    return url + '#' + sdata
9d4660ca
PH
2103
2104
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    Returns (smug_url, default) when the URL carries no smuggled data.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
        return url, json.loads(jsond)
    return smug_url, default
02dbf93f
PH
2112
2113
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal suffixes like K, M, etc

    num:    the number to format; returns None if it cannot be converted
            to float or is negative
    fmt:    %-format string receiving (scaled number, suffix)
    factor: ratio between consecutive suffixes; 1024 yields Ki, Mi, ...
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        # math.log is undefined for negative numbers
        return None
    suffixes = ['', *'kMGTPEZY']
    exponent = 0 if num == 0 else int(math.log(num, factor))
    # Clamp to the available suffixes: values below 1/factor would otherwise
    # index from the end of the list, and huge values would raise IndexError
    exponent = min(max(exponent, 0), len(suffixes) - 1)
    suffix = suffixes[exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
e0fd9573 2125
2126
def format_bytes(bytes):
    """Format a byte count with binary suffixes (KiB, MiB, ...), or 'N/A' if it cannot be formatted."""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted or 'N/A'
f53c966a 2129
1c088fa8 2130
fb47597b
S
def lookup_unit_table(unit_table, s):
    """Parse a leading '<number><unit>' from `s` and scale it via `unit_table`.

    The number may use ',' or '.' as decimal separator. Returns the scaled
    value truncated to int, or None if `s` does not start with a number
    followed by one of the table's units.
    """
    units_re = '|'.join(map(re.escape, unit_table))
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if m is None:
        return None
    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
2140
2141
be64b5b0
PH
def parse_filesize(s):
    """Parse a human-readable file size such as '1.5 MiB' into a number of bytes.

    Returns None for None input or when no known unit is found.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1, 'bytes': 1}
    # (symbol letter, decimal name prefix, binary name prefix), by increasing power
    prefixes = (
        ('K', 'kilo', 'kibi'),
        ('M', 'mega', 'mebi'),
        ('G', 'giga', 'gibi'),
        ('T', 'tera', 'tebi'),
        ('P', 'peta', 'pebi'),
        ('E', 'exa', 'exbi'),
        ('Z', 'zetta', 'zebi'),
        ('Y', 'yotta', 'yobi'),
    )
    for power, (letter, decimal_name, binary_name) in enumerate(prefixes, 1):
        decimal, binary = 1000 ** power, 1024 ** power
        lower = letter.lower()
        unit_table.update({
            letter + 'iB': binary,
            letter + 'B': decimal,
            lower + 'B': binary,
            letter + 'b': decimal,
            lower + 'b': decimal,
            decimal_name + 'bytes': decimal,
            binary_name + 'bytes': binary,
        })

    return lookup_unit_table(unit_table, s)
2211
2212
def parse_count(s):
    """Parse a count such as '1,234', '1.5k' or '2M views' into an int.

    Returns None for None input or when nothing numeric can be extracted.
    """
    if s is None:
        return None

    # Drop a leading non-numeric run that ends in whitespace
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    multipliers = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    scaled = lookup_unit_table(multipliers, s)
    if scaled is not None:
        return scaled

    # Fall back to a leading plain number followed by whitespace or end of string
    mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
    if not mobj:
        return None
    return str_to_int(mobj.group(1))
be64b5b0 2240
2f7ae819 2241
b871d7e9
S
def parse_resolution(s):
    """Extract video dimensions from a string.

    Recognises, in order, '<w>x<h>' (also 'X', '×' or ','), '<h>p' / '<h>i',
    and '4k' / '8k'. Returns a dict with 'width' and/or 'height', or an
    empty dict when nothing matches or `s` is None.
    """
    if s is None:
        return {}

    dimensions = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
    if dimensions:
        return {
            'width': int(dimensions.group('w')),
            'height': int(dimensions.group('h')),
        }

    lines = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if lines:
        return {'height': int(lines.group(1))}

    kilo = re.search(r'\b([48])[kK]\b', s)
    if kilo:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(kilo.group(1)) * 540}

    return {}
2262
2263
0dc41787
S
def parse_bitrate(s):
    """Return the integer preceding 'kbps' in `s`, or None if absent or `s` is not a string."""
    if isinstance(s, compat_str):
        mobj = re.search(r'\b(\d+)\s*kbps', s)
        if mobj:
            return int(mobj.group(1))
    return None
2270
2271
def month_by_name(name, lang='en'):
    """ Return the number of a month by its name in `lang` (falls back to
    the English names for unknown languages), or None if not found """

    candidates = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        position = candidates.index(name)
    except ValueError:
        return None
    return position + 1
2281
2282
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations, or None if the abbreviation is unknown """

    try:
        abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
        position = abbreviations.index(abbrev)
    except ValueError:
        return None
    return position + 1
18258362
JMF
2291
2292
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start one of the five predefined entities or a
    numeric character reference of up to four digits are left alone.
    """
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
e3946f98
PH
2299
2300
def setproctitle(title):
    """Best-effort attempt to set the process name through libc's prctl.

    Silently does nothing when libc.so.6 cannot be loaded (or on Jython),
    or when the loaded libc has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    # Initialising the buffer from the bytes allocates len + 1 bytes, so the
    # name handed to prctl is always NUL-terminated
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        # 15 is PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
2325
2326
def remove_start(s, start):
    """Return `s` without the prefix `start`; `s` is returned unchanged if it is None or lacks the prefix."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
29eb5174
PH
2329
2330
def remove_end(s, end):
    """Return `s` without the suffix `end`; `s` is returned unchanged if it is None or lacks the suffix."""
    if s is not None and s.endswith(end):
        # Slice by absolute position: s[:-len(end)] would be s[:0] for an empty suffix
        return s[:len(s) - len(end)]
    return s
2b9faf55
PH
2333
2334
31b2051e
S
def remove_quotes(s):
    """Strip one pair of matching single or double quotes surrounding `s`, if present."""
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
2342
2343
b6e0c7d2
U
def get_domain(url):
    """Return the host part of `url` without scheme and leading 'www.', or None if no dotted host is found."""
    mobj = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
    if not mobj:
        return None
    return mobj.group('domain')
2347
2348
def url_basename(url):
    """Return the last path component of `url`, ignoring surrounding slashes."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
aa94a6d3
PH
2352
2353
02dc0a36
S
def base_url(url):
    """Return the http(s) URL up to and including the last '/' before any query or fragment.

    Raises AttributeError if `url` does not match.
    """
    mobj = re.match(r'https?://[^?#&]+/', url)
    return mobj.group()
2356
2357
def urljoin(base, path):
    """Join `path` onto `base`, returning None for unusable input.

    `path` is returned as-is when it already has a scheme or is
    protocol-relative. Otherwise `base` must be an http(s) or
    protocol-relative URL. Bytes arguments are decoded as UTF-8.
    """
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not (isinstance(path, compat_str) and path):
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        return path

    if isinstance(base, bytes):
        base = base.decode('utf-8')
    if not (isinstance(base, compat_str) and re.match(r'^(?:https?:)?//', base)):
        return None
    return compat_urlparse.urljoin(base, path)
2371
2372
aa94a6d3
PH
class HEADRequest(compat_urllib_request.Request):
    """Request whose get_method() always returns 'HEAD'."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
2376
2377
95cf60e8
S
class PUTRequest(compat_urllib_request.Request):
    """Request whose get_method() always returns 'PUT'."""

    def get_method(self):
        return 'PUT'
2381
2382
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert `v` to int, scaled as int(v) * invscale // scale.

    If `get_attr` is given and `v` is not None, the named attribute of `v`
    is converted instead. Returns `default` when the conversion fails.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        result = int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        result = default
    return result
9732d77e 2390
9572013d 2391
40a90862
JMF
def str_or_none(v, default=None):
    """Return `v` converted with compat_str, or `default` when `v` is None."""
    if v is None:
        return default
    return compat_str(v)
2394
9732d77e
PH
2395
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    Integers are returned as-is; strings have ',', '.' and '+' stripped
    before conversion. Any other type yields None.
    """
    if isinstance(int_str, compat_integer_types):
        return int_str
    if isinstance(int_str, compat_str):
        return int_or_none(re.sub(r'[,\.\+]', '', int_str))
    return None
608d11f5
PH
2403
2404
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert `v` to float, scaled as float(v) * invscale / scale.

    Returns `default` when `v` is None or cannot be converted.
    """
    if v is None:
        return default
    try:
        result = float(v) * invscale / scale
    except (ValueError, TypeError):
        result = default
    return result
43f775e4
PH
2412
2413
c7e327c4
S
def bool_or_none(v, default=None):
    """Return `v` if it is a bool, otherwise `default`."""
    if isinstance(v, bool):
        return v
    return default
2416
2417
53cd37ba
S
def strip_or_none(v, default=None):
    """Return `v` stripped of surrounding whitespace if it is a string, otherwise `default`."""
    if isinstance(v, compat_str):
        return v.strip()
    return default
b72b4431
S
2420
2421
af03000a
S
def url_or_none(url):
    """Return the stripped `url` if it is a non-empty string that is
    protocol-relative or uses one of the accepted schemes (http(s), rtmp
    variants, rtsp variants, mms, ftp(s)); otherwise None."""
    if not (url and isinstance(url, compat_str)):
        return None
    url = url.strip()
    if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url):
        return url
    return None
af03000a
S
2427
2428
def strftime_or_none(timestamp, date_format, default=None):
    """Format `timestamp` with `date_format`, returning `default` on failure.

    timestamp: a number (unix timestamp, interpreted as UTC) or a string
               in YYYYMMDD format; any other type yields `default`
    """
    datetime_object = None
    try:
        if isinstance(timestamp, compat_numeric_types):  # unix timestamp
            datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
            datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
        # datetime_object is still None for unsupported types -> AttributeError
        return datetime_object.strftime(date_format)
    except (ValueError, TypeError, AttributeError, OverflowError, OSError):
        # utcfromtimestamp raises OverflowError/OSError for timestamps
        # outside the range supported by the platform
        return default
2439
2440
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Three notations are tried in order: colon-separated '[[[D:]H:]M:]S[.ms]',
    unit-suffixed such as 'PT1H2M3S' or '1h 2min 3s', and a lone decimal
    'N hours' / 'N mins'. Returns None for non-string, blank or unrecognised
    input.
    """
    if not isinstance(s, compat_basestring):
        return None
    s = s.strip()
    if not s:
        return None

    days = hours = mins = secs = ms = None

    # Both of these patterns expose the groups days, hours, mins, secs, ms in that order
    m = re.match(
        r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s
    ) or re.match(
        r'''(?ix)(?:P?
            (?:
                [0-9]+\s*y(?:ears?)?\s*
            )?
            (?:
                [0-9]+\s*m(?:onths?)?\s*
            )?
            (?:
                [0-9]+\s*w(?:eeks?)?\s*
            )?
            (?:
                (?P<days>[0-9]+)\s*d(?:ays?)?\s*
            )?
            T)?
            (?:
                (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
            )?
            (?:
                (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
            )?
            (?:
                (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
            )?Z?$''', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
        if not m:
            return None
        hours, mins = m.groups()

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        # ms still carries its leading '.', so it is a fraction of a second
        duration += float(ms)
    return duration
91d7d0b3
JMF
2498
2499
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert `ext` before the real extension of `filename`.

    If `expected_real_ext` is given and does not match the real extension,
    `ext` is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return '{0}.{1}{2}'.format(name, ext, real_ext)
    return '{0}.{1}'.format(filename, ext)
d70ad093
PH
2506
2507
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension of `filename` with `ext`.

    If `expected_real_ext` is given and does not match the real extension,
    `ext` is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        stem = name
    else:
        stem = filename
    return '{0}.{1}'.format(stem, ext)
2513
2514
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False when starting the binary raises OSError. """
    try:
        command = [exe] + args
        process = Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate_or_kill()
    except OSError:
        return False
    return exe
b7ab0590
PH
2523
2524
def _get_exe_version_output(exe, args):
    """Run `exe` with `args` and return its combined stdout/stderr as text.

    Returns False when starting the executable raises OSError.
    """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        process = Popen(
            [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        output, _ = process.communicate_or_kill()
    except OSError:
        return False
    if isinstance(output, bytes):  # Python 2.x
        output = output.decode('ascii', 'ignore')
    return output
cae97f65
PH
2538
2539
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output.

    Group 1 of `version_re` (default: the token after 'version') is
    returned; `unrecognized` is returned when the pattern does not match.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
2549
2550
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present (or produced no output) """
    output = _get_exe_version_output(exe, args)
    if not output:
        return False
    return detect_exe_version(output, version_re, unrecognized)
2557
2558
class LazyList(collections.abc.Sequence):
    ''' Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList

    Items are pulled from the iterable only as far as an access requires
    and are kept in an internal cache list. '''

    class IndexError(IndexError):
        # Raised by __getitem__ in place of the builtin IndexError it subclasses
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # _cache lets __reversed__/__copy__ share already-evaluated items
        self.__iterable = iter(iterable)
        self.__cache = [] if _cache is None else _cache
        self.__reversed = reverse

    def __iter__(self):
        if self.__reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # Replay what is already cached, then continue pulling new items
        yield from self.__cache
        for item in self.__iterable:
            self.__cache.append(item)
            yield item

    def __exhaust(self):
        # Pull everything that is left into the cache and return the cache
        self.__cache.extend(self.__iterable)
        # Discard the emptied iterable to make it pickle-able
        self.__iterable = []
        return self.__cache

    def exhaust(self):
        ''' Evaluate the entire iterable '''
        return self.__exhaust()[::-1 if self.__reversed else 1]

    @staticmethod
    def __reverse_index(x):
        # Map an index counted from one end to the same item counted from the other
        return None if x is None else -(x + 1)

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self.__reversed:
                idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self.__reversed:
                idx = self.__reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self.__exhaust()
            try:
                return self.__cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Number of additional items needed so the highest requested index is cached
        n = max(start or 0, stop or 0) - len(self.__cache) + 1
        if n > 0:
            self.__cache.extend(itertools.islice(self.__iterable, n))
        try:
            return self.__cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        # Truthiness only needs to know whether a first item exists
        try:
            self[-1] if self.__reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self.__exhaust()
        return len(self.__cache)

    def __reversed__(self):
        # The new object shares this one's iterable and cache
        return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)

    def __copy__(self):
        # The new object shares this one's iterable and cache
        return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2647
483336e7 2648
class PagedList:
    """Base class for lists whose entries are fetched page by page.

    Subclasses implement _getslice(start, end). `pagefunc(pagenum)` must
    return an iterable with the entries of that page.
    """

    class IndexError(IndexError):
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        """Return the entries of page `pagenum` as a list, using the cache if enabled."""
        entries = self._cache.get(pagenum)
        if entries is None:
            entries = list(self._pagefunc(pagenum))
        if self._use_cache:
            self._cache[pagenum] = entries
        return entries

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list."""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        # NOTE: cache must be enabled if this is used
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        found = self.getslice(idx, idx + 1)
        if found:
            return found[0]
        raise self.IndexError()
55575225 2686
9c44d242
PH
2687
class OnDemandPagedList(PagedList):
    """PagedList that keeps fetching pages until a short page or the requested end is reached."""

    def _getslice(self, start, end):
        # Start at the page containing `start` and walk forward indefinitely;
        # the loop is left through one of the `break`s below
        for pagenum in itertools.count(start // self._pagesize):
            # Absolute index of the first entry of this page and of the next page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offset of `start` inside this page (0 for pages after the first one)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            # Offset just past the last wanted entry if `end` falls in this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            page_results = self.getpage(pagenum)
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
81c2f20b
PH
2721
2722
9c44d242
PH
class InAdvancePagedList(PagedList):
    """PagedList for sources whose total number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagecount = pagecount
        # Caching is always enabled for this variant
        PagedList.__init__(self, pagefunc, pagesize, True)

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        # Never request pages beyond the declared page count, even when
        # `end` points past the last entry
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        # Entries to drop from the first page, and entries still to be yielded
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    # This page completes the requested slice
                    yield from page_results[:only_more]
                    break
            yield from page_results
9c44d242
PH
2746
2747
def uppercase_escape(s):
    """Decode each backslash-U escape (8 hex digits) in s; other text is left untouched."""
    decode = codecs.getdecoder('unicode_escape')

    def _decode_match(match):
        decoded, _consumed = decode(match.group(0))
        return decoded

    return re.sub(r'\\U[0-9a-fA-F]{8}', _decode_match, s)
0fe2ff78
YCH
2754
2755
def lowercase_escape(s):
    """Decode each backslash-u escape (4 hex digits) in s; other text is left untouched."""
    decode = codecs.getdecoder('unicode_escape')

    def _decode_match(match):
        decoded, _consumed = decode(match.group(0))
        return decoded

    return re.sub(r'\\u[0-9a-fA-F]{4}', _decode_match, s)
b53466e1 2762
d05cfe06
S
2763
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # The second argument lists the ASCII characters that must stay unescaped.
    # (The former Python 2 pre-encoding step was unreachable: this file
    # already relies on Python 3 only syntax.)
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2769
2770
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # The host name is IDNA-encoded; every other component is percent-escaped
    replacements = {'netloc': parts.netloc.encode('idna').decode('ascii')}
    for component in ('path', 'params', 'query', 'fragment'):
        replacements[component] = escape_rfc3986(getattr(parts, component))
    return parts._replace(**replacements).geturl()
2781
62e609ab 2782
def parse_qs(url):
    """Parse the query component of url with compat_parse_qs and return the result."""
    query = compat_urllib_parse_urlparse(url).query
    return compat_parse_qs(query)
2785
2786
62e609ab
PH
def read_batch_urls(batch_fd):
    """Read URLs line by line from batch_fd (closed afterwards) and return them as a list.

    Blank lines and lines starting with '#', ';' or ']' are skipped.
    """
    BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        head = re.split(r'\s#', url, maxsplit=1)[0]
        return head.rstrip()

    urls = []
    with contextlib.closing(batch_fd) as fd:
        for line in fd:
            cleaned = fixup(line)
            if cleaned:
                urls.append(cleaned)
    return urls
b74fa8cd
JMF
2804
2805
def urlencode_postdata(*args, **kargs):
    """Encode the arguments with compat_urllib_parse_urlencode and return ASCII bytes."""
    encoded = compat_urllib_parse_urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
2808
2809
def update_url_query(url, query):
    """Return url with the entries of query merged into its query string.

    Keys already present in the URL are overwritten. url is returned
    unchanged when query is empty or None.
    """
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parts.query)
    merged.update(query)
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
16392824 2818
8e60dc75 2819
ed0291d1
S
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Build a new request from req with the URL, data, headers and query overridden.

    The HTTP method of req selects the class of the new request
    (HEADRequest, PUTRequest, or the generic Request otherwise).
    A `timeout` attribute on req, if present, is carried over.
    """
    merged_headers = req.headers.copy()
    merged_headers.update(headers)
    payload = data or req.data
    target_url = update_url_query(url or req.get_full_url(), query)

    method = req.get_method()
    if method == 'HEAD':
        request_class = HEADRequest
    elif method == 'PUT':
        request_class = PUTRequest
    else:
        request_class = compat_urllib_request.Request

    updated = request_class(
        target_url, data=payload, headers=merged_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        updated.timeout = req.timeout
    return updated
2838
2839
def _multipart_encode_impl(data, boundary):
    """Encode data as multipart/form-data using the given boundary string.

    Returns (body_bytes, content_type). Raises ValueError when the boundary
    occurs inside one of the encoded fields.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary
    delimiter = boundary.encode('ascii')

    chunks = []
    for k, v in data.items():
        chunks.append(b'--' + delimiter + b'\r\n')
        if isinstance(k, compat_str):
            k = k.encode('utf-8')
        if isinstance(v, compat_str):
            v = v.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if delimiter in content:
            raise ValueError('Boundary overlaps with data')
        chunks.append(content)

    chunks.append(b'--' + delimiter + b'--\r\n')

    return b''.join(chunks), content_type
2860
2861
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Returns a (body, content_type) pair. A ValueError caused by a
    caller-supplied boundary clashing with the data is propagated;
    generated boundaries are simply retried.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    boundary_is_fixed = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))

        try:
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            if boundary_is_fixed:
                raise
            # The random boundary collided with the payload; draw another one
            boundary = None
2890
2891
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key of a list/tuple of keys, in d.

    With several keys, missing and None values are always skipped; falsy
    values are skipped as well unless skip_false_values is False.
    Returns default when nothing usable is found.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None or (skip_false_values and not value):
            continue
        return value
    return default
2900
2901
def try_get(src, getter, expected_type=None):
    """Apply each getter to src and return the first acceptable result.

    A getter that raises AttributeError, KeyError, TypeError or IndexError
    is skipped, as is a result that is not an instance of expected_type
    (when one is given). Returns None if no getter yields a result.
    """
    for get in variadic(getter):
        try:
            value = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
    return None
329ca3be
S
2911
2912
6cc62232
S
def merge_dicts(*dicts):
    """Merge dicts left to right, letting earlier values win.

    None values are ignored. A later value replaces an earlier one only when
    the earlier one is an empty string and the later one a non-empty string.
    """
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if v is None:
                continue
            if k in merged:
                fills_empty_string = (
                    isinstance(v, compat_str) and v
                    and isinstance(merged[k], compat_str)
                    and not merged[k])
                if not fills_empty_string:
                    continue
            merged[k] = v
    return merged
2925
2926
8e60dc75
S
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string unchanged if it is already a compat_str, else decode it with encoding."""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
2929
16392824 2930
a1a530b0
PH
# Age limits associated with US film rating labels (looked up by parse_age_limit)
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
2938
2939
# Age limits associated with TV Parental Guidelines labels.
# parse_age_limit builds its matching pattern from the part after "TV-".
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2948
2949
def parse_age_limit(s):
    """Convert an age rating to an integer age limit, or None if unrecognized.

    Accepts an int in the range 0..21, a string such as "18" or "18+",
    a US_RATINGS label, or a TV Parental Guidelines label ("TV-14", "TV_MA", ...).
    """
    if type(s) == int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None

    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))

    label = s.upper()
    if label in US_RATINGS:
        return US_RATINGS[label]

    tv_suffixes = '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES)
    tv_rating = re.match(r'^TV[_-]?(%s)$' % tv_suffixes, label)
    if tv_rating:
        return TV_PARENTAL_GUIDELINES['TV-' + tv_rating.group(1)]
    return None
146c80e2
S
2965
2966
def strip_jsonp(code):
    """Strip a JSONP wrapper ("callback(...);") and return the wrapped payload.

    Input that does not look like a JSONP call is returned unchanged.
    """
    jsonp_re = r'''(?sx)^
        (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
        (?:\s*&&\s*(?P=func_name))?
        \s*\(\s*(?P<callback_data>.*)\);?
        \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_re, r'\g<callback_data>', code)
478c2c61
PH
2975
2976
def js_to_json(code, vars={}):
    """Rewrite a JavaScript object/array literal so that it becomes JSON-like.

    Tokens are rewritten one by one: identifiers and single-quoted strings
    become double-quoted strings, hex/octal integers become decimal,
    undefined and "void 0" become null, comments, "!" runs and trailing
    commas are removed.
    """
    # vars is a dict of var, val pairs to substitute
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )
    # Replacements applied to escape sequences / bare quotes inside string literals
    STRING_FIXES = {
        '"': '\\"',
        "\\'": "'",
        '\\\n': '',
        '\\x': '\\u00',
    }

    def fix_string_token(match):
        token = match.group(0)
        return STRING_FIXES.get(token, token)

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v in ('undefined', 'void 0'):
            return 'null'
        if v.startswith(('/*', '//', '!')) or v == ',':
            return ""

        if v[0] in ("'", '"'):
            # String literal: normalize its content and re-quote with double quotes
            v = re.sub(r'(?s)\\.|"', fix_string_token, v[1:-1])
            return '"%s"' % v

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # An integer followed by ":" is an object key and must be quoted
                return '"%d":' % i if v.endswith(':') else '%d' % i

        if v in vars:
            return vars[v]

        return '"%s"' % v

    token_re = r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE)
    return re.sub(token_re, fix_kv, code)
e05f6939
PH
3023
3024
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # The position in quality_ids is the quality; unknown ids rank lowest
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
3033
acd69589 3034
# Names of the stages at which a postprocessor can be scheduled
POSTPROCESS_WHEN = {'pre_process', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}


# Output templates keyed by output type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output types, each mapped to a string or None
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
0a871f68 3054
# As of [1] format syntax is:
#  %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
#
# Template for a verbose regex; fill it with str.format():
#  {0} - pattern for the mapping key, {1} - pattern for the conversion type
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])?  # unused in python
        {1}  # conversion type
    )
'''


# All conversion type characters of %-style string formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3073
7d1eb38a 3074
a020a0dc
PH
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    ELLIPSES = '...'
    if s is None:
        return None
    if len(s) <= length:
        return s
    # Keep the result at `length` characters including the ellipses
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
3083
3084
def version_tuple(v):
    """Split a version string on "." and "-" and return its parts as a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
48844745
PH
3087
3088
def is_outdated_version(version, limit, assume_new=True):
    """Return whether version is older than limit.

    A missing or unparsable version is treated as up to date when
    assume_new is true and as outdated otherwise.
    """
    verdict_if_unknown = not assume_new
    if not version:
        return verdict_if_unknown
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return verdict_if_unknown
732ea2f0
PH
3096
3097
def ytdl_is_updateable():
    """ Returns if yt-dlp can be updated with -U """

    # Imported here rather than at module level
    from .update import is_non_updateable

    non_updateable = is_non_updateable()
    return not non_updateable
7d4111ed
PH
3104
3105
def args_to_str(args):
    """Return the command line in args as one string with every argument shell-quoted."""
    # Get a short string representation for a subprocess command
    quoted = [compat_shlex_quote(a) for a in args]
    return ' '.join(quoted)
2ccd1b10
PH
3109
3110
def error_to_compat_str(err):
    """Return the string representation of the exception err."""
    # The former Python 2 only step (decoding a byte string with the preferred
    # encoding) was unreachable: this file already relies on Python 3 only
    # syntax, where str(err) is always text.
    return str(err)
3118
3119
def mimetype2ext(mt):
    """Map a MIME type (parameters after ";" are ignored) to a file extension.

    Lookup order: the full type, then the subtype, then the structured-syntax
    suffix after "+". As a last resort the subtype itself is returned with
    "+" replaced by ".". Returns None for None.
    """
    if mt is None:
        return None

    mt = mt.partition(';')[0].strip()

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }
    if mt in FULL_MAP:
        return FULL_MAP[mt]

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }
    subtype = mt.rpartition('/')[2]
    # Only this lookup is case-insensitive
    lowered = subtype.lower()
    if lowered in SUBTYPE_MAP:
        return SUBTYPE_MAP[lowered]

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }
    suffix = subtype.partition('+')[2]
    if suffix in SUFFIX_MAP:
        return SUFFIX_MAP[suffix]

    return subtype.replace('+', '.')
c460bdd5
PH
3182
3183
2814f12b
THD
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a bare extension or for a file name/URL; None if empty."""
    if not ext_or_url:
        return None
    # A value without a dot is taken to be a bare extension
    filename = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    mime_type, _encoding = mimetypes.guess_type(filename)
    return mime_type
3190
3191
def parse_codecs(codecs_str):
    """Split a comma-separated codecs string into video/audio/subtitle codecs.

    Returns a dict with 'vcodec', 'acodec' ('none' when absent),
    'dynamic_range' and, if a subtitle codec is present, 'tcodec'.
    If nothing is recognized but exactly two codecs are listed, they are
    taken as video and audio in that order; otherwise {} is returned.
    """
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}

    VIDEO_CODECS = ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                    'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe')
    AUDIO_CODECS = ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3',
                    'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl')
    TEXT_CODECS = ('stpp', 'wvtt',)

    stripped = (entry.strip() for entry in codecs_str.strip().strip(',').split(','))
    split_codecs = [entry for entry in stripped if entry]

    vcodec = acodec = tcodec = hdr = None
    for full_codec in split_codecs:
        parts = full_codec.split('.')
        # Zeros are dropped so that e.g. "vp09" and "av01" match "vp9" and "av1"
        codec = parts[0].replace('0', '')
        if codec in VIDEO_CODECS:
            if vcodec:
                continue
            if codec in ('vp9', 'av1', 'hvc1'):
                vcodec = '.'.join(parts[:4])
            else:
                vcodec = full_codec
            if codec in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
                hdr = 'HDR10'
            elif full_codec.replace('0', '').startswith('vp9.2'):
                hdr = 'HDR10'
        elif codec in AUDIO_CODECS:
            if not acodec:
                acodec = full_codec
        elif codec in TEXT_CODECS:
            if not tcodec:
                tcodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)

    if vcodec or acodec or tcodec:
        result = {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
        }
        if tcodec is not None:
            result['tcodec'] = tcodec
        return result
    if len(split_codecs) == 2:
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3233
3234
def urlhandle_detect_ext(url_handle):
    """Detect a file extension from the response headers of url_handle.

    The file name of an attachment Content-Disposition is preferred;
    otherwise the Content-Type is mapped with mimetype2ext.
    """
    headers = url_handle.headers

    disposition = headers.get('Content-Disposition')
    if disposition:
        attachment = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if attachment:
            ext = determine_ext(attachment.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(headers.get('Content-Type'))
05900629
PH
3247
3248
1e399778
YCH
def encode_data_uri(data, mime_type):
    """Return a base64 "data:" URI embedding the bytes data with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
3251
3252
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set by the user, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
3261
3262
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # The 4-byte UTF-32 marks must be tried before the UTF-16 marks they start with
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    payload, encoding = first_bytes, 'utf-8'
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            payload, encoding = first_bytes[len(bom):], enc
            break

    text = payload.decode(encoding, 'replace')
    # Returns the match object (truthy) or None
    return re.match(r'^\s*<', text)
a055469f
PH
3281
3282
def determine_protocol(info_dict):
    """Return the 'protocol' of info_dict, or derive one from its 'url'.

    The URL prefix (rtmp, mms, rtsp) is checked first, then the extension
    (m3u8, f4m); otherwise the URL scheme is returned.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    for manifest_ext in ('m3u8', 'f4m'):
        if ext == manifest_ext:
            return manifest_ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
3303
3304
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a tab character will be right aligned.

    delim, if truthy, is repeated to build a separator row below the header.
    hide_empty drops the columns in which every data cell has zero width.
    Note that the cells of the rendered rows are padded in place. """
    def width(string):
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(cell)) for cell in column) for column in zip(*table)]

    def filter_using_list(row, keep_flags):
        return [cell for keep, cell in zip(keep_flags, row) if keep]

    if hide_empty:
        data_widths = get_max_lens(data)
        header_row = filter_using_list(header_row, data_widths)
        data = [filter_using_list(row, data_widths) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        delim_row = [delim * (ml + extra_gap) for ml in max_lens]
        delim_row[-1] = delim_row[-1][:-extra_gap]  # Remove extra_gap from end of delimiter
        table = [header_row, delim_row] + data
    for row in table:
        for pos, text in enumerate(map(str, row)):
            padding = max_lens[pos] - width(text)
            if '\t' in text:
                # The tab expands so that the text after it ends at the column edge
                row[pos] = text.replace('\t', ' ' * padding) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (padding + extra_gap)
    return '\n'.join(''.join(row).rstrip() for row in table)
347de493
PH
3336
3337
8f18aca8 3338def _match_one(filter_part, dct, incomplete):
77b87f05 3339 # TODO: Generalize code with YoutubeDL._build_format_filter
a047eeb6 3340 STRING_OPERATORS = {
3341 '*=': operator.contains,
3342 '^=': lambda attr, value: attr.startswith(value),
3343 '$=': lambda attr, value: attr.endswith(value),
3344 '~=': lambda attr, value: re.search(value, attr),
3345 }
347de493 3346 COMPARISON_OPERATORS = {
a047eeb6 3347 **STRING_OPERATORS,
3348 '<=': operator.le, # "<=" must be defined above "<"
347de493 3349 '<': operator.lt,
347de493 3350 '>=': operator.ge,
a047eeb6 3351 '>': operator.gt,
347de493 3352 '=': operator.eq,
347de493 3353 }
a047eeb6 3354
347de493
PH
3355 operator_rex = re.compile(r'''(?x)\s*
3356 (?P<key>[a-z_]+)
77b87f05 3357 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
347de493 3358 (?:
a047eeb6 3359 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3360 (?P<strval>.+?)
347de493
PH
3361 )
3362 \s*$
3363 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3364 m = operator_rex.search(filter_part)
3365 if m:
18f96d12 3366 m = m.groupdict()
3367 unnegated_op = COMPARISON_OPERATORS[m['op']]
3368 if m['negation']:
77b87f05
MT
3369 op = lambda attr, value: not unnegated_op(attr, value)
3370 else:
3371 op = unnegated_op
18f96d12 3372 comparison_value = m['quotedstrval'] or m['strval'] or m['intval']
3373 if m['quote']:
3374 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3375 actual_value = dct.get(m['key'])
3376 numeric_comparison = None
3377 if isinstance(actual_value, compat_numeric_types):
e5a088dc
S
3378 # If the original field is a string and matching comparisonvalue is
3379 # a number we should respect the origin of the original field
3380 # and process comparison value as a string (see
18f96d12 3381 # https://github.com/ytdl-org/youtube-dl/issues/11082)
347de493 3382 try:
18f96d12 3383 numeric_comparison = int(comparison_value)
347de493 3384 except ValueError:
18f96d12 3385 numeric_comparison = parse_filesize(comparison_value)
3386 if numeric_comparison is None:
3387 numeric_comparison = parse_filesize(f'{comparison_value}B')
3388 if numeric_comparison is None:
3389 numeric_comparison = parse_duration(comparison_value)
3390 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3391 raise ValueError('Operator %s only supports string values!' % m['op'])
347de493 3392 if actual_value is None:
18f96d12 3393 return incomplete or m['none_inclusive']
3394 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
347de493
PH
3395
3396 UNARY_OPERATORS = {
1cc47c66
S
3397 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3398 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
347de493
PH
3399 }
3400 operator_rex = re.compile(r'''(?x)\s*
3401 (?P<op>%s)\s*(?P<key>[a-z_]+)
3402 \s*$
3403 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3404 m = operator_rex.search(filter_part)
3405 if m:
3406 op = UNARY_OPERATORS[m.group('op')]
3407 actual_value = dct.get(m.group('key'))
8f18aca8 3408 if incomplete and actual_value is None:
3409 return True
347de493
PH
3410 return op(actual_value)
3411
3412 raise ValueError('Invalid filter part %r' % filter_part)
3413
3414
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
    When incomplete, all conditions passes on missing fields
    """
    # Conditions are joined with "&"; a literal ampersand is written as "\&"
    for filter_part in re.split(r'(?<!\\)&', filter_str):
        if not _match_one(filter_part.replace(r'\&', '&'), dct, incomplete):
            return False
    return True
347de493
PH
3422
3423
def match_filter_func(filter_str):
    """Return a function that checks an info dict against filter_str.

    The returned function gives None when the dict passes the filter and a
    message explaining why it is skipped otherwise.
    """
    def _match_func(info_dict, *args, **kwargs):
        if match_str(filter_str, info_dict, *args, **kwargs):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
91410c9b
PH
3432
3433
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP time expression into seconds (float).

    Supported forms are an offset such as "12.5" or "12.5s" and a clock time
    "H:MM:SS", optionally followed by a "." or ":" separated fraction.
    Returns None for empty or unsupported input.
    """
    if not time_expr:
        return None

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
    return None
bf6427d2
YCH
3445
3446
def srt_subtitles_timecode(seconds):
    """Format a time in seconds as an SRT timecode (HH:MM:SS,mmm)."""
    timetuple = timetuple_from_msec(seconds * 1000)
    return '%02d:%02d:%02d,%03d' % timetuple
3449
3450
def ass_subtitles_timecode(seconds):
    """Format a time in seconds as an ASS timecode (H:MM:SS.cc, centisecond precision)."""
    timetuple = timetuple_from_msec(seconds * 1000)
    centiseconds = timetuple.milliseconds / 10
    return '%01d:%02d:%02d.%02d' % (*timetuple[:-1], centiseconds)
bf6427d2
YCH
3454
3455
def dfxp2srt(dfxp_data):
    '''
    Convert DFXP/TTML subtitles to SRT, rendering the supported styling
    attributes as <font>, <b>, <i> and <u> tags.

    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    @raises ValueError if the document contains no paragraph elements
    '''
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}
    default_style = {}

    class TTMLPElementParser(object):
        """XMLParser target that renders one paragraph as SRT text."""

        def __init__(self):
            # Per-instance state: one parser is created per paragraph, and
            # state must not leak from one paragraph into the next
            self._out = ''
            self._unclosed_elements = []
            self._applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
                return

            unclosed_elements = []
            style = {}
            element_style_id = attrib.get('style')
            if default_style:
                style.update(default_style)
            if element_style_id:
                style.update(styles.get(element_style_id, {}))
            for prop in SUPPORTED_STYLING:
                prop_val = attrib.get(_x('tts:' + prop))
                if prop_val:
                    style[prop] = prop_val
            if style:
                font = ''
                for k, v in sorted(style.items()):
                    # Skip whatever an enclosing element has applied already
                    if self._applied_styles and self._applied_styles[-1].get(k) == v:
                        continue
                    if k == 'color':
                        font += ' color="%s"' % v
                    elif k == 'fontSize':
                        font += ' size="%s"' % v
                    elif k == 'fontFamily':
                        font += ' face="%s"' % v
                    elif k == 'fontWeight' and v == 'bold':
                        self._out += '<b>'
                        unclosed_elements.append('b')
                    elif k == 'fontStyle' and v == 'italic':
                        self._out += '<i>'
                        unclosed_elements.append('i')
                    elif k == 'textDecoration' and v == 'underline':
                        self._out += '<u>'
                        unclosed_elements.append('u')
                if font:
                    self._out += '<font' + font + '>'
                    unclosed_elements.append('font')
            # Push exactly one entry per opened element on both stacks so
            # that end() can pop them unconditionally and they stay in sync
            applied_style = {}
            if self._applied_styles:
                applied_style.update(self._applied_styles[-1])
            applied_style.update(style)
            self._applied_styles.append(applied_style)
            self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    # Map the legacy namespaces to the current ones before parsing
    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance. A style can only be resolved once its parent
    # is known, so passes are repeated until nothing is pending. The loop also
    # stops when a pass resolves nothing new (e.g. a reference to a style that
    # does not exist), which would otherwise repeat forever.
    previously_pending = None
    while True:
        pending = set()
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    pending.add(style_id)
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if not pending or pending == previously_pending:
            break
        previously_pending = pending

    # Styles referenced by <body> and <div> apply to every paragraph
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3618
3619
66e289ba
S
def cli_option(params, command_option, param):
    """Build the command line arguments for an option that takes a value.

    Returns [command_option, value] with the value of params[param]
    converted to a string, or [] when the parameter is missing or None.
    """
    value = params.get(param)
    if value is None:
        return []
    # Convert every non-None value (including falsy ones such as 0) so that
    # only strings end up in the argument list
    return [command_option, compat_str(value)]
3625
3626
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Build the command line arguments for a boolean option.

    Returns [] when params[param] is missing or None. Otherwise returns the
    option and true_value/false_value as two arguments, or joined into a
    single argument by separator when one is given.
    """
    flag = params.get(param)
    if flag is None:
        return []
    assert isinstance(flag, bool)
    rendered = true_value if flag else false_value
    if separator:
        return [command_option + separator + rendered]
    return [command_option, rendered]
3635
3636
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] if params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
3640
3641
def cli_configuration_args(argdict, keys, default=None, use_compat=True):
    """Select extra command line arguments from argdict.

    argdict maps lower-case keys to argument lists. Each entry of keys is a
    key or a group of keys; the first entry with at least one key present
    in argdict wins and the argument lists of its keys are concatenated.
    Returns default (a new empty list if not given) when nothing matches
    or argdict is None.
    For backward compatibility a list/tuple argdict is returned as is when
    use_compat is true and treated as None otherwise.
    """
    if default is None:
        # A fresh list per call: the result is handed to the caller, so a
        # shared default could be modified from the outside
        default = []

    if isinstance(argdict, (list, tuple)):  # for backward compatibility
        if use_compat:
            return argdict
        else:
            argdict = None
    if argdict is None:
        return default
    assert isinstance(argdict, dict)

    assert isinstance(keys, (list, tuple))
    for key_list in keys:
        arg_list = list(filter(
            lambda x: x is not None,
            [argdict.get(key.lower()) for key in variadic(key_list)]))
        if arg_list:
            return [arg for args in arg_list for arg in args]
    return default
66e289ba 3660
6251555f 3661
def _configuration_args(main_key, argdict, exe, keys=None, default=None, use_compat=True):
    """Look up the extra arguments configured for main_key and exe.

    Builds the list of candidate keys ("exe" or "main_key+exe", each with
    the given suffixes) and delegates the lookup to cli_configuration_args.
    When the bare root key is among the candidates, (main_key, exe) and
    'default' are added as fallbacks; otherwise the backward-compatible
    handling of a list argdict is disabled.
    """
    if default is None:
        # A fresh list per call, since it may be returned to the caller
        default = []
    main_key, exe = main_key.lower(), exe.lower()
    root_key = exe if main_key == exe else f'{main_key}+{exe}'
    keys = [f'{root_key}{k}' for k in (keys or [''])]
    if root_key in keys:
        if main_key != exe:
            keys.append((main_key, exe))
        keys.append('default')
    else:
        use_compat = False
    return cli_configuration_args(argdict, keys, default, use_compat)
3673
66e289ba 3674
39672624
YCH
class ISO639Utils(object):
    """Conversions between two-letter and three-letter language codes."""

    # Two-letter code -> three-letter code
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T

        Only the first two characters of `code` are looked at, so e.g. a
        region suffix is ignored. Returns None for unknown codes.
        """
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1

        Returns the first two-letter code (in table order) mapped to `code`,
        or None when there is none.
        """
        matches = (
            short_name
            for short_name, long_name in cls._lang_map.items()
            if long_name == code)
        return next(matches, None)
3878
3879
4eb10f66
YCH
class ISO3166Utils(object):
    """Lookup of country names by two-letter country code."""

    # Upper-case two-letter country code -> country name
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Return the full country name for a two-letter country code.

        The lookup is case-insensitive; None is returned for unknown codes.
        """
        normalized = code.upper()
        return cls._country_map.get(normalized)
4138
4139
773f291d
S
class GeoUtils(object):
    """Helpers for producing IPv4 addresses that belong to a given country."""

    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random IPv4 address (dotted string) from a country or CIDR block.

        A two-character argument is treated as a country code and looked up
        (case-insensitively) in `_country_ip_map`; None is returned when the
        country is unknown. Any other argument is used directly as an
        `address/prefix-length` block.
        """
        if len(code_or_block) != 2:
            block = code_or_block
        else:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        network, prefix_len = block.split('/')
        lowest = compat_struct_unpack('!L', socket.inet_aton(network))[0]
        # Setting every host bit yields the last address of the block
        highest = lowest | (0xffffffff >> int(prefix_len))
        chosen = random.randint(lowest, highest)
        return compat_str(socket.inet_ntoa(compat_struct_pack('!L', chosen)))
773f291d
S
4398
4399
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """Proxy handler that lets an individual request choose its own proxy.

    A request carrying a `Ytdl-request-proxy` header uses that value instead
    of the handler-wide proxy; the header is removed before the request is
    passed on.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        for scheme in ('http', 'https'):
            default_open = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, default_open)
        compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        """Route `req` through `proxy`, honouring a per-request override.

        Returns None (no proxying at this level) when the effective proxy is
        the `__noproxy__` marker or a SOCKS proxy; otherwise defers to the
        base class implementation.
        """
        override = req.headers.get('Ytdl-request-proxy')
        if override is not None:
            del req.headers['Ytdl-request-proxy']
            proxy = override

        if proxy == '__noproxy__':
            return None  # No Proxy

        proxy_scheme = compat_urlparse.urlparse(proxy).scheme.lower()
        if proxy_scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # yt-dlp's http/https handlers do wrapping the socket with socks
            return None

        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5bc880b9
YCH
4423
4424
0a5445dd
YCH
4425# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4426# released into Public Domain
4427# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4428
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:int, blocksize:int) : bytes
    Convert an integer to a big-endian byte string.

    The result has no leading zero bytes, except that zero itself (and, for
    backward compatibility, any negative value) is encoded as a single NUL
    byte.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    n = int(n)
    if n > 0:
        # Minimal number of bytes able to hold n
        s = n.to_bytes((n.bit_length() + 7) // 8, 'big')
    else:
        s = b'\000'
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4457
4458
def bytes_to_long(s):
    """bytes_to_long(bytes) : int
    Convert a big-endian byte string to a non-negative integer.

    This is (essentially) the inverse of long_to_bytes(). An empty byte
    string yields 0.
    """
    return int.from_bytes(s, 'big')
4474
4475
5bc880b9
YCH
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The byte order is reversed before the data is read as one big integer
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    return format(pow(payload, exponent, modulus), 'x')
81bdc8fd
YCH
4491
4492
f48409c7
YCH
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    The result is `[0, 2] + padding + [0] + data`, where the padding consists
    of random non-zero byte values. The padding must not contain a zero
    because the first zero after the `[0, 2]` prefix marks the start of the
    data.

    @param {int[]} data        input data
    @param {int}   length      target length
    @returns {int[]}           padded data
    @raises ValueError         if data leaves less than 8 bytes for padding
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    padding = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + padding + [0] + data
4506
4507
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer `num` as a base-`n` string.

    `table` supplies the digit characters (digit value == index). When it is
    omitted, the first `n` characters of 0-9, a-z, A-Z are used, which limits
    `n` to 62.

    Raises ValueError if `n` exceeds the table length, if `num` is negative,
    or if `n` is smaller than 2 for a non-zero `num`. The last two cases
    would otherwise never terminate.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)
    if n < 2:
        raise ValueError('base must be at least 2, got %d' % n)

    digits = []
    while num:
        num, remainder = divmod(num, n)
        digits.append(table[remainder])
    # Digits were produced least-significant first
    return ''.join(reversed(digits))
f52354a8
YCH
4524
4525
def decode_packed_codes(code):
    """Expand JavaScript that was compressed into the form matched by PACKED_CODES_RE.

    The match provides the obfuscated code, a numeric base, a symbol count
    and a '|'-separated symbol list. Every word in the obfuscated code is
    the base-`base` encoding of an index into that list and is replaced by
    the corresponding symbol; an empty symbol means the word stands for
    itself.
    """
    obfuscated_code, base, count, symbols = re.search(PACKED_CODES_RE, code).groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    for index in reversed(range(count)):
        encoded_index = encode_base_n(index, base)
        symbol_table[encoded_index] = symbols[index] or encoded_index

    def _expand(word_match):
        return symbol_table[word_match.group(0)]

    return re.sub(r'\b(\w+)\b', _expand, obfuscated_code)
e154c651 4542
4543
1ced2221
S
def caesar(s, alphabet, shift):
    """Apply a Caesar cipher to `s`.

    Each character of `s` that occurs in `alphabet` is replaced by the
    character `shift` positions further along (wrapping around); all other
    characters are kept. A shift of 0 returns `s` unchanged.
    """
    if shift == 0:
        return s
    size = len(alphabet)
    # Build the substitution table once instead of searching the alphabet for
    # every character. setdefault keeps the first occurrence of a repeated
    # alphabet character, like str.index would.
    substitution = {}
    for position, char in enumerate(alphabet):
        substitution.setdefault(char, alphabet[(position + shift) % size])
    return ''.join(substitution.get(c, c) for c in s)
4551
4552
def rot47(s):
    """Apply the ROT47 substitution: shift by 47 within the alphabet below."""
    alphabet = r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'''
    return caesar(s, alphabet, 47)
4555
4556
def parse_m3u8_attributes(attrib):
    """Parse a comma-separated `KEY=value` attribute list into a dict.

    Values may be enclosed in double quotes (and may then contain commas);
    the surrounding quotes are removed. All values are returned as strings.
    """
    pairs = re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib)
    return {
        key: (val[1:-1] if val.startswith('"') else val)
        for key, val in pairs
    }
1143535d
YCH
4564
4565
def urshift(val, n):
    """Shift `val` right by `n` bits, treating a negative `val` as unsigned 32-bit."""
    if val < 0:
        val += 0x100000000
    return val >> n
d3f8e038
YCH
4568
4569
4570# Based on png2str() written by @gdkchan and improved by @yokrysty
067aa17e 4571# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
d3f8e038
YCH
def decode_png(png_data):
    """Decode a PNG image into raw colour samples.

    Returns `(width, height, pixels)`, where `pixels` is a list of `height`
    rows and every row is a list of `width * 3` integers (0-255).

    Every scanline is read as `width * 3` bytes with 3 bytes per pixel, and
    no IHDR field other than width and height is examined, so the image is
    assumed to be of that layout (8-bit RGB).

    Raises IOError if the PNG signature or leading IHDR chunk is missing, or
    if there is no image data.
    """
    # Reference: https://www.w3.org/TR/PNG/
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or png_data[12:16] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # Walk the chunk list by offset instead of re-slicing the remaining data.
    # Chunk layout: 4-byte big-endian length, 4-byte type, data, 4-byte CRC.
    chunks = []
    pos, end = 8, len(png_data)
    while pos < end:
        length = int.from_bytes(png_data[pos:pos + 4], 'big')
        chunk_type = png_data[pos + 4:pos + 8]
        chunk_data = png_data[pos + 8:pos + 8 + length]
        chunks.append((chunk_type, chunk_data))
        pos += 8 + length + 4  # the CRC is skipped, not verified

    ihdr = chunks[0][1]
    width = int.from_bytes(ihdr[:4], 'big')
    height = int.from_bytes(ihdr[4:8], 'big')

    # The compressed image stream is the concatenation of all IDAT chunks
    idat = b''.join(data for chunk_type, data in chunks if chunk_type == b'IDAT')
    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = zlib.decompress(idat)

    stride = width * 3
    pixels = []
    previous_row = None

    for y in range(height):
        # Each scanline is one filter-type byte followed by `stride` samples
        base_pos = y * (1 + stride)
        filter_type = decompressed_data[base_pos]

        current_row = []
        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + base_pos + x]
            # Neighbours used by the filters: same sample of the pixel to the
            # left and of the pixel above (0 when outside the image)
            left = current_row[x - 3] if x > 2 else 0
            up = previous_row[x] if y > 0 else 0

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = previous_row[x - 3] if x > 2 and y > 0 else 0

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

        previous_row = current_row

    return width, height, pixels
efa97bdc
YCH
4675
4676
def write_xattr(path, key, value):
    """Set the extended attribute `key` of the file `path` to `value` (bytes).

    Tries, in order: the Python `xattr` module (either flavour); if that
    cannot be imported, NTFS Alternate Data Streams on Windows, or the
    `setfattr` / `xattr` command line tools elsewhere.

    Raises XAttrMetadataError when writing fails and XAttrUnavailableError
    when no usable implementation is found (or pyxattr is too old).
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if not hasattr(xattr, 'set'):  # xattr
            setxattr = xattr.setxattr
        else:  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'yt-dlp requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
            return

        user_has_setfattr = check_executable('setfattr', ['--version'])
        user_has_xattr = check_executable('xattr', ['-h'])

        if not (user_has_setfattr or user_has_xattr):
            # On Unix, and can't find pyxattr, setfattr, or xattr.
            if sys.platform.startswith('linux'):
                raise XAttrUnavailableError(
                    "Couldn't find a tool to set the xattrs. "
                    "Install either the python 'pyxattr' or 'xattr' "
                    "modules, or the GNU 'attr' package "
                    "(which contains the 'setfattr' tool).")
            raise XAttrUnavailableError(
                "Couldn't find a tool to set the xattrs. "
                "Install either the python 'xattr' module, "
                "or the 'xattr' binary.")

        value = value.decode('utf-8')
        # setfattr is preferred when both tools are available
        if user_has_setfattr:
            executable, opts = 'setfattr', ['-n', key, '-v', value]
        else:
            executable, opts = 'xattr', ['-w', key, value]

        cmd = ([encodeFilename(executable, True)]
               + [encodeArgument(o) for o in opts]
               + [encodeFilename(path, True)])

        try:
            p = Popen(
                cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)
        stdout, stderr = p.communicate_or_kill()
        stderr = stderr.decode('utf-8', 'replace')
        if p.returncode != 0:
            raise XAttrMetadataError(p.returncode, stderr)
0c265486
YCH
4759
4760
def random_birthday(year_field, month_field, day_field):
    """Return a random date between 1950-01-01 and 1995-12-31 as form fields.

    The result maps the three given field names to the year, month and day
    of the date, each as a string without zero padding.
    """
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    birthday = earliest + datetime.timedelta(random.randint(0, span_days))
    return {
        year_field: str(birthday.year),
        month_field: str(birthday.month),
        day_field: str(birthday.day),
    }
732044af 4771
c76eb41b 4772
# Templates for internet shortcut files, which are plain text files.
# Each one is filled in with the %-operator and a mapping that provides
# `url` (and `filename` for the .desktop variant).
DOT_URL_LINK_TEMPLATE = (
    '[InternetShortcut]\n'
    'URL=%(url)s\n'
)

DOT_WEBLOC_LINK_TEMPLATE = (
    '<?xml version="1.0" encoding="UTF-8"?>\n'
    '<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">\n'
    '<plist version="1.0">\n'
    '<dict>\n'
    '\t<key>URL</key>\n'
    '\t<string>%(url)s</string>\n'
    '</dict>\n'
    '</plist>\n'
)

DOT_DESKTOP_LINK_TEMPLATE = (
    '[Desktop Entry]\n'
    'Encoding=UTF-8\n'
    'Name=%(filename)s\n'
    'Type=Link\n'
    'URL=%(url)s\n'
    'Icon=text-html\n'
)

# Shortcut file type -> template
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4804
732044af 4805
def iri_to_uri(iri):
    """
    Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).

    The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.

    An explicit port is kept, except for port 80 on `http` URLs, where it is the scheme's default and therefore redundant. IRIs without a host part (e.g. relative references) are supported.

    Raises ValueError for IPv6 hosts, which are not supported.
    """

    iri_parts = compat_urllib_parse_urlparse(iri)

    if '[' in iri_parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
    # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.

    net_location = ''
    if iri_parts.username:
        net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
        if iri_parts.password is not None:
            net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
        net_location += '@'

    hostname = iri_parts.hostname
    if hostname is not None:  # None when the IRI has no host part
        net_location += hostname.encode('idna').decode('utf-8')  # Punycode for Unicode hostnames.
        # The 'idna' encoding produces ASCII text.

    port = iri_parts.port
    # Port 80 may only be omitted for http; for any other scheme (e.g. https)
    # it is a non-default port and dropping it would change the target.
    is_default_http_port = port == 80 and iri_parts.scheme == 'http'
    if port is not None and not is_default_http_port:
        net_location += ':' + str(port)

    return compat_urllib_parse_urlunparse(
        (iri_parts.scheme,
            net_location,

            compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),

            # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
            compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),

            # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
            compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),

            compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4848
4849
def to_high_limit_path(path):
    """Return `path` prefixed for long-path access on Windows; unchanged on any other platform."""
    if sys.platform not in ('win32', 'cygwin'):
        return path

    # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
    return '\\\\?\\' + os.path.abspath(path)
76d321f6 4856
c76eb41b 4857
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    """Format a value with `template`, falling back to `default` for ignorable values.

    @param obj       The value itself, or (when `field` is given) a mapping to read it from
    @param field     Key to look up in `obj` using `obj.get`; None to use `obj` directly
    @param template  %-style template applied to the value
    @param ignore    Values for which `default` is returned instead of a formatted string
    @param default   Fallback used when the value is missing or ignorable
    @param func      Optional callable applied to a non-ignorable value before formatting
    """
    if field is not None:
        value = obj.get(field, default)
    elif obj is not None:
        value = obj
    else:
        value = default

    if func and value not in ignore:
        value = func(value)
    # Checked again because `func` may itself return an ignorable value
    if value in ignore:
        return default
    return template % value
00dd0cd5 4866
4867
def clean_podcast_url(url):
    """Return `url` with every known podcast tracking prefix removed."""
    tracking_prefix = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(tracking_prefix, '', url)
ffcb8191
THD
4883
4884
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random version-4 UUID as a lowercase, hyphenated string.

    The version nibble is fixed to `4`, and the variant nibble is restricted
    to `8`, `9`, `a` or `b` as RFC 4122 requires. All other digits are random.
    """
    def _fill(match):
        # `y` marks the variant nibble (binary 10xx); `x` is any hex digit
        return random.choice('89ab' if match.group() == 'y' else _HEX_TABLE)

    return re.sub(r'[xy]', _fill, 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
0202b52a 4890
4891
def make_dir(path, to_screen=None):
    """Ensure that the parent directory of `path` exists.

    @param path       Path of a file whose containing directory should exist
    @param to_screen  Optional callable that receives a message if creation fails
    @returns          True on success (or if nothing had to be created), False on failure
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            # exist_ok guards against the directory appearing between the check and the call
            os.makedirs(dn, exist_ok=True)
        return True
    except (OSError, IOError) as err:
        # Report only when a reporter was actually supplied
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
f74980cb 4902
4903
def get_executable_path():
    """Return the absolute path of the directory the program is considered to run from.

    This is the directory of `sys.executable` when `sys.frozen` is set;
    otherwise it is resolved relative to this file (two levels up when the
    module was loaded by a zipimporter, else one level up).
    """
    from zipimport import zipimporter

    if hasattr(sys, 'frozen'):  # Running from PyInstaller
        return os.path.abspath(os.path.dirname(sys.executable))

    module_dir = os.path.dirname(__file__)
    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)  # Running from ZIP
    return os.path.abspath(os.path.join(module_dir, '../..' if loaded_from_zip else '..'))
4913
4914
def load_plugins(name, suffix, namespace):
    """Load plugin objects from `ytdlp_plugins/<name>/__init__.py`.

    The plugin package is looked up under the directory returned by
    get_executable_path(). A missing file is silently ignored.

    @param name       Name of the plugin package; also used as its module name
    @param suffix     Only attributes whose name ends with this are collected
    @param namespace  Dict that is updated with the collected attributes;
                      names already present in it are skipped
    @returns          Dict of the newly collected attributes, by name
    """
    classes = {}
    try:
        plugins_spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        plugins = importlib.util.module_from_spec(plugins_spec)
        # Registered before execution, as the import machinery does for regular imports
        sys.modules[plugins_spec.name] = plugins
        plugins_spec.loader.exec_module(plugins)
        for attr_name in dir(plugins):
            if attr_name in namespace or not attr_name.endswith(suffix):
                continue
            classes[attr_name] = namespace[attr_name] = getattr(plugins, attr_name)
    except FileNotFoundError:
        pass
    return classes
06167fbb 4933
4934
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a string,
                            a function, a tuple of strings/None or "...".
                            When a function is given, it takes the key as argument and
                            returns whether the key matches or not. When a tuple is given,
                            all the keys given in the tuple are traversed, and
                            "..." traverses all the keys in the object
                            "None" returns the object without traversal
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    @returns                The value found by the first path that yields a result, else `default`
    # TODO: Write tests
    '''
    if not casesense:
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        # `depth` records the deepest level of branching ("...", tuple or function keys)
        # reached for the current path, so that the caller can flatten the nested lists
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                # Traverse each alternative, then branch over the collected results
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                # The function is called with the dict key or the sequence index
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    # Pair each character with its index; a bare string cannot be unpacked into (k, v)
                    obj = enumerate(str(obj))
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        # A bare ":" selects everything, which is what "..." does
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    elif expected_type is not None:
        type_test = expected_type
    else:
        type_test = lambda val: val

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # Every level of branching added one level of list nesting; flatten all but the last
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
324ad820 5032
5033
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated: writes a deprecation notice, then delegates to traverse_obj."""
    write_string(
        'DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
        'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    # The keys are treated as user input, and strings are traversed as well
    result = traverse_obj(
        dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
    return result
6606817a 5038
5039
def variadic(x, allowed_types=(str, bytes, dict)):
    """Return `x` unchanged if it is an iterable to be treated as a collection, else wrap it in a 1-tuple.

    @param x              Any object
    @param allowed_types  Iterable types that count as a single value, and so get wrapped
    """
    if not isinstance(x, collections.abc.Iterable) or isinstance(x, allowed_types):
        return (x,)
    return x
bd50a52b
THD
5042
5043
49fa4d9a
N
# create a JSON Web Signature (jws) with HS256 algorithm
# the resulting format is in JWS Compact Serialization
# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    """Build an HS256-signed token of the form `header.payload.signature`.

    NOTE: Each part is encoded with the standard base64 alphabet, including `=` padding.

    @param payload_data  JSON-serializable payload
    @param key           Secret used for the HMAC-SHA256 signature, as a string
    @param headers       Extra header fields; these override the default `alg` and `typ`
    @returns             The token as bytes
    """
    header_data = {'alg': 'HS256', 'typ': 'JWT'}
    if headers:
        header_data.update(headers)

    def _b64_json(data):
        return base64.b64encode(json.dumps(data).encode('utf-8'))

    signing_input = b'.'.join((_b64_json(header_data), _b64_json(payload_data)))
    digest = hmac.new(key.encode('utf-8'), signing_input, hashlib.sha256).digest()
    return signing_input + b'.' + base64.b64encode(digest)
819e0531 5061
5062
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Return the decoded JSON payload of a JWT. The signature is NOT verified.

    @param jwt  Token string of the form `header.payload.signature`
    @returns    The payload parsed from JSON
    @raises ValueError  If the token does not have exactly three dot-separated parts
    """
    _header_b64, payload_b64, _signature_b64 = jwt.split('.')
    # JWS segments have their trailing `=` padding stripped, but the decoder requires it
    payload_b64 += '=' * (-len(payload_b64) % 4)
    return json.loads(base64.urlsafe_b64decode(payload_b64))
5068
5069
def supports_terminal_sequences(stream):
    """Return whether terminal escape sequences can be used when writing to `stream`.

    On Windows (`compat_os_name == 'nt'`) this requires WINDOWS_VT_MODE and
    Windows version 10.0.10586 or later; elsewhere the TERM environment
    variable must be set. In both cases `stream` must also be a tty.
    """
    if compat_os_name != 'nt':
        if not os.getenv('TERM'):
            return False
    else:
        from .compat import WINDOWS_VT_MODE  # Must be imported locally
        if not (WINDOWS_VT_MODE and get_windows_version() >= (10, 0, 10586)):
            return False

    try:
        return stream.isatty()
    except BaseException:
        return False
5081
5082
# Matches ESC `[`, one or more characters other than `m`, then `m`
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Return `string` with every match of `_terminal_sequences_re` removed."""
    cleaned, _ = _terminal_sequences_re.subn('', string)
    return cleaned
5088
5089
def number_of_digits(number):
    """Return the length of `number` formatted with `%d` (a minus sign is counted too)."""
    rendered = '%d' % number
    return len(rendered)
34921b43 5092
5093
def join_nonempty(*values, delim='-', from_dict=None):
    """Join the string forms of all truthy values with `delim`.

    @param values     The values to join; or, if `from_dict` is given, the keys to look up in it
    @param delim      Separator placed between the joined values
    @param from_dict  Optional mapping from which the values are fetched using `.get`
    """
    if from_dict is not None:
        values = (from_dict.get(key) for key in values)
    return delim.join(str(value) for value in values if value)
06e57990 5098
5099
5100class Config:
5101 own_args = None
5102 filename = None
5103 __initialized = False
5104
5105 def __init__(self, parser, label=None):
5106 self._parser, self.label = parser, label
5107 self._loaded_paths, self.configs = set(), []
5108
5109 def init(self, args=None, filename=None):
5110 assert not self.__initialized
5111 if filename:
5112 location = os.path.realpath(filename)
5113 if location in self._loaded_paths:
5114 return False
5115 self._loaded_paths.add(location)
5116
5117 self.__initialized = True
5118 self.own_args, self.filename = args, filename
5119 for location in self._parser.parse_args(args)[0].config_locations or []:
5120 location = compat_expanduser(location)
5121 if os.path.isdir(location):
5122 location = os.path.join(location, 'yt-dlp.conf')
5123 if not os.path.exists(location):
5124 self._parser.error(f'config location {location} does not exist')
5125 self.append_config(self.read_file(location), location)
5126 return True
5127
5128 def __str__(self):
5129 label = join_nonempty(
5130 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5131 delim=' ')
5132 return join_nonempty(
5133 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5134 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5135 delim='\n')
5136
5137 @staticmethod
5138 def read_file(filename, default=[]):
5139 try:
5140 optionf = open(filename)
5141 except IOError:
5142 return default # silently skip if file is not present
5143 try:
5144 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5145 contents = optionf.read()
5146 if sys.version_info < (3,):
5147 contents = contents.decode(preferredencoding())
5148 res = compat_shlex_split(contents, comments=True)
5149 finally:
5150 optionf.close()
5151 return res
5152
5153 @staticmethod
5154 def hide_login_info(opts):
5155 PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
5156 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5157
5158 def _scrub_eq(o):
5159 m = eqre.match(o)
5160 if m:
5161 return m.group('key') + '=PRIVATE'
5162 else:
5163 return o
5164
5165 opts = list(map(_scrub_eq, opts))
5166 for idx, opt in enumerate(opts):
5167 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5168 opts[idx + 1] = 'PRIVATE'
5169 return opts
5170
5171 def append_config(self, *args, label=None):
5172 config = type(self)(self._parser, label)
5173 config._loaded_paths = self._loaded_paths
5174 if config.init(*args):
5175 self.configs.append(config)
5176
5177 @property
5178 def all_args(self):
5179 for config in reversed(self.configs):
5180 yield from config.all_args
5181 yield from self.own_args or []
5182
5183 def parse_args(self):
5184 return self._parser.parse_args(list(self.all_args))