]> jfr.im git - yt-dlp.git/blame - yt_dlp/utils.py
[adobepass] Add Suddenlink MSO (#2977)
[yt-dlp.git] / yt_dlp / utils.py
CommitLineData
cc52de43 1#!/usr/bin/env python3
dcdb292f 2# coding: utf-8
d77c3dfd 3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
da42679b 6import asyncio
15dfb392 7import atexit
1e399778 8import base64
5bc880b9 9import binascii
912b38b4 10import calendar
676eb3f2 11import codecs
c380cc28 12import collections
62e609ab 13import contextlib
e3946f98 14import ctypes
c496ca96
PH
15import datetime
16import email.utils
0c265486 17import email.header
f45c185f 18import errno
be4a824d 19import functools
d77c3dfd 20import gzip
49fa4d9a
N
21import hashlib
22import hmac
019a94f7 23import importlib.util
03f9daab 24import io
79a2e94e 25import itertools
f4bfd65f 26import json
d77c3dfd 27import locale
02dbf93f 28import math
347de493 29import operator
d77c3dfd 30import os
c496ca96 31import platform
773f291d 32import random
d77c3dfd 33import re
c496ca96 34import socket
79a2e94e 35import ssl
1c088fa8 36import subprocess
d77c3dfd 37import sys
181c8655 38import tempfile
c380cc28 39import time
01951dda 40import traceback
bcf89ce6 41import xml.etree.ElementTree
d77c3dfd 42import zlib
2814f12b 43import mimetypes
d77c3dfd 44
8c25f81b 45from .compat import (
b4a3d461 46 compat_HTMLParseError,
8bb56eee 47 compat_HTMLParser,
201c1459 48 compat_HTTPError,
8f9312c3 49 compat_basestring,
8c25f81b 50 compat_chr,
1bab3437 51 compat_cookiejar,
d7cd9a9e 52 compat_ctypes_WINFUNCTYPE,
36e6f62c 53 compat_etree_fromstring,
51098426 54 compat_expanduser,
8c25f81b 55 compat_html_entities,
55b2f099 56 compat_html_entities_html5,
be4a824d 57 compat_http_client,
42db58ec 58 compat_integer_types,
e29663c6 59 compat_numeric_types,
c86b6142 60 compat_kwargs,
efa97bdc 61 compat_os_name,
8c25f81b 62 compat_parse_qs,
06e57990 63 compat_shlex_split,
702ccf2d 64 compat_shlex_quote,
8c25f81b 65 compat_str,
edaa23f8 66 compat_struct_pack,
d3f8e038 67 compat_struct_unpack,
8c25f81b
PH
68 compat_urllib_error,
69 compat_urllib_parse,
15707c7e 70 compat_urllib_parse_urlencode,
8c25f81b 71 compat_urllib_parse_urlparse,
732044af 72 compat_urllib_parse_urlunparse,
73 compat_urllib_parse_quote,
74 compat_urllib_parse_quote_plus,
7581bfc9 75 compat_urllib_parse_unquote_plus,
8c25f81b
PH
76 compat_urllib_request,
77 compat_urlparse,
da42679b 78 compat_websockets,
810c10ba 79 compat_xpath,
8c25f81b 80)
4644ac55 81
71aff188
YCH
82from .socks import (
83 ProxyType,
84 sockssocket,
85)
86
4644ac55 87
51fb4995
YCH
def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    missing = [scheme for scheme in ('socks', 'socks4', 'socks4a', 'socks5')
               if scheme not in compat_urlparse.uses_netloc]
    compat_urlparse.uses_netloc.extend(missing)
95
96
468e2e92
FV
97# This is not clearly defined otherwise
98compiled_regex_type = type(re.compile(''))
99
f7a147e3
S
100
def random_user_agent():
    """Build a Chrome-on-Windows User-Agent string using a randomly chosen
    real Chrome version number."""
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    # Real released Chrome versions (90..97); keep in sync when updating
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    chosen_version = random.choice(_CHROME_VERSIONS)
    return _USER_AGENT_TPL % chosen_version
144
145
# Default HTTP headers sent with every request; the User-Agent is
# randomized once per run via random_user_agent()
std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


# Alternative fixed User-Agent strings for sites that require a specific browser
USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


# Sentinel used to distinguish "no default supplied" from a default of None
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Month names keyed by language code, used by the date-parsing helpers
MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

# Media file extensions recognized when guessing formats from URLs/filenames
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

# strptime patterns tried, in order, when parsing free-form date strings
DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

# Variants preferring day-first (European) ordering for ambiguous dates
DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

# Variants preferring month-first (US) ordering for ambiguous dates
DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

# Detects P.A.C.K.E.R.-style packed JavaScript
PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
# Extracts the JSON body of an application/ld+json <script> block
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'
06b3fe29 259
7105440c 260
def preferredencoding():
    """Return the system's preferred text encoding.

    Based on locale.getpreferredencoding(); falls back to 'UTF-8' whenever
    the reported codec is missing or unusable.
    """
    try:
        encoding = locale.getpreferredencoding()
        'TEST'.encode(encoding)  # sanity-check that the codec actually works
    except Exception:
        return 'UTF-8'
    return encoding
d77c3dfd 274
f4bfd65f 275
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    # Write to a temp file in the same directory so the final os.rename
    # is an atomic same-filesystem replace
    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        # NamedTemporaryFile creates files with mode 0o600; widen to the
        # process default (0o666 masked by umask) so the result looks like
        # a normally created file
        try:
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        except OSError:
            pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temp file before re-raising
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
334
335
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # Attribute name must be a simple identifier; this keeps the
        # interpolation into the XPath expression below safe
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    # Python 2.6's ElementTree lacks attribute predicates in find();
    # emulate xpath[@key=val] by scanning matches manually
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None
350
d7e66d39
JMF
351# On python2.6 the xml.etree.ElementTree.Element methods don't support
352# the namespace parameter
5f6a1245
JW
353
354
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an xpath into '{uri}tag' form,
    resolving prefixes through ns_map ({prefix: uri})."""
    def _expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(_expand(step) for step in path.split('/'))
365
d77c3dfd 366
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching xpath (a string or an iterable of
    candidate xpaths). Returns `default` if given and nothing matches,
    raises ExtractorError if fatal, else returns None."""
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        # Try each candidate xpath until one matches
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
388
389
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element but returns the matched element's .text;
    the same default/fatal semantics also apply to a missing text value."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text
a41fb80c
S
403
404
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return the value of attribute `key` on the first element matching
    xpath; default/fatal semantics as in xpath_element."""
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]
bf0ff932
PH
416
417
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper over the generic attribute lookup
    return get_element_by_attribute('id', id, html)
43e8fafd 421
12ea2f30 422
6f32a0b5
ZM
def get_element_html_by_id(id, html):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper over the generic attribute lookup
    return get_element_html_by_attribute('id', id, html)
426
427
def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    # First match wins; None when nothing matched
    return retval[0] if retval else None
432
433
6f32a0b5
ZM
def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    # First match wins; None when nothing matched
    return retval[0] if retval else None
438
439
2af12ad9
TC
def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag carrying attribute=value, or None."""
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None
443
444
6f32a0b5
ZM
def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    """Return the html of the first tag carrying attribute=value, or None."""
    retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None
448
449
2af12ad9
TC
def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    # class attributes are space-separated lists, hence the \b word-bounded
    # pattern; escape_value=False because we pass a regex, not a literal
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
455
456
6f32a0b5
ZM
def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    # Same word-bounded class matching as get_elements_by_class
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
462
463
def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]
467
468
def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]
472
473
def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    # Quotes around the value are optional only when the value itself could
    # not be confused with unquoted-attribute syntax
    value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    # Matches the opening tag up to and including attribute=value; the tag
    # body itself is then delimited via get_element_text_and_html_by_tag
    partial_element_re = r'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
        ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            # Strip a redundant surrounding quote pair before unescaping
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
a921f407 497
c5229f39 498
6f32a0b5
ZM
class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        # Control-flow exception: signals "matching close tag reached"
        pass

    def __init__(self):
        # Stack of currently open tags, outermost first
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        # Pop until the matching opening tag; unclosed inner tags are dropped
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()
539
540
def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its' content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        # str.index wrapped so a miss raises the supplied parse error
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        # Feed the opening tag, then advance closing-tag by closing-tag until
        # the parser signals that the matching close for our tag was reached
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
574
575
8bb56eee
BF
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        # Attributes of the (last) start tag seen; empty until feed()
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Each start tag overwrites the previous one; callers feed a
        # single element so only one tag is expected
        self.attrs = dict(attrs)
585
c5229f39 586
73673ccf
FF
class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        # One dict of attributes per top-level <li> encountered
        self.items = []
        # Nesting depth; only depth-0 <li> elements are collected
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1
602
603
8bb56eee
BF
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs
9e6dd238 628
c5229f39 629
73673ccf
FF
def parse_list(webpage):
    """Given a string for an series of HTML <li> elements,
    return a dictionary of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items
637
638
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Collapse whitespace, then turn <br> and paragraph breaks into newlines
    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
9e6dd238
FV
653
654
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            # '-' means stdout; on Windows it must be switched to binary mode
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = locked_file(filename, open_mode, block=False).open()
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission problems will not be fixed by renaming — re-raise
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = locked_file(filename, open_mode, block=False).open()
            return (stream, alt_filename)
d77c3dfd
FV
685
686
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Unparseable input yields None rather than raising
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 694
5f6a1245 695
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        # Transliterate accented characters in restricted mode
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return ' '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            # Control characters and '?' are always dropped
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            # Characters forbidden in Windows filenames
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            # Non-ASCII is not allowed in restricted mode
            return '_'
        return char

    if s == '':
        return ''
    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        # Collapse runs of underscores introduced by the substitutions
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        # A leading dot would make the file hidden on POSIX systems
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
d77c3dfd 739
5f6a1245 740
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    # force=True applies Windows-style sanitization on other platforms too
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
        if sys.version_info < (2, 7) and not drive_or_unc:
            drive_or_unc, _ = os.path.splitunc(s)
    elif force:
        drive_or_unc = ''
    else:
        # Nothing to do on non-Windows platforms unless forced
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace characters invalid in Windows path components (also trailing
    # spaces/dots), keeping '.'/'..' navigation parts intact
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s[0] == os.path.sep:
        # Preserve absolute paths when forcing on POSIX
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
764
765
def sanitize_url(url):
    """Normalize a URL: give scheme-relative URLs an http: scheme and
    repair a few scheme typos observed in the wild."""
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    for mistake, fixup in (
            # https://github.com/ytdl-org/youtube-dl/issues/15649
            (r'^httpss://', r'https://'),
            # https://bx1.be/lives/direct-tv/
            (r'^rmtp([es]?)://', r'rtmp\1://')):
        fixed, n_subs = re.subn(mistake, fixup, url)
        if n_subs:
            return fixed
    return url
17bcc626
S
782
783
5435dcf9
HH
def extract_basic_auth(url):
    """Split userinfo out of a URL.

    Returns (url_without_credentials, basic_auth_header_value or None).
    """
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    netloc = (parts.hostname if parts.port is None
              else '%s:%d' % (parts.hostname, parts.port))
    clean_url = compat_urlparse.urlunsplit(parts._replace(netloc=netloc))
    credentials = '%s:%s' % (parts.username, parts.password or '')
    token = base64.b64encode(credentials.encode('utf-8')).decode('utf-8')
    return clean_url, 'Basic ' + token
794
795
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request after sanitizing/escaping the URL and moving
    any embedded user:password credentials into an Authorization header."""
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        # Respect an explicitly passed headers positional argument,
        # otherwise create/extend the headers keyword argument
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)
67dda517
S
802
803
51098426
S
def expand_path(s):
    """Expand shell variables and ~"""
    # $VARS first, then ~user expansion via the compat shim
    return os.path.expandvars(compat_expanduser(s))
807
808
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Membership test on a list (not a set) so unhashable elements work too
    deduped = []
    for item in iterable:
        if item in deduped:
            continue
        deduped.append(item)
    return deduped
d77c3dfd 816
912b38b4 817
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric character reference: decimal (&#...;) or hex (&#x...;)
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
4e408e47
PH
847
848
def unescapeHTML(s):
    """Replace HTML entities (&amp;, &#38;, …) in s with their characters;
    passes None through unchanged."""
    if s is None:
        return None
    assert type(s) == compat_str

    # Entities are delimited by '&' and ';' with no '&'/';' inside
    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 856
8bf48f23 857
def escapeHTML(text):
    """Escape the five HTML-special characters in text.

    '&' is handled first so already-inserted entities are not re-escaped.
    """
    for raw, escaped in (
            ('&', '&amp;'),
            ('<', '&lt;'),
            ('>', '&gt;'),
            ('"', '&quot;'),
            ("'", '&#39;')):
        text = text.replace(raw, escaped)
    return text
867
868
def process_communicate_or_kill(p, *args, **kwargs):
    """communicate() with process p, but if anything interrupts the exchange
    (including KeyboardInterrupt) kill the process, reap it, and re-raise."""
    try:
        stdout_and_stderr = p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise
    return stdout_and_stderr
876
877
class Popen(subprocess.Popen):
    """subprocess.Popen wrapper that hides the console window on Windows
    and adds communicate_or_kill()."""
    if sys.platform == 'win32':
        # Prevent a console window from flashing up for each subprocess
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        # communicate(), killing the process on any interruption
        return process_communicate_or_kill(self, *args, **kwargs)
890
891
aa49acd1
S
def get_subprocess_encoding():
    """Return the encoding to use for data exchanged with subprocesses."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding
902
903
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """
    # Returns s unchanged on Python 3 / Unicode-capable platforms, otherwise
    # the filename encoded for the filesystem or subprocess APIs

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
926
927
def decodeFilename(b, for_subprocess=False):
    # Inverse of encodeFilename: a no-op on Python 3 or when b is already text

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 937
f07b74fc
PH
938
def encodeArgument(s):
    """Encode a command-line argument for passing to a subprocess."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
946
947
aa49acd1
S
def decodeArgument(b):
    """Decode a command-line argument received from a subprocess."""
    return decodeFilename(b, True)
950
951
8271226a
PH
def decodeOption(optval):
    """Decode a command-line option value to text using the preferred
    encoding; None passes through unchanged."""
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
1c256f70 960
5f6a1245 961
# Named result type so callers can use .hours/.minutes/.seconds/.milliseconds
_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    """Split a millisecond count into an (hours, minutes, seconds,
    milliseconds) named tuple."""
    seconds, milliseconds = divmod(msec, 1000)
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return _timetuple(hours, minutes, seconds, milliseconds)
970
971
cdb19aa4 972def formatSeconds(secs, delim=':', msec=False):
aa7785f8 973 time = timetuple_from_msec(secs * 1000)
974 if time.hours:
975 ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
976 elif time.minutes:
977 ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
4539dd30 978 else:
aa7785f8 979 ret = '%d' % time.seconds
980 return '%s.%03d' % (ret, time.milliseconds) if msec else ret
4539dd30 981
a0ddb8a2 982
def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        # Individual bad certificates must not abort loading of the rest
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)
996
77562778 997
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler with a TLS context configured from params.

    Honours 'nocheckcertificate' (disables hostname/cert verification) and
    'legacyserverconnect' (allows unsafe legacy TLS renegotiation).
    """
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    # NB: check_hostname must be set before verify_mode, otherwise CPython raises
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                # Create a new context to discard any certificates that were already loaded
                context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
                context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 1021
732ea2f0 1022
def bug_reports_message(before=';'):
    """Return the standard 'report this issue' blurb appended after *before*.

    The blurb is capitalized when *before* is empty or ends a sentence.
    """
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp , '
           'filling out the "Broken site" issue template properly. '
           'Confirm you are on the latest version using -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]
    return msg if not before else before + ' ' + msg
08f2a92c
JMF
1033
1034
bf5b9d85
PM
class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors.

    Subclasses may predefine a class-level `msg`; an explicit message passed
    to the constructor overrides it, and the class name is the last resort.
    """
    msg = None

    def __init__(self, msg=None):
        if msg is None:
            msg = self.msg if self.msg is not None else type(self).__name__
        self.msg = msg
        super().__init__(self.msg)
bf5b9d85
PM
1045
1046
# Exceptions that indicate a (possibly transient) network problem rather than
# an extractor bug; ssl.CertificateError may not exist on every build, hence
# the hasattr guard.
network_exceptions = tuple(
    [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
    + ([ssl.CertificateError] if hasattr(ssl, 'CertificateError') else []))
1051
1052
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        # An error raised while a network exception is in flight is treated
        # as expected (not a bug report candidate)
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)  # message without the [ie]/video_id decoration
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super(ExtractorError, self).__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        # NOTE(review): traceback.format_exception(self.cause) with a single
        # positional exception argument requires Python 3.10+ -- confirm the
        # supported interpreter range
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(self.cause)[1:]),
            delim='\n') or None
01951dda 1083
1c256f70 1084
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL."""

    def __init__(self, url):
        self.url = url
        super().__init__('Unsupported URL: %s' % url, expected=True)
1090
1091
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
1095
1096
773f291d
S
class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        # Geo restrictions are never a yt-dlp bug
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        # Countries (if known) from which the video would be available
        self.countries = countries
1108
1109
class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info
d77c3dfd
FV
1122
1123
class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    Raised by YoutubeDL when a requested entry cannot be found in the
    playlist info_dict.
    """
    msg = 'Entry not found in info'
498f5606 1131
1132
class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            # Append the conflicting filename to the message (previously the
            # literal placeholder '(unknown)' was appended, silently
            # discarding the `filename` argument)
            self.msg += f': {filename}'
        super().__init__(self.msg)
d77c3dfd
FV
1145
1146
class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    Raised by a PostProcessor's .run() method to indicate an error in the
    postprocessing task.
    """
5f6a1245 1153
5f6a1245 1154
class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'
8b0d7497 1158
8b0d7497 1159
class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'
8b0d7497 1163
48f79687 1164
class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'
51d9739f 1168
1169
class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'
1173
1174
class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        self.expected = expected
        super().__init__(msg)
1181
1182
class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        # Throttling warrants re-extraction and is never an "expected" error
        super().__init__(self.msg, expected=False)
f2ebc5c7 1189
d77c3dfd 1190
class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg = f'{self.msg}: {err}'
        super().__init__(self.msg)
d77c3dfd
FV
1203
1204
class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected))
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 1220
5f6a1245 1221
class XAttrMetadataError(YoutubeDLError):
    """Raised when writing extended file attributes fails.

    `reason` classifies the failure as NO_SPACE, VALUE_TOO_LONG or
    NOT_SUPPORTED based on the errno and the message text.
    """

    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Derive a coarse failure reason from errno / message text
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'
1236
1237
class XAttrUnavailableError(YoutubeDLError):
    """Raised when xattr support is missing on this platform/filesystem."""
1240
1241
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate *http_class* honouring the handler's 'source_address' param.

    When a source address is configured, connection creation is patched so
    that only remote addresses of the matching IP family are tried.
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            # Pick the address family from the *source* address: dotted quad -> IPv4
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    err = _
                    if sock is not None:
                        sock.close()
            # All candidate addresses failed: re-raise the last error
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
1305
1306
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker header.

    When the marker is present, any Accept-Encoding header is dropped as well
    and a new dict is returned; otherwise *headers* is returned unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered = {k: v for k, v in headers.items() if k.lower() != 'accept-encoding'}
    del filtered['Youtubedl-no-compression']
    return filtered
87f0e62d
YCH
1315
1316
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open *req*, routing through a SOCKS proxy if Ytdl-socks-proxy is set."""
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress raw-deflate data, falling back to zlib-wrapped deflate."""
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry, trimming up to 1023 trailing bytes
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
bf50b038 1440
5de90176 1441
71aff188
YCH
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of *base_class* whose connect() tunnels through the
    SOCKS proxy described by the *socks_proxy* URL (socks4/socks4a/socks5).

    NOTE(review): an unrecognized scheme leaves `socks_type` unbound, causing
    a NameError when `proxy_args` is built -- callers must validate schemes.
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        # Percent-decode credentials, passing ''/None through untouched
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS connections, wrap the proxied socket in TLS
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1483
1484
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that honours yt-dlp params and Ytdl-socks-proxy headers."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        conn_kwargs = {}
        conn_class = self._https_conn_class

        # Forward the TLS context / hostname checking that urllib configured
        if hasattr(self, '_context'):  # python > 2.6
            conn_kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            conn_kwargs['check_hostname'] = self._check_hostname

        proxy_url = req.headers.get('Ytdl-socks-proxy')
        if proxy_url:
            conn_class = make_socks_conn_class(conn_class, proxy_url)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **conn_kwargs)
be4a824d
PH
1508
1509
class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    # curl prefixes HttpOnly cookies with this marker in cookies.txt
    _HTTPONLY_PREFIX = '#HttpOnly_'
    # Number of tab-separated fields in a cookies.txt entry
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure:
                    secure = 'TRUE'
                else:
                    secure = 'FALSE'
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                else:
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                else:
                    expires = ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            # Strip curl's '#HttpOnly_' marker so the entry parses normally
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    # Malformed entries are skipped with a warning, not fatal
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True
1626
1627
a6420bf5
S
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor that applies the same handling to http and https."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 used to choke on a following request when Set-Cookie of the
        # previous response contained non-ASCII characters (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769). The
        # percent-encoding workaround that lived here has been disabled:
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1650
1651
class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    # Route 301/303/307/308 through the same logic CPython uses for 302
    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # On python 2 urlh.geturl() may sometimes return redirect URL
        # as byte string instead of unicode. This workaround allows
        # to force it always return unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True)
fca6dba8
S
1707
1708
46f59e89
S
def extract_timezone(date_str):
    """Split a trailing timezone designator off *date_str*.

    Returns (timedelta, remainder): the UTC offset as a timedelta (zero when
    no designator or just 'Z' is present) and the date string with the
    designator removed.
    """
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        return datetime.timedelta(), date_str

    date_str = date_str[:-len(m.group('tz'))]
    if not m.group('sign'):  # bare 'Z' -> UTC
        return datetime.timedelta(), date_str

    sign = 1 if m.group('sign') == '+' else -1
    offset = datetime.timedelta(
        hours=sign * int(m.group('hours')),
        minutes=sign * int(m.group('minutes')))
    return offset, date_str
1733
1734
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    if date_str is None:
        return None

    # Fractional seconds are not supported by strptime here; drop them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        dt = datetime.datetime.strptime(
            date_str, '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)) - timezone
    except ValueError:
        return None
    return calendar.timegm(dt.timetuple())
912b38b4
PH
1752
1753
46f59e89
S
def date_formats(day_first=True):
    """Return the strptime format list for day-first or month-first dates."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1756
1757
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    # NB: intentionally no break -- when several formats match, the *last*
    # matching format in date_formats() wins
    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Fall back to RFC 2822 parsing (e.g. 'Tue, 05 May 2015 10:00:00')
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
bf50b038 1784
5f6a1245 1785
46f59e89
S
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string,
    or None when nothing could be parsed.

    day_first selects DD/MM (True) vs MM/DD (False) for ambiguous dates.
    """
    if date_str is None:
        return None

    # Commas and pipes are never meaningful separators here
    date_str = re.sub(r'[,|]', '', date_str)

    # Remember the 12-hour PM shift before the marker is stripped below
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    # Last resort: RFC 2822 parser from the email package
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
46f59e89
S
1817
1818
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to default_ext."""
    if url is None or '.' not in url:
        return default_ext
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
73e79f2a 1830
5f6a1245 1831
824fa511
S
def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
    """Build a subtitle filename of the form '<name>.<lang>.<format>'."""
    new_ext = '%s.%s' % (sub_lang, sub_format)
    return replace_extension(filename, new_ext, expected_real_ext)
d4051a8e 1834
5f6a1245 1835
def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    format: string date format used to return datetime object from
    precision: round the time portion of a datetime object.
                auto|microsecond|second|minute|hour|day.
                auto: round to the unit provided in date_str (if applicable).
    """
    auto_precision = False
    if precision == 'auto':
        auto_precision = True
        precision = 'microsecond'
    today = datetime_round(datetime.datetime.utcnow(), precision)
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(
        r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
        date_str)
    if match is not None:
        # Relative form: resolve the base recursively, then apply the offset
        start_time = datetime_from_str(match.group('start'), precision, format)
        time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
        unit = match.group('unit')
        if unit == 'month' or unit == 'year':
            # Months/years have variable length; handled by datetime_add_months
            new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
            unit = 'day'
        else:
            if unit == 'week':
                unit = 'day'
                time *= 7
            delta = datetime.timedelta(**{unit + 's': time})
            new_date = start_time + delta
        if auto_precision:
            # In 'auto' mode, round to the unit that appeared in date_str
            return datetime_round(new_date, unit)
        return new_date

    return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1876
1877
def date_from_str(date_str, format='%Y%m%d', strict=False):
    """
    Return a datetime.date from a string in the format YYYYMMDD or
    (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?

    If "strict", only (now|today)[+-][0-9](day|week|month|year)(s)? is allowed

    format: string date format used to return datetime object from
    """
    if strict:
        allowed = r'\d{8}|(now|today)[+-]\d+(day|week|month|year)(s)?'
        if not re.fullmatch(allowed, date_str):
            raise ValueError(f'Invalid date format {date_str}')
    dt = datetime_from_str(date_str, precision='microsecond', format=format)
    return dt.date()
1890
1891
def datetime_add_months(dt, months):
    """Increment/decrement a datetime by a number of months, clamping the
    day to the last valid day of the resulting month."""
    total_months = dt.month - 1 + months
    year = dt.year + total_months // 12
    month = total_months % 12 + 1
    day = min(dt.day, calendar.monthrange(year, month)[1])
    return dt.replace(year, month, day)
1899
1900
def datetime_round(dt, precision='day'):
    """
    Round a datetime object's time to a specific precision
    ('microsecond' is a no-op; otherwise round-half-up in that unit).
    """
    if precision == 'microsecond':
        return dt

    seconds_per_unit = {
        'day': 86400,
        'hour': 3600,
        'minute': 60,
        'second': 1,
    }[precision]
    timestamp = calendar.timegm(dt.timetuple())
    rounded = ((timestamp + seconds_per_unit / 2) // seconds_per_unit) * seconds_per_unit
    return datetime.datetime.utcfromtimestamp(rounded)
5f6a1245
JW
1917
1918
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format;
    strings not in that format are returned unchanged."""
    m = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if m is None:
        return date_str
    return '-'.join(m.groups())
1927
5f6a1245 1928
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is None:
            self.start = datetime.datetime.min.date()
        else:
            self.start = date_from_str(start, strict=True)
        if end is None:
            self.end = datetime.datetime.max.date()
        else:
            self.end = date_from_str(end, strict=True)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
1958
1959
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Some Python 2 setups return the platform string as bytes
        name = name.decode(preferredencoding())
    assert isinstance(name, compat_str)
    return name
c257baff
PH
1968
1969
49fa4d9a
N
def get_windows_version():
    ''' Get Windows version. None if it's not running on Windows '''
    if compat_os_name != 'nt':
        return None
    return version_tuple(platform.win32_ver()[1])
1976
1977
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes.wintypes

    # Map C-level file descriptors to Win32 standard-handle IDs
    WIN_OUTPUT_IDS = {
        1: -11,  # STD_OUTPUT_HANDLE
        2: -12,  # STD_ERROR_HANDLE
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ('GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = compat_ctypes_WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ('GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # Real consoles are local character devices on which
        # GetConsoleMode succeeds; everything else is a pipe/file
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first char outside the Basic Multilingual Plane
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 BMP chars per call; a non-BMP char is written
        # alone as a surrogate pair (hence count == 2 below)
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
2050
2051
def write_string(s, out=None, encoding=None):
    """Write text s to stream out (stderr by default), handling Windows
    consoles, byte-mode streams and Python 2 quirks."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        # Prefer WriteConsoleW so non-ANSI chars survive on Windows consoles
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '')
            or sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        # Text stream: bypass its encoder and write pre-encoded bytes
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
2072
2073
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a bytes (or Python 2 str) sequence into a list of byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    # Python 2 str: elements are one-character strings
    return [ord(ch) for ch in bs]
2081
c257baff 2082
def intlist_to_bytes(xs):
    """Pack a list of byte values back into a bytes object."""
    if not xs:
        return b''
    # One unsigned byte per element
    return compat_struct_pack('%dB' % len(xs), *xs)
c38b1e77
PH
2087
2088
c1c9a79c
PH
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Win32 OVERLAPPED struct; carries the byte offset of the lock
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the maximum possible byte range, i.e. the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive, block):
        # Acquire a shared or exclusive lock; raises when non-blocking
        # and the lock is already held elsewhere
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)

        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK, 0x1 = LOCKFILE_FAIL_IMMEDIATELY
        if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
                          (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
                          0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        # Release the lock taken by _lock_file on the same file object
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    try:
        import fcntl

        def _lock_file(f, exclusive, block):
            try:
                fcntl.flock(f,
                            fcntl.LOCK_SH if not exclusive
                            else fcntl.LOCK_EX if block
                            else fcntl.LOCK_EX | fcntl.LOCK_NB)
            except BlockingIOError:
                # Non-blocking lock is busy: propagate to the caller
                raise
            except OSError:  # AOSP does not have flock()
                fcntl.lockf(f,
                            fcntl.LOCK_SH if not exclusive
                            else fcntl.LOCK_EX if block
                            else fcntl.LOCK_EX | fcntl.LOCK_NB)

        def _unlock_file(f):
            try:
                fcntl.flock(f, fcntl.LOCK_UN)
            except OSError:
                fcntl.lockf(f, fcntl.LOCK_UN)

    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive, block):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
2176
2177
class locked_file(object):
    """File wrapper holding an OS-level advisory lock while the file is open.

    Read modes take a shared lock, write/append modes an exclusive one.
    """
    _closed = False  # guards against double-unlocking in __exit__/close

    def __init__(self, filename, mode, block=True, encoding=None):
        assert mode in ['r', 'rb', 'a', 'ab', 'w', 'wb']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode
        self.block = block

    def __enter__(self):
        # Shared lock suffices for read-only modes
        exclusive = 'r' not in self.mode
        try:
            _lock_file(self.f, exclusive, self.block)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            if not self._closed:
                _unlock_file(self.f)
        finally:
            self.f.close()
            self._closed = True

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)

    def flush(self):
        self.f.flush()

    def open(self):
        return self.__enter__()

    def close(self, *args):
        # Bug fix: the previous code called
        # self.__exit__(self, *args, value=False, traceback=False),
        # passing `self` as the exception type and raising TypeError
        # (duplicate 'value') whenever *args was non-empty.
        self.__exit__(None, None, None)
2221
4eb7f1d1 2222
4644ac55
S
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), defaulting to 'utf-8' when unset."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
2226
2227
def shell_quote(args):
    """Quote a list of arguments for safe display as a shell command line."""
    encoding = get_filesystem_encoding()
    quoted_args = []
    for arg in args:
        if isinstance(arg, bytes):
            # We may get a filename encoded with 'encodeFilename'
            arg = arg.decode(encoding)
        quoted_args.append(compat_shlex_quote(arg))
    return ' '.join(quoted_args)
9d4660ca
PH
2237
2238
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # Merge with any data already smuggled into the URL
    url, existing = unsmuggle_url(url, {})
    data.update(existing)
    payload = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, payload)
9d4660ca
PH
2247
2248
def unsmuggle_url(smug_url, default=None):
    """Extract (url, data) previously embedded by smuggle_url; returns
    (smug_url, default) when nothing was smuggled."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
02dbf93f
PH
2256
2257
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
    """ Formats numbers with decimal sufixes like K, M, etc

    Returns None for None/negative input. With factor=1024 the binary
    suffixes (Ki, Mi, ...) are produced instead.
    """
    num, factor = float_or_none(num), float(factor)
    if num is None or num < 0:
        return None
    POSSIBLE_SUFFIXES = ['', *'kMGTPEZY']
    # Clamp the exponent so huge values (>= factor**9) reuse the largest
    # known suffix instead of raising IndexError
    exponent = 0 if num == 0 else min(
        int(math.log(num, factor)), len(POSSIBLE_SUFFIXES) - 1)
    suffix = POSSIBLE_SUFFIXES[exponent]
    if factor == 1024:
        suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
    converted = num / (factor ** exponent)
    return fmt % (converted, suffix)
e0fd9573 2269
2270
def format_bytes(bytes):
    """Human-readable byte count (e.g. '1.50MiB'); 'N/A' when unknown."""
    formatted = format_decimal_suffix(bytes, '%.2f%sB', factor=1024)
    return formatted if formatted is not None else 'N/A'
f53c966a 2273
1c088fa8 2274
fb47597b
S
def lookup_unit_table(unit_table, s):
    """Parse a leading '<number><unit>' from s, multiplying by the unit's
    factor from unit_table; ',' is accepted as a decimal separator."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    value = float(m.group('num').replace(',', '.'))
    return int(value * unit_table[m.group('unit')])
2284
2285
be64b5b0
PH
def parse_filesize(s):
    """Parse a file size string like '1.2MiB' or '500 kB' into bytes."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too. The table is generated so that, for each
    # SI prefix: XiB/xB are binary (1024**n) and XB/Xb/xb plus the spelled
    # out decimal name are decimal (1000**n); insertion order matches the
    # original literal table (regex alternation order in lookup_unit_table
    # depends on it).
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
    }
    _PREFIXES = [
        ('k', 'kilo', 'kibi'),
        ('m', 'mega', 'mebi'),
        ('g', 'giga', 'gibi'),
        ('t', 'tera', 'tebi'),
        ('p', 'peta', 'pebi'),
        ('e', 'exa', 'exbi'),
        ('z', 'zetta', 'zebi'),
        ('y', 'yotta', 'yobi'),
    ]
    for exp, (letter, dec_name, bin_name) in enumerate(_PREFIXES, start=1):
        dec, bin_ = 1000 ** exp, 1024 ** exp
        upper = letter.upper()
        _UNIT_TABLE[upper + 'iB'] = bin_
        _UNIT_TABLE[upper + 'B'] = dec
        _UNIT_TABLE[letter + 'B'] = bin_
        _UNIT_TABLE[upper + 'b'] = dec
        _UNIT_TABLE[letter + 'b'] = dec
        _UNIT_TABLE[dec_name + 'bytes'] = dec
        _UNIT_TABLE[bin_name + 'bytes'] = bin_

    return lookup_unit_table(_UNIT_TABLE, s)
2355
2356
def parse_count(s):
    """Parse a human view-count string like '1.2M' or '15,743 views' to int."""
    if s is None:
        return None

    # Drop a leading non-numeric word (e.g. 'views 123' -> '123')
    s = re.sub(r'^[^\d]+\s', '', s).strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
        'b': 1000 ** 3,
        'B': 1000 ** 3,
    }

    result = lookup_unit_table(_UNIT_TABLE, s)
    if result is not None:
        return result

    # Fall back to the first bare number followed by whitespace/end
    m = re.match(r'([\d,.]+)(?:$|\s)', s)
    if m:
        return str_to_int(m.group(1))
be64b5b0 2384
2f7ae819 2385
b871d7e9
S
def parse_resolution(s):
    """Extract a {'width': ..., 'height': ...} dict from strings such as
    '1920x1080', '720p' or '4k'; empty dict when nothing matches."""
    if s is None:
        return {}

    m = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
    if m:
        return {'width': int(m.group('w')), 'height': int(m.group('h'))}

    m = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
    if m:
        return {'height': int(m.group(1))}

    m = re.search(r'\b([48])[kK]\b', s)
    if m:
        # 4k -> 2160, 8k -> 4320
        return {'height': int(m.group(1)) * 540}

    return {}
2406
2407
0dc41787
S
2408def parse_bitrate(s):
2409 if not isinstance(s, compat_str):
2410 return
2411 mobj = re.search(r'\b(\d+)\s*kbps', s)
2412 if mobj:
2413 return int(mobj.group(1))
2414
2415
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    # Fall back to English names for unknown languages
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        # Month numbers are 1-based
        return month_names.index(name) + 1
    except ValueError:
        return None
2425
2426
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """

    try:
        # Compare against the first three letters of each English month name
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None
18258362
JMF
2435
2436
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML, leaving existing entities
    (named and numeric character references) untouched."""
    pattern = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(pattern, '&amp;', xml_str)
e3946f98
PH
2443
2444
def setproctitle(title):
    """Set the process name shown by tools like ps (Linux-only, via prctl);
    silently does nothing on other platforms."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
2469
2470
def remove_start(s, start):
    """Strip a leading `start` from s if present; tolerates s being None."""
    if s is not None and s.startswith(start):
        return s[len(start):]
    return s
29eb5174
PH
2473
2474
def remove_end(s, end):
    """Strip a trailing `end` from s if present; tolerates s being None."""
    if s is not None and s.endswith(end):
        return s[:-len(end)]
    return s
2b9faf55
PH
2477
2478
31b2051e
S
2479def remove_quotes(s):
2480 if s is None or len(s) < 2:
2481 return s
2482 for quote in ('"', "'", ):
2483 if s[0] == quote and s[-1] == quote:
2484 return s[1:-1]
2485 return s
2486
2487
b6e0c7d2
U
2488def get_domain(url):
2489 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2490 return domain.group('domain') if domain else None
2491
2492
def url_basename(url):
    """Return the last path component of a URL ('http://a/b/c' -> 'c')."""
    return compat_urlparse.urlparse(url).path.strip('/').split('/')[-1]
aa94a6d3
PH
2496
2497
02dc0a36
S
2498def base_url(url):
2499 return re.match(r'https?://[^?#&]+/', url).group()
2500
2501
def urljoin(base, path):
    """Join base and path into an absolute URL; returns None unless the
    result would be a usable http(s) URL. Accepts bytes for either arg."""
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not isinstance(path, compat_str) or not path:
        return None
    if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
        # path is already absolute (or scheme-relative)
        return path
    if isinstance(base, bytes):
        base = base.decode('utf-8')
    if not isinstance(base, compat_str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
2515
2516
aa94a6d3
PH
class HEADRequest(compat_urllib_request.Request):
    # Request variant that issues an HTTP HEAD instead of GET
    def get_method(self):
        return 'HEAD'
7217e148
PH
2520
2521
95cf60e8
S
class PUTRequest(compat_urllib_request.Request):
    # Request variant that issues an HTTP PUT
    def get_method(self):
        return 'PUT'
2525
2526
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce v (optionally via attribute get_attr) to a scaled integer
    (v * invscale // scale); return default on failure."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError, OverflowError):
        return default
9732d77e 2534
9572013d 2535
40a90862
JMF
2536def str_or_none(v, default=None):
2537 return default if v is None else compat_str(v)
2538
9732d77e
PH
2539
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if isinstance(int_str, compat_integer_types):
        return int_str
    if isinstance(int_str, compat_str):
        # Drop thousands separators and stray '+' signs
        int_str = re.sub(r'[,\.\+]', '', int_str)
    return int_or_none(int_str)
608d11f5
PH
2547
2548
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce v to a scaled float (v * invscale / scale); default on failure."""
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        return default
43f775e4
PH
2556
2557
c7e327c4
S
2558def bool_or_none(v, default=None):
2559 return v if isinstance(v, bool) else default
2560
2561
53cd37ba
S
2562def strip_or_none(v, default=None):
2563 return v.strip() if isinstance(v, compat_str) else default
b72b4431
S
2564
2565
af03000a
S
2566def url_or_none(url):
2567 if not url or not isinstance(url, compat_str):
2568 return None
2569 url = url.strip()
29f7c58a 2570 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
af03000a
S
2571
2572
3e9b66d7
LNO
2573def request_to_url(req):
2574 if isinstance(req, compat_urllib_request.Request):
2575 return req.get_full_url()
2576 else:
2577 return req
2578
2579
def strftime_or_none(timestamp, date_format, default=None):
    """Format a unix timestamp (number) or 'YYYYMMDD' string using
    date_format; return default when formatting is impossible."""
    try:
        if isinstance(timestamp, compat_numeric_types):  # unix timestamp
            dt = datetime.datetime.utcfromtimestamp(timestamp)
        elif isinstance(timestamp, compat_str):  # assume YYYYMMDD
            dt = datetime.datetime.strptime(timestamp, '%Y%m%d')
        else:
            dt = None
        return dt.strftime(date_format)
    except (ValueError, TypeError, AttributeError):
        return default
2590
2591
def parse_duration(s):
    """Parse a duration string ('1:23:45', '3 min 5 s', ISO 8601-like
    'PT1H2M3S', ...) into seconds (float), or None when unparseable."""
    if not isinstance(s, compat_basestring):
        return None
    s = s.strip()
    if not s:
        return None

    days, hours, mins, secs, ms = [None] * 5
    # Colon-separated form: [[[DD:]HH:]MM:]SS[.ms]
    m = re.match(r'''(?x)
            (?P<before_secs>
                (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
            (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
            (?P<ms>[.:][0-9]+)?Z?$
        ''', s)
    if m:
        days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
    else:
        # Verbose / ISO 8601-ish form ('P1DT2H3M4S', '2 hours 3 minutes', ...);
        # years/months/weeks before 'T' are matched but deliberately ignored
        m = re.match(
            r'''(?ix)(?:P?
                (?:
                    [0-9]+\s*y(?:ears?)?\s*
                )?
                (?:
                    [0-9]+\s*m(?:onths?)?\s*
                )?
                (?:
                    [0-9]+\s*w(?:eeks?)?\s*
                )?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                T)?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # Decimal hours/minutes form ('2.5 hours', '90 min')
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        # A ':' separator means the fraction came from the colon form
        duration += float(ms.replace(':', '.'))
    return duration
91d7d0b3
JMF
2654
2655
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the real extension ('a.mp4' + 'temp' -> 'a.temp.mp4');
    when expected_real_ext is given and does not match, append instead."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
d70ad093
PH
2662
2663
b3ed15b7
S
2664def replace_extension(filename, ext, expected_real_ext=None):
2665 name, real_ext = os.path.splitext(filename)
2666 return '{0}.{1}'.format(
2667 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2668 ext)
2669
2670
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    cmd = [exe] + args
    try:
        Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
    except OSError:
        return False
    return exe
b7ab0590
PH
2679
2680
def _get_exe_version_output(exe, args):
    """Run `exe` with `args` and return its combined stdout/stderr text,
    or False when the executable cannot be started."""
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if yt-dlp is run in the background.
        # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
        out, _ = Popen(
            [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return out
cae97f65
PH
2694
2695
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Search program output for a version string using version_re; return
    `unrecognized` when no version can be found."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
2705
2706
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    out = _get_exe_version_output(exe, args)
    if not out:
        return False
    return detect_exe_version(out, version_re, unrecognized)
2713
2714
class LazyList(collections.abc.Sequence):
    ''' Lazy immutable list from an iterable
    Note that slices of a LazyList are lists and not LazyList'''

    class IndexError(IndexError):
        pass

    def __init__(self, iterable, *, reverse=False, _cache=None):
        # _cache is shared with views made by __reversed__/__copy__ so
        # items already consumed from the iterator are never lost
        self.__iterable = iter(iterable)
        self.__cache = [] if _cache is None else _cache
        self.__reversed = reverse

    def __iter__(self):
        if self.__reversed:
            # We need to consume the entire iterable to iterate in reverse
            yield from self.exhaust()
            return
        # Replay cached items first, then keep pulling (and caching) new ones
        yield from self.__cache
        for item in self.__iterable:
            self.__cache.append(item)
            yield item

    def __exhaust(self):
        # Drain the iterator into the cache (in original order)
        self.__cache.extend(self.__iterable)
        # Discard the emptied iterable to make it pickle-able
        self.__iterable = []
        return self.__cache

    def exhaust(self):
        ''' Evaluate the entire iterable '''
        return self.__exhaust()[::-1 if self.__reversed else 1]

    @staticmethod
    def __reverse_index(x):
        # Map a forward index to the equivalent negative (from-the-end) index
        return None if x is None else -(x + 1)

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            if self.__reversed:
                idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
            start, stop, step = idx.start, idx.stop, idx.step or 1
        elif isinstance(idx, int):
            if self.__reversed:
                idx = self.__reverse_index(idx)
            start, stop, step = idx, idx, 0
        else:
            raise TypeError('indices must be integers or slices')
        if ((start or 0) < 0 or (stop or 0) < 0
                or (start is None and step < 0)
                or (stop is None and step > 0)):
            # We need to consume the entire iterable to be able to slice from the end
            # Obviously, never use this with infinite iterables
            self.__exhaust()
            try:
                return self.__cache[idx]
            except IndexError as e:
                raise self.IndexError(e) from e
        # Only consume as many items as needed to satisfy the request
        n = max(start or 0, stop or 0) - len(self.__cache) + 1
        if n > 0:
            self.__cache.extend(itertools.islice(self.__iterable, n))
        try:
            return self.__cache[idx]
        except IndexError as e:
            raise self.IndexError(e) from e

    def __bool__(self):
        try:
            # Probe the first logical element without exhausting the iterable
            self[-1] if self.__reversed else self[0]
        except self.IndexError:
            return False
        return True

    def __len__(self):
        self.__exhaust()
        return len(self.__cache)

    def __reversed__(self):
        return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)

    def __copy__(self):
        return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)

    def __repr__(self):
        # repr and str should mimic a list. So we exhaust the iterable
        return repr(self.exhaust())

    def __str__(self):
        return repr(self.exhaust())
2803
483336e7 2804
class PagedList:
    """Base class for a list-like view over paginated results.

    Subclasses implement _getslice(); pages are fetched through
    self._pagefunc and optionally cached per page number.
    """

    class IndexError(IndexError):
        pass

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        # Upper bound on available pages; may be lowered by subclasses
        self._pagecount = float('inf')
        self._use_cache = use_cache
        self._cache = {}

    def getpage(self, pagenum):
        """Return the (possibly cached) list of results for one page."""
        page_results = self._cache.get(pagenum)
        if page_results is None:
            # Pages beyond the known page count are treated as empty
            page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results
        return page_results

    def getslice(self, start=0, end=None):
        """Return results [start:end] as a plain list."""
        return list(self._getslice(start, end))

    def _getslice(self, start, end):
        raise NotImplementedError('This method must be implemented by subclasses')

    def __getitem__(self, idx):
        # Repeated indexing re-fetches slices; the cache keeps that cheap
        assert self._use_cache, 'Indexing PagedList requires cache'
        if not isinstance(idx, int) or idx < 0:
            raise TypeError('indices must be non-negative integers')
        entries = self.getslice(idx, idx + 1)
        if not entries:
            raise self.IndexError()
        return entries[0]
55575225 2843
9c44d242
PH
2844
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages lazily, stopping as soon as the
    requested slice is satisfied or a short (final) page is seen."""

    def _getslice(self, start, end):
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            # Offsets of the requested range within the current page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            try:
                page_results = self.getpage(pagenum)
            except Exception:
                # Remember where fetching failed so later (cached) lookups
                # do not retry pages past this point
                self._pagecount = pagenum - 1
                raise
            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            yield from page_results

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
81c2f20b
PH
2882
2883
9c44d242
PH
class InAdvancePagedList(PagedList):
    """PagedList for sources where the total page count is known up front.
    Always uses the page cache."""

    def __init__(self, pagefunc, pagecount, pagesize):
        PagedList.__init__(self, pagefunc, pagesize, True)
        self._pagecount = pagecount

    def _getslice(self, start, end):
        start_page = start // self._pagesize
        end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
        # Elements of the first page that precede `start`
        skip_elems = start - start_page * self._pagesize
        # How many elements are still wanted (None = all remaining)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page_results = self.getpage(pagenum)
            if skip_elems:
                page_results = page_results[skip_elems:]
            skip_elems = None
            if only_more is not None:
                if len(page_results) < only_more:
                    only_more -= len(page_results)
                else:
                    yield from page_results[:only_more]
                    break
            yield from page_results
9c44d242
PH
2906
2907
def uppercase_escape(s):
    """Decode uppercase \\UXXXXXXXX escape sequences embedded in `s`."""
    decoder = codecs.getdecoder('unicode_escape')

    def _decode(mobj):
        return decoder(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _decode, s)
0fe2ff78
YCH
2914
2915
def lowercase_escape(s):
    """Decode lowercase \\uXXXX escape sequences embedded in `s`."""
    decoder = codecs.getdecoder('unicode_escape')

    def _decode(mobj):
        return decoder(mobj.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _decode, s)
b53466e1 2922
d05cfe06
S
2923
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, quote() needs a byte string, so encode text first
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # The safe-characters list keeps all RFC 3986 reserved characters intact
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2929
2930
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        # Host names cannot be percent-escaped; non-ASCII hosts use IDNA
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
2941
62e609ab 2942
def parse_qs(url):
    """Return the query string of `url` parsed into a dict of value lists."""
    parsed = compat_urllib_parse_urlparse(url)
    return compat_parse_qs(parsed.query)
2945
2946
62e609ab
PH
def read_batch_urls(batch_fd):
    """Read URLs from an open batch file, skipping blanks and comments."""
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # Strip any BOM an editor may have left at the start of a line
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.lstrip()
        if not url or url.startswith(('#', ';', ']')):
            return False
        # "#" cannot be stripped out since it is part of the URI
        # However, it can be safely stripped out if following a whitespace
        return re.split(r'\s#', url, 1)[0].rstrip()

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2964
2965
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes for a request body."""
    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
2968
2969
def update_url_query(url, query):
    """Merge the key/value pairs from `query` into the query string of
    `url` (existing keys are overwritten) and return the new URL."""
    if not query:
        return url
    parts = compat_urlparse.urlparse(url)
    merged = compat_parse_qs(parts.query)
    merged.update(query)
    new_query = compat_urllib_parse_urlencode(merged, True)
    return compat_urlparse.urlunparse(parts._replace(query=new_query))
16392824 2978
8e60dc75 2979
ed0291d1
S
def update_Request(req, url=None, data=None, headers={}, query={}):
    """Clone a urllib request with updated url/data/headers/query,
    preserving the HTTP method and (if present) the timeout."""
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    # HEAD/PUT need the dedicated request subclasses to keep their method
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2998
2999
def _multipart_encode_impl(data, boundary):
    """Encode `data` as multipart/form-data using `boundary`.

    Returns (encoded_bytes, content_type_header); raises ValueError if
    the boundary occurs inside the payload.
    """
    content_type = 'multipart/form-data; boundary=%s' % boundary

    out = b''
    for k, v in data.items():
        out += b'--' + boundary.encode('ascii') + b'\r\n'
        if isinstance(k, compat_str):
            k = k.encode('utf-8')
        if isinstance(v, compat_str):
            v = v.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary.encode('ascii') in content:
            raise ValueError('Boundary overlaps with data')
        out += content

    out += b'--' + boundary.encode('ascii') + b'--\r\n'

    return out, content_type
3020
3021
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    has_specified_boundary = boundary is not None

    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
        try:
            return _multipart_encode_impl(data, boundary)
        except ValueError:
            # Boundary collided with the payload; retry with a fresh one
            # unless the caller pinned the boundary explicitly
            if has_specified_boundary:
                raise
            boundary = None
3050
3051
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up `key_or_keys` in `d`.

    Given a list/tuple of keys, return the first value that is not None
    (and, unless skip_false_values is False, not falsy); otherwise return
    `default`. A single key behaves like d.get(key, default).
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        value = d.get(key)
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
3060
3061
def try_get(src, getter, expected_type=None):
    """Apply each getter to `src`; return the first result that does not
    raise a common lookup error and matches `expected_type` (if given)."""
    for get in variadic(getter):
        try:
            v = get(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(v, expected_type):
            return v
329ca3be
S
3071
3072
6cc62232
S
def merge_dicts(*dicts):
    """Merge dicts left to right; earlier dicts win, except that an empty
    string value may be replaced by a later non-empty string."""
    merged = {}
    for a_dict in dicts:
        for k, v in a_dict.items():
            if v is None:
                continue
            replaces_empty_str = (
                isinstance(v, compat_str) and v
                and isinstance(merged.get(k), compat_str) and not merged[k])
            if k not in merged or replaces_empty_str:
                merged[k] = v
    return merged
3085
3086
8e60dc75
S
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Decode `string` to text (compat_str) unless it already is text.

    NOTE: the default `encoding` is evaluated once, at import time.
    """
    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3089
16392824 3090
a1a530b0
PH
# US (MPAA) movie ratings mapped to a minimum viewer age
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


# US TV Parental Guidelines ratings mapped to a minimum viewer age
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
3108
3109
def parse_age_limit(s):
    """Parse an age limit from an int or a rating string.

    Accepts a plain age (int, "18", "18+"), a US movie rating
    (see US_RATINGS) or a US TV rating (see TV_PARENTAL_GUIDELINES).
    Returns an int age, or None if unrecognized/out of range.
    """
    # NOTE: type() (not isinstance) — this does not accept bool
    if type(s) == int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    s = s.upper()
    if s in US_RATINGS:
        return US_RATINGS[s]
    # Accept "TV14", "TV-14" and "TV_14" style spellings
    m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
    if m:
        return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
    return None
146c80e2
S
3125
3126
def strip_jsonp(code):
    """Strip a JSONP callback wrapper (`callback({...});`) from `code`,
    returning the bare payload; unwrapped input is returned unchanged."""
    JSONP_RE = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(JSONP_RE, r'\g<callback_data>', code)
478c2c61
PH
3135
3136
def js_to_json(code, vars={}):
    """Convert a JavaScript object/value literal into JSON text.

    Handles single quotes, unquoted keys, comments, trailing commas,
    hex/octal integers, `undefined`/`void 0`, and `new Date(...)`.
    """
    # vars is a dict of var, val pairs to substitute
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    # (regex, numeric base) pairs for hex and octal integer keys/values
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        # Rewrite a single matched token into its JSON equivalent
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v in ('undefined', 'void 0'):
            return 'null'
        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
            # Comments, trailing commas and "!" coercions are dropped
            return ""

        if v[0] in ("'", '"'):
            # Normalize string escapes and convert to double quotes
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])
        else:
            for regex, base in INTEGER_TABLE:
                im = re.match(regex, v)
                if im:
                    i = int(im.group(1), base)
                    # A trailing ":" means it was an object key
                    return '"%d":' % i if v.endswith(':') else '%d' % i

        if v in vars:
            return vars[v]

        return '"%s"' % v

    # new Date("...") -> just the date string
    code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)|
        !+
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
e05f6939
PH
3185
3186
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
3195
acd69589 3196
# Valid "when" stages for running postprocessors
POSTPROCESS_WHEN = {'pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
1e43a6f7 3198
3199
# Default output filename templates, keyed by template type
DEFAULT_OUTTMPL = {
    'default': '%(title)s [%(id)s].%(ext)s',
    'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
}
# Known output template types; a non-None value is the default
# extension/suffix used for files of that type
OUTTMPL_TYPES = {
    'chapter': None,
    'subtitle': None,
    'thumbnail': None,
    'description': 'description',
    'annotation': 'annotations.xml',
    'infojson': 'info.json',
    'link': None,
    'pl_video': None,
    'pl_thumbnail': None,
    'pl_description': 'description',
    'pl_infojson': 'info.json',
}
0a871f68 3217
143db31d 3218# As of [1] format syntax is:
3219# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3220# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# As of [1] format syntax is:
# %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
# 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
# Template regex for %-style format specifiers: {0} is the allowed key
# pattern, {1} the allowed conversion-type pattern
STR_FORMAT_RE_TMPL = r'''(?x)
    (?<!%)(?P<prefix>(?:%%)*)
    %
    (?P<has_key>\((?P<key>{0})\))?
    (?P<format>
        (?P<conversion>[#0\-+ ]+)?
        (?P<min_width>\d+)?
        (?P<precision>\.\d+)?
        (?P<len_mod>[hlL])? # unused in python
        {1} # conversion type
    )
'''


# Conversion types supported by printf-style formatting
STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
a020a0dc 3236
7d1eb38a 3237
a020a0dc
PH
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
3246
3247
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
48844745
PH
3250
3251
def is_outdated_version(version, limit, assume_new=True):
    """Return whether `version` is older than `limit`; unparsable or
    missing versions yield `not assume_new`."""
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
732ea2f0
PH
3259
3260
def ytdl_is_updateable(self):
    """ Returns if yt-dlp can be updated with -U """

    # Imported lazily to avoid a circular import at module load time
    from .update import is_non_updateable

    return not is_non_updateable()
7d4111ed
PH
3267
3268
def args_to_str(args):
    """Get a short, shell-quoted string representation for a subprocess command."""
    return ' '.join(map(compat_shlex_quote, args))
2ccd1b10
PH
3272
3273
def error_to_compat_str(err):
    """Return the message of `err` as a text string.

    On Python 2, str(err) may be a byte string and must be decoded with
    the preferred encoding rather than implicit ascii.
    """
    msg = str(err)
    return msg if sys.version_info[0] >= 3 else msg.decode(preferredencoding())
3281
3282
def mimetype2ext(mt):
    """Map a MIME type (optionally with ";params") to a file extension.

    Lookup order: full type, then subtype, then "+suffix"; otherwise the
    subtype itself (with "+" replaced by ".") is returned.
    """
    if mt is None:
        return None

    # Drop any parameters, e.g. "text/html; charset=utf-8"
    mt, _, params = mt.partition(';')
    mt = mt.strip()

    FULL_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
        'audio/x-wav': 'wav',
        'audio/wav': 'wav',
        'audio/wave': 'wav',
    }

    ext = FULL_MAP.get(mt)
    if ext is not None:
        return ext

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-sami': 'sami',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
        'x-wav': 'wav',
        'filmstrip+json': 'fs',
        'svg+xml': 'svg',
    }

    _, _, subtype = mt.rpartition('/')
    ext = SUBTYPE_MAP.get(subtype.lower())
    if ext is not None:
        return ext

    SUFFIX_MAP = {
        'json': 'json',
        'xml': 'xml',
        'zip': 'zip',
        'gzip': 'gz',
    }

    _, _, suffix = subtype.partition('+')
    ext = SUFFIX_MAP.get(suffix)
    if ext is not None:
        return ext

    return subtype.replace('+', '.')
c460bdd5
PH
3345
3346
2814f12b
THD
def ext2mimetype(ext_or_url):
    """Guess the MIME type for a file extension or URL; None if unknown or empty."""
    if not ext_or_url:
        return None
    # A bare extension needs a dummy filename for mimetypes to parse it
    name = ext_or_url if '.' in ext_or_url else f'file.{ext_or_url}'
    return mimetypes.guess_type(name)[0]
3353
3354
def parse_codecs(codecs_str):
    """Parse an RFC 6381 codecs string (e.g. "avc1.64001f, mp4a.40.2")
    into a dict with vcodec/acodec/dynamic_range (and tcodec if present)."""
    # http://tools.ietf.org/html/rfc6381
    if not codecs_str:
        return {}
    split_codecs = list(filter(None, map(
        str.strip, codecs_str.strip().strip(',').split(','))))
    vcodec, acodec, tcodec, hdr = None, None, None, None
    for full_codec in split_codecs:
        parts = full_codec.split('.')
        # Remove '0' digits so zero-padded variants match the known names
        codec = parts[0].replace('0', '')
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
                     'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
            if not vcodec:
                # For vp9/av1/hvc1 only keep the first 4 dotted fields
                vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
            # Detect HDR flavour from the codec profile
            if codec in ('dvh1', 'dvhe'):
                hdr = 'DV'
            elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
                hdr = 'HDR10'
            elif full_codec.replace('0', '').startswith('vp9.2'):
                hdr = 'HDR10'
        elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        elif codec in ('stpp', 'wvtt',):
            if not tcodec:
                tcodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if vcodec or acodec or tcodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
            'dynamic_range': hdr,
            **({'tcodec': tcodec} if tcodec is not None else {}),
        }
    elif len(split_codecs) == 2:
        # Two unrecognized codecs: assume "video, audio" ordering
        return {
            'vcodec': split_codecs[0],
            'acodec': split_codecs[1],
        }
    return {}
3396
3397
def urlhandle_detect_ext(url_handle):
    """Guess the file extension for a response: prefer the filename from
    Content-Disposition, then fall back to the Content-Type MIME type."""
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
05900629
PH
3410
3411
1e399778
YCH
def encode_data_uri(data, mime_type):
    """Encode `data` bytes as a base64 "data:" URI with the given MIME type."""
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
3414
3415
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # No viewer limit set, or content is available to everyone
        return False
    return age_limit < content_limit
61ca9a80
PH
3424
3425
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    BOMS = (
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    )
    decoded = None
    for bom, encoding in BOMS:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(encoding, 'replace')
            break
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', decoded)
a055469f
PH
3444
3445
def determine_protocol(info_dict):
    """Determine the download protocol for a format dict: use the explicit
    'protocol' field if present, else infer from the URL scheme/extension."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = sanitize_url(info_dict['url'])
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    # Fall back to the raw URL scheme (http, https, ...)
    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
3466
3467
def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
    """ Render a list of rows, each as a list of values.
    Text after a \t will be right aligned """
    def width(string):
        # Visible width: ignore terminal escape sequences and tab markers
        return len(remove_terminal_sequences(string).replace('\t', ''))

    def get_max_lens(table):
        return [max(width(str(v)) for v in col) for col in zip(*table)]

    def filter_using_list(row, filterArray):
        # Keep only columns whose filter entry is truthy (missing -> keep)
        return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]

    # With hide_empty, drop columns whose data cells are all empty
    max_lens = get_max_lens(data) if hide_empty else []
    header_row = filter_using_list(header_row, max_lens)
    data = [filter_using_list(row, max_lens) for row in data]

    table = [header_row] + data
    max_lens = get_max_lens(table)
    extra_gap += 1
    if delim:
        # Insert a delimiter row between header and data
        table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
        table[1][-1] = table[1][-1][:-extra_gap * len(delim)]  # Remove extra_gap from end of delimiter
    for row in table:
        for pos, text in enumerate(map(str, row)):
            if '\t' in text:
                # Right-align: pad before the text instead of after
                row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
            else:
                row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
    ret = '\n'.join(''.join(row).rstrip() for row in table)
    return ret
347de493
PH
3498
3499
def _match_one(filter_part, dct, incomplete):
    """Evaluate a single filter condition against dict `dct`.

    Supports comparison filters ("duration > 60", "title *= foo",
    "uploader != 'x'") and unary presence filters ("is_live", "!is_live").
    When `incomplete` is true, conditions on missing fields pass.
    Raises ValueError for unparsable filters or a string operator applied
    to a numeric field.
    """
    # TODO: Generalize code with YoutubeDL._build_format_filter
    STRING_OPERATORS = {
        '*=': operator.contains,
        '^=': lambda attr, value: attr.startswith(value),
        '$=': lambda attr, value: attr.endswith(value),
        '~=': lambda attr, value: re.search(value, attr),
    }
    COMPARISON_OPERATORS = {
        **STRING_OPERATORS,
        '<=': operator.le,  # "<=" must be defined above "<"
        '<': operator.lt,
        '>=': operator.ge,
        '>': operator.gt,
        '=': operator.eq,
    }

    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
            (?P<strval>.+?)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        m = m.groupdict()
        unnegated_op = COMPARISON_OPERATORS[m['op']]
        if m['negation']:
            op = lambda attr, value: not unnegated_op(attr, value)
        else:
            op = unnegated_op
        # Fix: the previous "... or m['intval']" referenced a regex group
        # that does not exist and would raise KeyError if ever evaluated.
        # One of quotedstrval/strval always matches (both need >= 1 char),
        # so the dead fallback is simply dropped.
        comparison_value = m['quotedstrval'] or m['strval']
        if m['quote']:
            # Unescape quote characters inside the quoted value
            comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
        actual_value = dct.get(m['key'])
        numeric_comparison = None
        if isinstance(actual_value, compat_numeric_types):
            # If the original field is a string and matching comparisonvalue is
            # a number we should respect the origin of the original field
            # and process comparison value as a string (see
            # https://github.com/ytdl-org/youtube-dl/issues/11082)
            try:
                numeric_comparison = int(comparison_value)
            except ValueError:
                numeric_comparison = parse_filesize(comparison_value)
                if numeric_comparison is None:
                    numeric_comparison = parse_filesize(f'{comparison_value}B')
                if numeric_comparison is None:
                    numeric_comparison = parse_duration(comparison_value)
        if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
            raise ValueError('Operator %s only supports string values!' % m['op'])
        if actual_value is None:
            return incomplete or m['none_inclusive']
        return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)

    UNARY_OPERATORS = {
        '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
        '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if incomplete and actual_value is None:
            return True
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
3575
3576
def match_str(filter_str, dct, incomplete=False):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
    When incomplete, all conditions passes on missing fields
    """
    # Split on unescaped "&" and require every condition to hold
    parts = re.split(r'(?<!\\)&', filter_str)
    return all(
        _match_one(part.replace(r'\&', '&'), dct, incomplete)
        for part in parts)
347de493
PH
3584
3585
def match_filter_func(filter_str):
    """Build a match-filter callable from `filter_str`: it returns None
    when the video passes, or a skip message otherwise."""
    def _match_func(info_dict, *args, **kwargs):
        if match_str(filter_str, info_dict, *args, **kwargs):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
91410c9b
PH
3594
3595
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float), or None."""
    if not time_expr:
        return None

    # Plain offset, e.g. "12.5s" or "12.5"
    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    # Clock format HH:MM:SS with ".fff" or ":fff" fractions
    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
3607
3608
def srt_subtitles_timecode(seconds):
    # SRT timestamps use a comma as decimal separator: HH:MM:SS,mmm
    return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3611
3612
def ass_subtitles_timecode(seconds):
    # ASS timestamps are H:MM:SS.cc (centiseconds)
    time = timetuple_from_msec(seconds * 1000)
    return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
bf6427d2
YCH
3616
3617
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    # Older TTAF namespaces are rewritten (as raw bytes, before XML parsing)
    # to the modern TTML namespaces so a single set of XPaths works.
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    # TTML styling properties that can be expressed in SRT's HTML-like markup.
    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    # _x('ttml:p') expands a prefixed name to its Clark-notation form.
    _x = functools.partial(xpath_with_ns, ns_map={
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    # Shared state filled in below and read by TTMLPElementParser via closure.
    styles = {}
    default_style = {}

    # Event-driven target for ElementTree.XMLParser: converts one <p> subtree
    # into SRT-flavoured markup (<b>/<i>/<u>/<font>).
    class TTMLPElementParser(object):
        # NOTE(review): these are class-level attributes; the two lists are
        # mutated in place, so they are shared across instances. Each start()
        # has a matching end() for well-formed input, which keeps them empty
        # between parse_node() calls — presumably intentional, but fragile.
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                # Effective style = document default, overridden by the
                # referenced style, overridden by inline tts:* attributes.
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already applied by an ancestor.
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    # Remember the cumulative style so children can diff
                    # against it, and which tags to close on end().
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                # Close tags in reverse order of opening.
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    # Render a single <p> element (and its children) to SRT markup.
    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    # Normalize legacy namespaces before parsing (byte-level replace).
    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    # Fall back to un-namespaced <p> for documents without a declared ns.
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve <style> inheritance; loop until every parent reference has
    # been resolved (repeat is set when a parent was not yet seen).
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id') or style.get(_x('xml:id'))
            if not style_id:
                continue
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # Styles attached to <body>/<div> become the document default style.
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # No explicit end: derive it from the duration, else skip.
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
3780
3781
66e289ba
S
def cli_option(params, command_option, param):
    """Return ``[command_option, value]`` when *param* is set in *params*.

    Truthy values are coerced to a string; a missing (None) value yields [].
    """
    value = params.get(param)
    if value:
        value = compat_str(value)
    return [] if value is None else [command_option, value]
3787
3788
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Translate a boolean option into external-command CLI arguments.

    Returns [] when the option is unset; otherwise either
    ['--opt', 'true'/'false'] or, with *separator*, ['--opt=true'/'--opt=false'].
    """
    param = params.get(param)
    if param is None:
        return []
    assert isinstance(param, bool)
    rendered = true_value if param else false_value
    if separator:
        return ['%s%s%s' % (command_option, separator, rendered)]
    return [command_option, rendered]
3797
3798
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit a bare flag ([command_option]) iff params[param] == expected_value."""
    matches = params.get(param) == expected_value
    return [command_option] if matches else []
3802
3803
e92caff5 3804def cli_configuration_args(argdict, keys, default=[], use_compat=True):
eab9b2bc 3805 if isinstance(argdict, (list, tuple)): # for backward compatibility
e92caff5 3806 if use_compat:
5b1ecbb3 3807 return argdict
3808 else:
3809 argdict = None
eab9b2bc 3810 if argdict is None:
5b1ecbb3 3811 return default
eab9b2bc 3812 assert isinstance(argdict, dict)
3813
e92caff5 3814 assert isinstance(keys, (list, tuple))
3815 for key_list in keys:
e92caff5 3816 arg_list = list(filter(
3817 lambda x: x is not None,
6606817a 3818 [argdict.get(key.lower()) for key in variadic(key_list)]))
e92caff5 3819 if arg_list:
3820 return [arg for args in arg_list for arg in args]
3821 return default
66e289ba 3822
6251555f 3823
330690a2 3824def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3825 main_key, exe = main_key.lower(), exe.lower()
3826 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3827 keys = [f'{root_key}{k}' for k in (keys or [''])]
3828 if root_key in keys:
3829 if main_key != exe:
3830 keys.append((main_key, exe))
3831 keys.append('default')
3832 else:
3833 use_compat = False
3834 return cli_configuration_args(argdict, keys, default, use_compat)
3835
66e289ba 3836
39672624
YCH
class ISO639Utils(object):
    """Conversion between ISO 639-1 (2-letter) and ISO 639-2/T (3-letter) language codes."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'iw': 'heb',  # Replaced by he in 1989 revision
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'in': 'ind',  # Replaced by id in 1989 revision
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'ji': 'yid',  # Replaced by yi in 1989 revision
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are significant, so regional
        # variants such as 'en-US' resolve via their base language.
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Linear reverse lookup; implicitly returns None for unknown codes.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
4040
4041
4eb10f66
YCH
class ISO3166Utils(object):
    """Lookup of full country names from ISO 3166-1 alpha-2 codes."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive lookup; returns None for unknown codes.
        return cls._country_map.get(code.upper())
4300
4301
773f291d
S
class GeoUtils(object):
    """Helpers for geo-restriction bypass: pick a random IPv4 address inside a country's address block."""
    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '46.172.224.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '182.50.184.0/21',
        'AQ': '23.154.160.0/24',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '77.116.0.0/14',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AX': '185.217.4.0/22',
        'AZ': '5.197.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '102.178.0.0/15',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '185.212.72.0/23',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '191.128.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '197.242.176.0/21',
        'CG': '160.113.0.0/16',
        'CH': '85.0.0.0/13',
        'CI': '102.136.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '102.244.0.0/14',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '31.153.0.0/16',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FK': '80.73.208.0/21',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '154.160.0.0/12',
        'GI': '95.164.0.0/16',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '133.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.167.192.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '24.92.144.0/20',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '102.183.0.0/16',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '102.70.0.0/15',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '102.35.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '93.86.0.0/15',
        'RU': '5.136.0.0/13',
        'RW': '41.186.0.0/16',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '102.120.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '8.128.0.0/10',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '102.143.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '154.115.192.0/18',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '37.52.0.0/14',
        'UG': '102.80.0.0/13',
        'US': '6.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '84.54.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '207.191.240.0/21',
        'VE': '186.88.0.0/13',
        'VG': '66.81.192.0/20',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '102.144.0.0/13',
        'ZW': '102.177.192.0/18',
    }

    @classmethod
    def random_ipv4(cls, code_or_block):
        """Return a random IPv4 address (dotted string) inside the given
        CIDR block, or inside the block mapped to a two-letter country code.
        Returns None for an unknown country code."""
        # A two-character argument is treated as a country code; anything
        # else is assumed to already be a CIDR block string.
        if len(code_or_block) == 2:
            block = cls._country_ip_map.get(code_or_block.upper())
            if not block:
                return None
        else:
            block = code_or_block
        addr, preflen = block.split('/')
        # Network address as a 32-bit integer; OR-ing with the inverted
        # prefix mask yields the highest address in the block.
        addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
        addr_max = addr_min | (0xffffffff >> int(preflen))
        return compat_str(socket.inet_ntoa(
            compat_struct_pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
4560
4561
91410c9b 4562class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2461f79d
PH
4563 def __init__(self, proxies=None):
4564 # Set default handlers
4565 for type in ('http', 'https'):
4566 setattr(self, '%s_open' % type,
4567 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4568 meth(r, proxy, type))
38e87f6c 4569 compat_urllib_request.ProxyHandler.__init__(self, proxies)
2461f79d 4570
91410c9b 4571 def proxy_open(self, req, proxy, type):
2461f79d 4572 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
4573 if req_proxy is not None:
4574 proxy = req_proxy
2461f79d
PH
4575 del req.headers['Ytdl-request-proxy']
4576
4577 if proxy == '__noproxy__':
4578 return None # No Proxy
51fb4995 4579 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188 4580 req.add_header('Ytdl-socks-proxy', proxy)
7a5c1cfe 4581 # yt-dlp's http/https handlers do wrapping the socket with socks
71aff188 4582 return None
91410c9b
PH
4583 return compat_urllib_request.ProxyHandler.proxy_open(
4584 self, req, proxy, type)
5bc880b9
YCH
4585
4586
0a5445dd
YCH
4587# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4588# released into Public Domain
4589# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4590
# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
# released into Public Domain
# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387

def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a long integer to a byte string.

    If optional blocksize is given and greater than zero, pad the front of the
    byte string with binary zeros so that the length is a multiple of
    blocksize.
    """
    # after much testing, this algorithm was deemed to be the fastest
    s = b''
    n = int(n)
    # Emit the integer 32 bits at a time, big-endian, most significant first.
    while n > 0:
        s = compat_struct_pack('>I', n & 0xffffffff) + s
        n = n >> 32
    # strip off leading zeros
    for i in range(len(s)):
        if s[i] != b'\000'[0]:
            break
    else:
        # only happens when n == 0
        s = b'\000'
        i = 0
    s = s[i:]
    # add back some pad bytes. this could be done more efficiently w.r.t. the
    # de-padding being done above, but sigh...
    if blocksize > 0 and len(s) % blocksize:
        s = (blocksize - len(s) % blocksize) * b'\000' + s
    return s
4619
4620
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a byte string to a long integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    acc = 0
    length = len(s)
    # Left-pad with zero bytes to a multiple of 4 so the input can be
    # consumed in whole 32-bit big-endian words.
    if length % 4:
        extra = (4 - length % 4)
        s = b'\000' * extra + s
        length = length + extra
    for i in range(0, length, 4):
        acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
    return acc
4636
4637
5bc880b9
YCH
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The data bytes are interpreted as a little-endian integer
    # (reversed before hex conversion), then exponentiated modulo N.
    little_endian_hex = binascii.hexlify(data[::-1])
    payload = int(little_endian_hex, 16)
    return '%x' % pow(payload, exponent, modulus)
81bdc8fd
YCH
4653
4654
f48409c7
YCH
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data

    @raises ValueError when the data does not fit in length - 11 bytes
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # RFC 8017 (PKCS#1 v1.5, EME-PKCS1-v1_5) requires the padding string PS
    # to consist of NONZERO pseudo-random octets: a zero would be taken as
    # the padding terminator by the decryptor and truncate the message.
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
4668
4669
5eb6bdce 4670def encode_base_n(num, n, table=None):
59f898b7 4671 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
59f898b7
YCH
4672 if not table:
4673 table = FULL_TABLE[:n]
4674
5eb6bdce
YCH
4675 if n > len(table):
4676 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4677
4678 if num == 0:
4679 return table[0]
4680
81bdc8fd
YCH
4681 ret = ''
4682 while num:
4683 ret = table[num % n] + ret
4684 num = num // n
4685 return ret
f52354a8
YCH
4686
4687
def decode_packed_codes(code):
    """Deobfuscate JavaScript packed with the 'p.a.c.k.e.r.' scheme.

    Extracts (code, base, count, symbols) via PACKED_CODES_RE, builds the
    base-N token -> symbol table and substitutes every word token back.
    """
    mobj = re.search(PACKED_CODES_RE, code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base, count = int(base), int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    for idx in reversed(range(count)):
        token = encode_base_n(idx, base)
        # An empty symbol means the token stands for itself.
        symbol_table[token] = symbols[idx] or token

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
e154c651 4704
4705
1ced2221
S
def caesar(s, alphabet, shift):
    """Apply a Caesar cipher over *alphabet*: characters found in the
    alphabet are rotated by *shift* (cyclically); others pass through."""
    if shift == 0:
        return s
    size = len(alphabet)

    def rotate(ch):
        if ch not in alphabet:
            return ch
        return alphabet[(alphabet.index(ch) + shift) % size]

    return ''.join(map(rotate, s))
4713
4714
def rot47(s):
    # ROT47: rotate the 94 printable ASCII characters (0x21-0x7E) by 47
    # positions; the transform is its own inverse.
    return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4717
4718
e154c651 4719def parse_m3u8_attributes(attrib):
4720 info = {}
4721 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4722 if val.startswith('"'):
4723 val = val[1:-1]
4724 info[key] = val
4725 return info
1143535d
YCH
4726
4727
def urshift(val, n):
    """Unsigned 32-bit right shift (JavaScript ``>>>`` semantics)."""
    if val >= 0:
        return val >> n
    # Map the negative value to its 32-bit two's-complement form first.
    return (val + 0x100000000) >> n
d3f8e038
YCH
4730
4731
4732# Based on png2str() written by @gdkchan and improved by @yokrysty
067aa17e 4733# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
d3f8e038
YCH
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a PNG image into (width, height, pixels) where pixels is a
    list of rows of raw byte values (3 bytes per pixel, RGB assumed).
    Raises IOError for invalid or truncated input."""
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    # Validate the PNG signature and that the first chunk is IHDR.
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # Big-endian unsigned int of 1, 2 or 4 bytes.
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Walk the chunk stream: 4-byte length, 4-byte type, data, 4-byte CRC.
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # IHDR is guaranteed (checked above) to be the first chunk.
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Concatenate all IDAT chunks into one zlib stream.
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # NOTE(review): assumes 8-bit RGB without alpha (3 bytes per pixel);
    # other color types/bit depths are not handled.
    stride = width * 3
    pixels = []

    # Fetch a previously reconstructed byte by absolute index.
    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        # Each scanline is prefixed by one filter-type byte.
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # 'left' is the corresponding byte of the previous pixel
            # (3 bytes back); 'up' the same byte one scanline above.
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            # Reverse the per-scanline filter (PNG spec section 9).
            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                # Paeth predictor: pick the neighbour closest to p.
                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
efa97bdc
YCH
4837
4838
def write_xattr(path, key, value):
    """Set extended attribute *key* (bytes *value*) on the file at *path*.

    Backends are tried in order: the pyxattr/xattr Python modules, NTFS
    Alternate Data Streams on Windows, then the setfattr/xattr CLI tools.
    Raises XAttrUnavailableError when no usable backend exists and
    XAttrMetadataError when the chosen backend fails.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/ytdl-org/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'yt-dlp requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # The CLI tools take the value as text, not bytes.
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)]
                       + [encodeArgument(o) for o in opts]
                       + [encodeFilename(path, True)])

                try:
                    p = Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate_or_kill()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
0c265486
YCH
4921
4922
def random_birthday(year_field, month_field, day_field):
    """Pick a uniformly random date between 1950-01-01 and 1995-12-31 and
    return it as stringified year/month/day under the given dict keys."""
    earliest = datetime.date(1950, 1, 1)
    latest = datetime.date(1995, 12, 31)
    span_days = (latest - earliest).days
    chosen = earliest + datetime.timedelta(random.randint(0, span_days))
    return {
        year_field: str(chosen.year),
        month_field: str(chosen.month),
        day_field: str(chosen.day),
    }
732044af 4933
c76eb41b 4934
# Templates for internet shortcut files, which are plain text files.

# Windows-style .url shortcut (INI format with an [InternetShortcut] section).
DOT_URL_LINK_TEMPLATE = '''
[InternetShortcut]
URL=%(url)s
'''.lstrip()

# macOS .webloc shortcut (an Apple XML property list).
DOT_WEBLOC_LINK_TEMPLATE = '''
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
\t<key>URL</key>
\t<string>%(url)s</string>
</dict>
</plist>
'''.lstrip()

# freedesktop.org .desktop entry of Type=Link, used on Linux desktops.
DOT_DESKTOP_LINK_TEMPLATE = '''
[Desktop Entry]
Encoding=UTF-8
Name=%(filename)s
Type=Link
URL=%(url)s
Icon=text-html
'''.lstrip()

# Shortcut-format name -> template mapping.
LINK_TEMPLATES = {
    'url': DOT_URL_LINK_TEMPLATE,
    'desktop': DOT_DESKTOP_LINK_TEMPLATE,
    'webloc': DOT_WEBLOC_LINK_TEMPLATE,
}
4966
732044af 4967
def iri_to_uri(iri):
    """
    Convert an IRI (Internationalized Resource Identifier, which may contain
    Unicode characters) to an ASCII-only URI.

    No additional escaping layer is added: already percent-encoded sequences
    (e.g. `%3C`) are left untouched, and only characters outside each
    component's safe set are percent-encoded with an underlying UTF-8
    encoding.
    """
    parts = compat_urllib_parse_urlparse(iri)

    if '[' in parts.netloc:
        raise ValueError('IPv6 URIs are not, yet, supported.')
        # Querying `.netloc`, when there's only one bracket, also raises a ValueError.

    # The `safe` values below list the characters that must NOT be percent-
    # encoded; letters, digits and '_.-' are always kept as-is.
    # Sets taken from https://url.spec.whatwg.org/#percent-encoded-bytes.
    netloc = ''
    if parts.username:
        netloc += compat_urllib_parse_quote(parts.username, safe=r"!$%&'()*+,~")
        if parts.password is not None:
            netloc += ':' + compat_urllib_parse_quote(parts.password, safe=r"!$%&'()*+,~")
        netloc += '@'

    # The 'idna' encoding produces ASCII text (Punycode) for Unicode hostnames.
    netloc += parts.hostname.encode('idna').decode('utf-8')
    if parts.port is not None and parts.port != 80:
        netloc += ':' + str(parts.port)

    return compat_urllib_parse_urlunparse((
        parts.scheme,
        netloc,
        compat_urllib_parse_quote_plus(parts.path, safe=r"!$%&'()*+,/:;=@|~"),
        # Legacy params component; its `safe` set mirrors the path's.
        compat_urllib_parse_quote_plus(parts.params, safe=r"!$%&'()*+,/:;=@|~"),
        compat_urllib_parse_quote_plus(parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
        compat_urllib_parse_quote_plus(parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5010
5011
def to_high_limit_path(path):
    """On Windows, return the absolute path with the extended-length prefix
    to bypass the MAX_PATH limit; elsewhere return *path* unchanged.

    Individual path segments may still be length-limited on Windows.
    """
    if sys.platform not in ['win32', 'cygwin']:
        return path
    return r'\\?\ '.rstrip() + os.path.abspath(path)
76d321f6 5018
c76eb41b 5019
def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
    """Look up *field* in *obj* via traverse_obj and render it with
    *template*, first passing it through *func* when given; values listed
    in *ignore* yield *default* instead."""
    val = traverse_obj(obj, *variadic(field))
    if val in ignore:
        return default
    if func:
        val = func(val)
    return template % val
00dd0cd5 5025
5026
def clean_podcast_url(url):
    """Strip known podcast analytics/tracking redirect prefixes from *url*."""
    tracker_prefix = r'''(?x)
        (?:
            (?:
                chtbl\.com/track|
                media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
                play\.podtrac\.com
            )/[^/]+|
            (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
            flex\.acast\.com|
            pd(?:
                cn\.co| # https://podcorn.com/analytics-prefix/
                st\.fm # https://podsights.com/docs/
            )/e
        )/'''
    return re.sub(tracker_prefix, '', url)
ffcb8191
THD
5042
5043
_HEX_TABLE = '0123456789abcdef'


def random_uuidv4():
    """Return a random UUIDv4-shaped string (NOT cryptographically secure;
    every placeholder, including the variant digit, is a random hex char)."""
    template = 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'
    return re.sub(r'[xy]', lambda _: _HEX_TABLE[random.randint(0, 15)], template)
0202b52a 5049
5050
def make_dir(path, to_screen=None):
    """Ensure the parent directory of *path* exists.

    @param path       File path whose parent directory should be created
    @param to_screen  Optional callable used to report a failure message
    @returns          True on success (or when nothing needed creating),
                      False when the directory could not be created
    """
    try:
        dn = os.path.dirname(path)
        if dn and not os.path.exists(dn):
            os.makedirs(dn)
        return True
    except (OSError, IOError) as err:
        # Bug fix: `callable(to_screen) is not None` was always True (callable()
        # returns a bool, never None), so a None to_screen was invoked here and
        # raised TypeError instead of returning False.
        if callable(to_screen):
            to_screen('unable to create directory ' + error_to_compat_str(err))
        return False
f74980cb 5061
5062
def get_executable_path():
    """Return the absolute base directory of the running program, covering
    PyInstaller bundles, zipped executables and plain source checkouts."""
    from zipimport import zipimporter
    if hasattr(sys, 'frozen'):  # Running from PyInstaller
        base = os.path.dirname(sys.executable)
    elif isinstance(globals().get('__loader__'), zipimporter):  # Running from ZIP
        base = os.path.join(os.path.dirname(__file__), '../..')
    else:
        base = os.path.join(os.path.dirname(__file__), '..')
    return os.path.abspath(base)
5072
5073
def load_plugins(name, suffix, namespace):
    """Load attributes ending in *suffix* from ytdlp_plugins/<name>/__init__.py
    into *namespace*; return only the newly added ones as a dict.

    A missing plugin package is silently ignored.
    """
    classes = {}
    try:
        spec = importlib.util.spec_from_file_location(
            name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
        plugins = importlib.util.module_from_spec(spec)
        sys.modules[spec.name] = plugins
        spec.loader.exec_module(plugins)
        # Note: loop variable renamed from `name` to avoid shadowing the parameter.
        for attr in dir(plugins):
            if attr in namespace or not attr.endswith(suffix):
                continue
            classes[attr] = namespace[attr] = getattr(plugins, attr)
    except FileNotFoundError:
        pass
    return classes
06167fbb 5092
5093
def traverse_obj(
        obj, *path_list, default=None, expected_type=None, get_all=True,
        casesense=True, is_user_input=False, traverse_string=False):
    ''' Traverse nested list/dict/tuple
    @param path_list        A list of paths which are checked one by one.
                            Each path is a list of keys where each key is a string,
                            a function, a tuple of strings/None or "...".
                            When a fuction is given, it takes the key as argument and
                            returns whether the key matches or not. When a tuple is given,
                            all the keys given in the tuple are traversed, and
                            "..." traverses all the keys in the object
                            "None" returns the object without traversal
    @param default          Default value to return
    @param expected_type    Only accept final value of this type (Can also be any callable)
    @param get_all          Return all the values obtained from a path or only the first one
    @param casesense        Whether to consider dictionary keys as case sensitive
    @param is_user_input    Whether the keys are generated from user input. If True,
                            strings are converted to int/slice if necessary
    @param traverse_string  Whether to traverse inside strings. If True, any
                            non-compatible object will also be converted into a string
    # TODO: Write tests
    '''
    if not casesense:
        _lower = lambda k: (k.lower() if isinstance(k, str) else k)
        path_list = (map(_lower, variadic(path)) for path in path_list)

    def _traverse_obj(obj, path, _current_depth=0):
        # `depth` (nonlocal) counts how many branching expansions ('...',
        # tuples, predicates, full slices) occurred, so the caller knows how
        # many nesting levels of result lists must be flattened afterwards.
        nonlocal depth
        path = tuple(variadic(path))
        for i, key in enumerate(path):
            # A None key (or a None object) short-circuits: return as-is.
            if None in (key, obj):
                return obj
            if isinstance(key, (list, tuple)):
                # Alternative keys: traverse each one, then treat the list of
                # results like a '...' expansion.
                obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
                key = ...
            if key is ...:
                # Expand over all values/items of the current object.
                obj = (obj.values() if isinstance(obj, dict)
                       else obj if isinstance(obj, (list, tuple, LazyList))
                       else str(obj) if traverse_string else [])
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
            elif callable(key):
                # Predicate key: keep only items whose key/index matches.
                if isinstance(obj, (list, tuple, LazyList)):
                    obj = enumerate(obj)
                elif isinstance(obj, dict):
                    obj = obj.items()
                else:
                    if not traverse_string:
                        return None
                    obj = str(obj)
                _current_depth += 1
                depth = max(depth, _current_depth)
                return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
            elif isinstance(obj, dict) and not (is_user_input and key == ':'):
                # Plain dict lookup, optionally case-insensitive.
                obj = (obj.get(key) if casesense or (key in obj)
                       else next((v for k, v in obj.items() if _lower(k) == key), None))
            else:
                if is_user_input:
                    # Convert user-supplied strings like '1' or '1:5' into an
                    # int index or a slice.
                    key = (int_or_none(key) if ':' not in key
                           else slice(*map(int_or_none, key.split(':'))))
                    if key == slice(None):
                        # A full slice behaves exactly like '...'.
                        return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
                if not isinstance(key, (int, slice)):
                    return None
                if not isinstance(obj, (list, tuple, LazyList)):
                    if not traverse_string:
                        return None
                    obj = str(obj)
                try:
                    obj = obj[key]
                except IndexError:
                    return None
        return obj

    # Normalize expected_type into a value -> value-or-None filter.
    if isinstance(expected_type, type):
        type_test = lambda val: val if isinstance(val, expected_type) else None
    elif expected_type is not None:
        type_test = expected_type
    else:
        type_test = lambda val: val

    for path in path_list:
        depth = 0
        val = _traverse_obj(obj, path)
        if val is not None:
            if depth:
                # Branched traversals produce nested lists: flatten down to a
                # single list and drop Nones from failed sub-traversals.
                for _ in range(depth - 1):
                    val = itertools.chain.from_iterable(v for v in val if v is not None)
                val = [v for v in map(type_test, val) if v is not None]
                if val:
                    return val if get_all else val[0]
            else:
                val = type_test(val)
                if val is not None:
                    return val
    return default
324ad820 5191
5192
def traverse_dict(dictn, keys, casesense=True):
    """Deprecated wrapper around traverse_obj, kept for backward compatibility."""
    message = ('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
               'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
    write_string(message)
    return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
6606817a 5197
5198
def variadic(x, allowed_types=(str, bytes, dict)):
    """Return *x* unchanged if it is a non-atomic iterable, else wrap it in a
    tuple; str/bytes/dict (the *allowed_types*) count as atomic values."""
    if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types):
        return x
    return (x,)
bd50a52b
THD
5201
5202
3e9b66d7
LNO
def decode_base(value, digits):
    """Convert the string *value*, written in the positional numeral system
    whose alphabet is *digits*, to an int (base = len(digits))."""
    # Note: loop variable renamed to avoid shadowing the builtin `chr`.
    index_of = {digit: index for index, digit in enumerate(digits)}
    base = len(digits)
    result = 0
    for char in value:
        result = result * base + index_of[char]
    return result
5212
5213
def time_seconds(**kwargs):
    """Return the current Unix timestamp; *kwargs* are timedelta arguments
    describing a fixed UTC offset (e.g. hours=9), though the resulting epoch
    value is offset-independent."""
    tz = datetime.timezone(datetime.timedelta(**kwargs))
    return datetime.datetime.now(tz).timestamp()
5217
5218
49fa4d9a
N
5219# create a JSON Web Signature (jws) with HS256 algorithm
5220# the resulting format is in JWS Compact Serialization
5221# implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5222# implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
def jwt_encode_hs256(payload_data, key, headers={}):
    """Create a JWS Compact Serialization token signed with HMAC-SHA256.

    Implemented following RFC 7519 (JWT) and RFC 7515 (JWS).  Note that this
    keeps the original behavior of using standard (padded, non-URL-safe)
    base64 for all three segments.
    """
    header = {'alg': 'HS256', 'typ': 'JWT'}
    header.update(headers)
    segments = [
        base64.b64encode(json.dumps(header).encode('utf-8')),
        base64.b64encode(json.dumps(payload_data).encode('utf-8')),
    ]
    mac = hmac.new(key.encode('utf-8'), b'.'.join(segments), hashlib.sha256)
    segments.append(base64.b64encode(mac.digest()))
    return b'.'.join(segments)
819e0531 5236
5237
# can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
def jwt_decode_hs256(jwt):
    """Decode the payload of a JWT *without* verifying its signature.

    @param jwt  Compact-serialized token ('header.payload.signature')
    @returns    The decoded payload as a dict
    """
    header_b64, payload_b64, signature_b64 = jwt.split('.')
    # JWT segments use unpadded base64url (RFC 7515, appendix C), but
    # urlsafe_b64decode requires padding -- append enough '='; any excess
    # padding is ignored by the decoder.
    payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}==='))
    return payload_data
5243
5244
def supports_terminal_sequences(stream):
    """Best-effort check whether *stream* is a tty that can render ANSI
    terminal escape sequences."""
    if compat_os_name == 'nt':
        from .compat import WINDOWS_VT_MODE  # Must be imported locally
        # ANSI sequences on Windows need VT mode plus Windows 10 build 10586+.
        if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
            return False
    elif not os.getenv('TERM'):
        return False
    try:
        return stream.isatty()
    except BaseException:
        # Streams without a working isatty() are treated as non-terminals.
        return False
5256
5257
# Matches ANSI SGR escape sequences such as '\033[31m' / '\033[0m'.
_terminal_sequences_re = re.compile('\033\\[[^m]+m')


def remove_terminal_sequences(string):
    """Strip ANSI color/style escape sequences from *string*."""
    return _terminal_sequences_re.sub('', string)
5263
5264
def number_of_digits(number):
    """Return the character count of *number* rendered via '%d' (a leading
    '-' on negative numbers counts; floats are truncated toward zero)."""
    rendered = '%d' % number
    return len(rendered)
34921b43 5267
5268
def join_nonempty(*values, delim='-', from_dict=None):
    """Stringify and join the truthy *values* with *delim*; when *from_dict*
    is given, the values are first looked up by those keys."""
    if from_dict is not None:
        values = map(from_dict.get, values)
    return delim.join(str(value) for value in values if value)
06e57990 5273
5274
27231526
ZM
def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
    """
    Find the largest format dimensions in terms of video width and, for each thumbnail:
    * Modify the URL: Match the width with the provided regex and replace with the former width
    * Update dimensions

    This function is useful with video services that scale the provided thumbnails on demand
    """
    _keys = ('width', 'height')
    max_dimensions = max(
        (tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats),
        default=(0, 0))
    if not max_dimensions[0]:
        return thumbnails
    max_width = str(max_dimensions[0])
    scaled = []
    for thumbnail in thumbnails:
        rewritten = {'url': re.sub(url_width_re, max_width, thumbnail['url'])}
        scaled.append(merge_dicts(rewritten, dict(zip(_keys, max_dimensions)), thumbnail))
    return scaled
5295
5296
93c8410d
LNO
def parse_http_range(range):
    """Parse a "Range" or "Content-Range" HTTP header value into a
    (start, end, document_size) tuple; missing pieces become None."""
    match = range and re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
    if not match:
        return None, None, None
    start, end, size = match.groups()
    return int(start), int_or_none(end), int_or_none(size)
5305
5306
class Config:
    """A tree of option-parser arguments assembled from the command line
    and/or config files.

    Each config file referenced via --config-locations becomes a child
    Config, so duplicate files are loaded only once and the effective
    argument order is defined by all_args.
    """
    own_args = None       # the raw args this level was initialized with
    filename = None       # config file these args came from (None for CLI args)
    __initialized = False

    def __init__(self, parser, label=None):
        self._parser, self.label = parser, label
        # _loaded_paths is shared with child configs to break include cycles.
        self._loaded_paths, self.configs = set(), []

    def init(self, args=None, filename=None):
        """Parse *args* and recursively load any referenced config files.
        Returns False (loading nothing) if *filename* was already loaded."""
        assert not self.__initialized
        directory = ''
        if filename:
            location = os.path.realpath(filename)
            directory = os.path.dirname(location)
            if location in self._loaded_paths:
                return False
            self._loaded_paths.add(location)

        self.__initialized = True
        self.own_args, self.filename = args, filename
        for location in self._parser.parse_args(args)[0].config_locations or []:
            # Relative locations are resolved against this config's directory.
            location = os.path.join(directory, expand_path(location))
            if os.path.isdir(location):
                location = os.path.join(location, 'yt-dlp.conf')
            if not os.path.exists(location):
                self._parser.error(f'config location {location} does not exist')
            self.append_config(self.read_file(location), location)
        return True

    def __str__(self):
        # Render this config and its children as an indented tree, with
        # credentials scrubbed via hide_login_info.
        label = join_nonempty(
            self.label, 'config', f'"{self.filename}"' if self.filename else '',
            delim=' ')
        return join_nonempty(
            self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
            *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
            delim='\n')

    @staticmethod
    def read_file(filename, default=[]):
        """Read a config file and split it into an argv-style list using
        shell-like quoting with '#' comments; a missing file yields *default*."""
        try:
            optionf = open(filename)
        except IOError:
            return default  # silently skip if file is not present
        try:
            # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
            contents = optionf.read()
            if sys.version_info < (3,):
                contents = contents.decode(preferredencoding())
            res = compat_shlex_split(contents, comments=True)
        finally:
            optionf.close()
        return res

    @staticmethod
    def hide_login_info(opts):
        """Return a copy of *opts* with credential option values replaced by
        'PRIVATE' (handles both '--opt value' and '--opt=value' forms)."""
        PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
        eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')

        def _scrub_eq(o):
            m = eqre.match(o)
            if m:
                return m.group('key') + '=PRIVATE'
            else:
                return o

        opts = list(map(_scrub_eq, opts))
        for idx, opt in enumerate(opts):
            if opt in PRIVATE_OPTS and idx + 1 < len(opts):
                opts[idx + 1] = 'PRIVATE'
        return opts

    def append_config(self, *args, label=None):
        # The child shares the loaded-paths set so duplicates are skipped
        # across the whole tree.
        config = type(self)(self._parser, label)
        config._loaded_paths = self._loaded_paths
        if config.init(*args):
            self.configs.append(config)

    @property
    def all_args(self):
        """Yield all arguments in effective order: child configs' args first
        (in reverse append order), then this level's own args."""
        for config in reversed(self.configs):
            yield from config.all_args
        yield from self.own_args or []

    def parse_args(self):
        """Parse the combined argument list with the underlying parser."""
        return self._parser.parse_args(list(self.all_args))
da42679b
LNO
5394
5395
class WebSocketsWrapper():
    """Wraps websockets module to use in non-async scopes"""

    def __init__(self, url, headers=None):
        # A dedicated event loop lets this synchronous wrapper drive the
        # async websockets API.
        self.loop = asyncio.events.new_event_loop()
        self.conn = compat_websockets.connect(
            url, extra_headers=headers, ping_interval=None,
            close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
        # Ensure the connection and loop are cleaned up on interpreter exit.
        atexit.register(self.__exit__, None, None, None)

    def __enter__(self):
        self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
        return self

    def send(self, *args):
        # Blocking wrapper around the async send().
        self.run_with_loop(self.pool.send(*args), self.loop)

    def recv(self, *args):
        # Blocking wrapper around the async recv().
        return self.run_with_loop(self.pool.recv(*args), self.loop)

    def __exit__(self, type, value, traceback):
        try:
            return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
        finally:
            self.loop.close()
            self._cancel_all_tasks(self.loop)

    # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
    # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class
    @staticmethod
    def run_with_loop(main, loop):
        # Run coroutine *main* to completion on *loop*, mirroring
        # asyncio.run() but reusing an existing loop.
        if not asyncio.coroutines.iscoroutine(main):
            raise ValueError(f'a coroutine was expected, got {main!r}')

        try:
            return loop.run_until_complete(main)
        finally:
            loop.run_until_complete(loop.shutdown_asyncgens())
            if hasattr(loop, 'shutdown_default_executor'):
                loop.run_until_complete(loop.shutdown_default_executor())

    @staticmethod
    def _cancel_all_tasks(loop):
        # Cancel any still-pending tasks on *loop* and report their errors
        # through the loop's exception handler, as asyncio.run() does.
        to_cancel = asyncio.tasks.all_tasks(loop)

        if not to_cancel:
            return

        for task in to_cancel:
            task.cancel()

        loop.run_until_complete(
            asyncio.tasks.gather(*to_cancel, loop=loop, return_exceptions=True))

        for task in to_cancel:
            if task.cancelled():
                continue
            if task.exception() is not None:
                loop.call_exception_handler({
                    'message': 'unhandled exception during asyncio.run() shutdown',
                    'exception': task.exception(),
                    'task': task,
                })
5459
5460
# Whether the optional websockets support is available -- presumably
# compat_websockets is None/falsy when the module could not be imported;
# verify against the compat module.
has_websockets = bool(compat_websockets)
8b7539d2 5462
5463
def merge_headers(*dicts):
    """Merge dicts of network headers case insensitively, prioritizing the latter ones"""
    # str.capitalize() lowercases everything after the first character, so
    # e.g. 'User-Agent' and 'user-agent' both normalize to 'User-agent' and
    # later dicts overwrite earlier ones for the same header.
    merged = {}
    for headers in dicts:
        for key, value in headers.items():
            merged[key.capitalize()] = value
    return merged