#!/usr/bin/env python3
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.header
import email.utils
import errno
import functools
import gzip
import hashlib
import hmac
import importlib.util
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import traceback
import urllib.parse
import xml.etree.ElementTree
import zlib

from .compat import (
    asyncio,
    compat_chr,
    compat_cookiejar,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_http_client,
    compat_HTTPError,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_quote,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse_unquote_plus,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_request,
    compat_urlparse,
)
from .dependencies import brotli, certifi, websockets
from .socks import ProxyType, sockssocket


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            with contextlib.suppress(OSError):
                os.unlink(fn)
        with contextlib.suppress(OSError):
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        os.rename(tf.name, fn)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(tf.name)
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
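
# Illustrative usage (a sketch, not part of the module; the namespace URI is
# made up for the example):
#
#   >>> xpath_with_ns('media:song/media:author', {'media': 'http://example.com/'})
#   '{http://example.com/}song/{http://example.com/}author'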


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_html_by_id(id, html):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
490 """
491 def find_or_raise(haystack, needle, exc):
492 try:
493 return haystack.index(needle)
494 except ValueError:
495 raise exc
496 closing_tag = f'</{tag}>'
497 whole_start = find_or_raise(
498 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
499 content_start = find_or_raise(
500 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
501 content_start += whole_start + 1
502 with HTMLBreakOnClosingTagParser() as parser:
503 parser.feed(html[whole_start:content_start])
504 if not parser.tagstack or parser.tagstack[0] != tag:
505 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
506 offset = content_start
507 while offset < len(html):
508 next_closing_tag_start = find_or_raise(
509 html[offset:], closing_tag,
510 compat_HTMLParseError(f'closing {tag} tag not found'))
511 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
512 try:
513 parser.feed(html[offset:offset + next_closing_tag_end])
514 offset += next_closing_tag_end
515 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
516 return html[content_start:offset + next_closing_tag_start], \
517 html[whole_start:offset + next_closing_tag_end]
518 raise compat_HTMLParseError('unexpected end of html')
519
520
521 class HTMLAttributeParser(compat_HTMLParser):
522 """Trivial HTML parser to gather the attributes for a single element"""
523
524 def __init__(self):
525 self.attrs = {}
526 compat_HTMLParser.__init__(self)
527
528 def handle_starttag(self, tag, attrs):
529 self.attrs = dict(attrs)
530
531
532 class HTMLListAttrsParser(compat_HTMLParser):
533 """HTML parser to gather the attributes for the elements of a list"""
534
535 def __init__(self):
536 compat_HTMLParser.__init__(self)
537 self.items = []
538 self._level = 0
539
540 def handle_starttag(self, tag, attrs):
541 if tag == 'li' and self._level == 0:
542 self.items.append(dict(attrs))
543 self._level += 1
544
545 def handle_endtag(self, tag):
546 self._level -= 1
547
548
549 def extract_attributes(html_element):
550 """Given a string for an HTML element such as
551 <el
552 a="foo" B="bar" c="&98;az" d=boz
553 empty= noval entity="&amp;"
554 sq='"' dq="'"
555 >
556 Decode and return a dictionary of attributes.
557 {
558 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
559 'empty': '', 'noval': None, 'entity': '&',
560 'sq': '"', 'dq': '\''
561 }.
562 """
563 parser = HTMLAttributeParser()
564 with contextlib.suppress(compat_HTMLParseError):
565 parser.feed(html_element)
566 parser.close()
567 return parser.attrs
568
569
570 def parse_list(webpage):
571 """Given a string for an series of HTML <li> elements,
572 return a dictionary of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
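
# For instance (illustrative only): clean_html('<p>Foo<br/>bar</p>') collapses
# the markup to 'Foo\nbar' -- <br> becomes a newline, remaining tags are
# stripped, and entities are decoded.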


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except LockingUnsupportedError:
                stream = open(filename, open_mode)
            return (stream, filename)
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
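
# A quick sketch of the expected behaviour (RFC 2822 date; value computed by hand):
#
#   >>> timeconvert('Wed, 02 Oct 2002 13:00:00 GMT')
#   1033563600
#   >>> timeconvert('not a date') is None
#   True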


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = '(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
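
# Illustrative behaviour (these values match the project's test-suite expectations):
#
#   >>> sanitize_filename('AT&T', restricted=True)
#   'AT_T'
#   >>> sanitize_filename('New World record at 0:12:34')
#   'New World record at 0_12_34'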


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)
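
# On non-Windows platforms (without force=True) the path is returned untouched;
# on Windows, characters that are invalid in a path component are replaced with
# '#', e.g. (illustrative): 'abc|def' -> 'abc#def'.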


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url


def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
    return url, 'Basic ' + auth_payload.decode('utf-8')
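
# A small sketch of what this returns (the credentials are made up):
#
#   >>> extract_basic_auth('http://user:pass@example.com/x')
#   ('http://example.com/x', 'Basic dXNlcjpwYXNz')
#   >>> extract_basic_auth('http://example.com/x')
#   ('http://example.com/x', None)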


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        with contextlib.suppress(ValueError):
            return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert isinstance(s, str)

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        return process_communicate_or_kill(self, *args, **kwargs)


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert isinstance(s, str)
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
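
# Example output (illustrative):
#
#   >>> formatSeconds(3661.5, msec=True)
#   '1:01:01.500'
#   >>> formatSeconds(45)
#   '45'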


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        with contextlib.suppress(ssl.SSLError):
            ssl_context.load_verify_locations(cadata=cert)


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
                # Work around the issue in load_default_certs when there are bad certificates. See:
                # https://github.com/yt-dlp/yt-dlp/issues/1060,
                # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    # Create a new context to discard any certificates that were already loaded
                    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
                    context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , '
           'filling out the appropriate issue template. '
           'Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super().__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers
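
# For example (illustrative), given
#     {'Youtubedl-no-compression': 'True', 'Accept-Encoding': 'gzip', 'User-Agent': 'UA'}
# the result is {'User-Agent': 'UA'}: the Accept-Encoding header is dropped and
# the internal marker header is removed before the request is sent.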


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = compat_urllib_request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if isinstance(self.timeout, (int, float)):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure:
                    secure = 'TRUE'
                else:
                    secure = 'FALSE'
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                else:
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                else:
                    expires = ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n')
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True)


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                          # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                        # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|               # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                         # optional space
                (?P<sign>\+|-)                               # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})   # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    with contextlib.suppress(ValueError):
        date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
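
# Sketch of the expected results (values computed by hand; 2022-01-01T00:00:00Z
# is 1640995200):
#
#   >>> parse_iso8601('2022-01-01T12:00:00Z')
#   1641038400
#   >>> parse_iso8601('2022-01-01T12:00:00+05:30')
#   1641018600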


def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            with contextlib.suppress(ValueError):
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
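
# Illustrative conversions (matching the project's test-suite expectations):
#
#   >>> unified_strdate('December 21, 2010')
#   '20101221'
#   >>> unified_strdate('1968-12-10')
#   '19681210'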


def unified_timestamp(date_str, day_first=True):
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        with contextlib.suppress(ValueError):
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
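
# Illustrative timestamps (values computed by hand, in the style of the
# project's tests):
#
#   >>> unified_timestamp('December 15, 2017 at 7:49 am')
#   1513324140
#   >>> unified_timestamp('2018-03-14T08:32:43.1493874+00:00')
#   1521016363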


def determine_ext(url, default_ext='unknown_video'):
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
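
# Expected guesses (illustrative):
#
#   >>> determine_ext('http://example.com/foo/bar.mp4/?download')
#   'mp4'
#   >>> determine_ext('http://example.com/play')
#   'unknown_video'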
1702
1703
1704 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1705 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1706
1707
1708 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1709 """
1710 Return a datetime object from a string in the format YYYYMMDD or
1711 (now|today|yesterday|date)[+-][0-9]+(microsecond|second|minute|hour|day|week|month|year)(s)?
1712
1713 format: string date format used to parse date_str
1714 precision: round the time portion of a datetime object.
1715 auto|microsecond|second|minute|hour|day.
1716 auto: round to the unit provided in date_str (if applicable).
1717 """
1718 auto_precision = False
1719 if precision == 'auto':
1720 auto_precision = True
1721 precision = 'microsecond'
1722 today = datetime_round(datetime.datetime.utcnow(), precision)
1723 if date_str in ('now', 'today'):
1724 return today
1725 if date_str == 'yesterday':
1726 return today - datetime.timedelta(days=1)
1727 match = re.match(
1728 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
1729 date_str)
1730 if match is not None:
1731 start_time = datetime_from_str(match.group('start'), precision, format)
1732 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1733 unit = match.group('unit')
1734 if unit == 'month' or unit == 'year':
1735 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1736 unit = 'day'
1737 else:
1738 if unit == 'week':
1739 unit = 'day'
1740 time *= 7
1741 delta = datetime.timedelta(**{unit + 's': time})
1742 new_date = start_time + delta
1743 if auto_precision:
1744 return datetime_round(new_date, unit)
1745 return new_date
1746
1747 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1748
1749
1750 def date_from_str(date_str, format='%Y%m%d', strict=False):
1751 """
1752 Return a datetime object from a string in the format YYYYMMDD or
1753 (now|today|yesterday|date)[+-][0-9]+(microsecond|second|minute|hour|day|week|month|year)(s)?
1754
1755 If "strict", only YYYYMMDD or (now|today)[+-][0-9]+(day|week|month|year)(s)? is allowed
1756
1757 format: string date format used to parse date_str
1758 """
1759 if strict and not re.fullmatch(r'\d{8}|(now|today)[+-]\d+(day|week|month|year)(s)?', date_str):
1760 raise ValueError(f'Invalid date format {date_str}')
1761 return datetime_from_str(date_str, precision='microsecond', format=format).date()
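
# Illustrative examples (relative forms are evaluated against UTC "now"):
#   date_from_str('20200229')   # -> datetime.date(2020, 2, 29)
#   date_from_str('now-1week')  # -> the calendar date of 7 days ago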
1762
1763
1764 def datetime_add_months(dt, months):
1765 """Increment/Decrement a datetime object by months."""
1766 month = dt.month + months - 1
1767 year = dt.year + month // 12
1768 month = month % 12 + 1
1769 day = min(dt.day, calendar.monthrange(year, month)[1])
1770 return dt.replace(year, month, day)
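
# Illustrative examples (note the day is clamped to the target month's length):
#   datetime_add_months(datetime.datetime(2020, 1, 31), 1)   # -> 2020-02-29 00:00:00
#   datetime_add_months(datetime.datetime(2020, 1, 15), -2)  # -> 2019-11-15 00:00:00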
1771
1772
1773 def datetime_round(dt, precision='day'):
1774 """
1775 Round a datetime object's time to a specific precision
1776 """
1777 if precision == 'microsecond':
1778 return dt
1779
1780 unit_seconds = {
1781 'day': 86400,
1782 'hour': 3600,
1783 'minute': 60,
1784 'second': 1,
1785 }
1786 roundto = lambda x, n: ((x + n / 2) // n) * n
1787 timestamp = calendar.timegm(dt.timetuple())
1788 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
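
# Illustrative examples (half-way values round up):
#   datetime_round(datetime.datetime(2020, 1, 1, 11, 59, 59), 'hour')  # -> 2020-01-01 12:00:00
#   datetime_round(datetime.datetime(2020, 1, 1, 12, 0, 0), 'day')     # -> 2020-01-02 00:00:00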
1789
1790
1791 def hyphenate_date(date_str):
1792 """
1793 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1794 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1795 if match is not None:
1796 return '-'.join(match.groups())
1797 else:
1798 return date_str
1799
1800
1801 class DateRange:
1802 """Represents a time interval between two dates"""
1803
1804 def __init__(self, start=None, end=None):
1805 """start and end must be strings in the format accepted by date"""
1806 if start is not None:
1807 self.start = date_from_str(start, strict=True)
1808 else:
1809 self.start = datetime.datetime.min.date()
1810 if end is not None:
1811 self.end = date_from_str(end, strict=True)
1812 else:
1813 self.end = datetime.datetime.max.date()
1814 if self.start > self.end:
1815 raise ValueError('Date range: "%s"; the start date must be before the end date' % self)
1816
1817 @classmethod
1818 def day(cls, day):
1819 """Returns a range that only contains the given day"""
1820 return cls(day, day)
1821
1822 def __contains__(self, date):
1823 """Check if the date is in the range"""
1824 if not isinstance(date, datetime.date):
1825 date = date_from_str(date)
1826 return self.start <= date <= self.end
1827
1828 def __str__(self):
1829 return f'{self.start.isoformat()} - {self.end.isoformat()}'
1830
1831
1832 def platform_name():
1833 """ Returns the platform name as a compat_str """
1834 res = platform.platform()
1835 if isinstance(res, bytes):
1836 res = res.decode(preferredencoding())
1837
1838 assert isinstance(res, compat_str)
1839 return res
1840
1841
1842 def get_windows_version():
1843 ''' Get Windows version. None if it's not running on Windows '''
1844 if compat_os_name == 'nt':
1845 return version_tuple(platform.win32_ver()[1])
1846 else:
1847 return None
1848
1849
1850 def write_string(s, out=None, encoding=None):
1851 assert isinstance(s, str)
1852 out = out or sys.stderr
1853
1854 if 'b' in getattr(out, 'mode', ''):
1855 byt = s.encode(encoding or preferredencoding(), 'ignore')
1856 out.write(byt)
1857 elif hasattr(out, 'buffer'):
1858 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1859 byt = s.encode(enc, 'ignore')
1860 out.buffer.write(byt)
1861 else:
1862 out.write(s)
1863 out.flush()
1864
1865
1866 def bytes_to_intlist(bs):
1867 if not bs:
1868 return []
1869 if isinstance(bs[0], int): # Python 3
1870 return list(bs)
1871 else:
1872 return [ord(c) for c in bs]
1873
1874
1875 def intlist_to_bytes(xs):
1876 if not xs:
1877 return b''
1878 return compat_struct_pack('%dB' % len(xs), *xs)
1879
1880
1881 class LockingUnsupportedError(IOError):
1882 msg = 'File locking is not supported on this platform'
1883
1884 def __init__(self):
1885 super().__init__(self.msg)
1886
1887
1888 # Cross-platform file locking
1889 if sys.platform == 'win32':
1890 import ctypes.wintypes
1891 import msvcrt
1892
1893 class OVERLAPPED(ctypes.Structure):
1894 _fields_ = [
1895 ('Internal', ctypes.wintypes.LPVOID),
1896 ('InternalHigh', ctypes.wintypes.LPVOID),
1897 ('Offset', ctypes.wintypes.DWORD),
1898 ('OffsetHigh', ctypes.wintypes.DWORD),
1899 ('hEvent', ctypes.wintypes.HANDLE),
1900 ]
1901
1902 kernel32 = ctypes.windll.kernel32
1903 LockFileEx = kernel32.LockFileEx
1904 LockFileEx.argtypes = [
1905 ctypes.wintypes.HANDLE, # hFile
1906 ctypes.wintypes.DWORD, # dwFlags
1907 ctypes.wintypes.DWORD, # dwReserved
1908 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1909 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1910 ctypes.POINTER(OVERLAPPED) # Overlapped
1911 ]
1912 LockFileEx.restype = ctypes.wintypes.BOOL
1913 UnlockFileEx = kernel32.UnlockFileEx
1914 UnlockFileEx.argtypes = [
1915 ctypes.wintypes.HANDLE, # hFile
1916 ctypes.wintypes.DWORD, # dwReserved
1917 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1918 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1919 ctypes.POINTER(OVERLAPPED) # Overlapped
1920 ]
1921 UnlockFileEx.restype = ctypes.wintypes.BOOL
1922 whole_low = 0xffffffff
1923 whole_high = 0x7fffffff
1924
1925 def _lock_file(f, exclusive, block):
1926 overlapped = OVERLAPPED()
1927 overlapped.Offset = 0
1928 overlapped.OffsetHigh = 0
1929 overlapped.hEvent = 0
1930 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1931
1932 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
1933 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
1934 0, whole_low, whole_high, f._lock_file_overlapped_p):
1935 raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())
1936
1937 def _unlock_file(f):
1938 assert f._lock_file_overlapped_p
1939 handle = msvcrt.get_osfhandle(f.fileno())
1940 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
1941 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1942
1943 else:
1944 try:
1945 import fcntl
1946
1947 def _lock_file(f, exclusive, block):
1948 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
1949 if not block:
1950 flags |= fcntl.LOCK_NB
1951 try:
1952 fcntl.flock(f, flags)
1953 except BlockingIOError:
1954 raise
1955 except OSError: # AOSP does not have flock()
1956 fcntl.lockf(f, flags)
1957
1958 def _unlock_file(f):
1959 try:
1960 fcntl.flock(f, fcntl.LOCK_UN)
1961 except OSError:
1962 fcntl.lockf(f, fcntl.LOCK_UN)
1963
1964 except ImportError:
1965
1966 def _lock_file(f, exclusive, block):
1967 raise LockingUnsupportedError()
1968
1969 def _unlock_file(f):
1970 raise LockingUnsupportedError()
1971
1972
1973 class locked_file:
1974 locked = False
1975
1976 def __init__(self, filename, mode, block=True, encoding=None):
1977 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
1978 raise NotImplementedError(mode)
1979 self.mode, self.block = mode, block
1980
1981 writable = any(f in mode for f in 'wax+')
1982 readable = any(f in mode for f in 'r+')
1983 flags = functools.reduce(operator.ior, (
1984 getattr(os, 'O_CLOEXEC', 0), # UNIX only
1985 getattr(os, 'O_BINARY', 0), # Windows only
1986 getattr(os, 'O_NOINHERIT', 0), # Windows only
1987 os.O_CREAT if writable else 0, # O_TRUNC only after locking
1988 os.O_APPEND if 'a' in mode else 0,
1989 os.O_EXCL if 'x' in mode else 0,
1990 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
1991 ))
1992
1993 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
1994
1995 def __enter__(self):
1996 exclusive = 'r' not in self.mode
1997 try:
1998 _lock_file(self.f, exclusive, self.block)
1999 self.locked = True
2000 except OSError:
2001 self.f.close()
2002 raise
2003 if 'w' in self.mode:
2004 self.f.truncate()
2005 return self
2006
2007 def unlock(self):
2008 if not self.locked:
2009 return
2010 try:
2011 _unlock_file(self.f)
2012 finally:
2013 self.locked = False
2014
2015 def __exit__(self, *_):
2016 try:
2017 self.unlock()
2018 finally:
2019 self.f.close()
2020
2021 open = __enter__
2022 close = __exit__
2023
2024 def __getattr__(self, attr):
2025 return getattr(self.f, attr)
2026
2027 def __iter__(self):
2028 return iter(self.f)
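
# Illustrative usage ('state.txt' is a made-up filename): take an exclusive
# lock for writing; the lock is released and the file closed on exit:
#   with locked_file('state.txt', 'w', block=True) as f:
#       f.write('...')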
2029
2030
2031 def get_filesystem_encoding():
2032 encoding = sys.getfilesystemencoding()
2033 return encoding if encoding is not None else 'utf-8'
2034
2035
2036 def shell_quote(args):
2037 quoted_args = []
2038 encoding = get_filesystem_encoding()
2039 for a in args:
2040 if isinstance(a, bytes):
2041 # We may get a filename encoded with 'encodeFilename'
2042 a = a.decode(encoding)
2043 quoted_args.append(compat_shlex_quote(a))
2044 return ' '.join(quoted_args)
2045
2046
2047 def smuggle_url(url, data):
2048 """ Pass additional data in a URL for internal use. """
2049
2050 url, idata = unsmuggle_url(url, {})
2051 data.update(idata)
2052 sdata = compat_urllib_parse_urlencode(
2053 {'__youtubedl_smuggle': json.dumps(data)})
2054 return url + '#' + sdata
2055
2056
2057 def unsmuggle_url(smug_url, default=None):
2058 if '#__youtubedl_smuggle' not in smug_url:
2059 return smug_url, default
2060 url, _, sdata = smug_url.rpartition('#')
2061 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2062 data = json.loads(jsond)
2063 return url, data
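
# Illustrative round trip (the URL fragment carries the smuggled JSON):
#   url = smuggle_url('http://example.com/video', {'referer': 'http://example.com/'})
#   unsmuggle_url(url)  # -> ('http://example.com/video', {'referer': 'http://example.com/'})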
2064
2065
2066 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2067 """ Formats numbers with decimal suffixes like K, M, etc """
2068 num, factor = float_or_none(num), float(factor)
2069 if num is None or num < 0:
2070 return None
2071 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2072 exponent = 0 if num == 0 else max(0, min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES)))  # never negative for num < 1
2073 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2074 if factor == 1024:
2075 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2076 converted = num / (factor ** exponent)
2077 return fmt % (converted, suffix)
2078
2079
2080 def format_bytes(bytes):
2081 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
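
# Illustrative examples (factor 1024, so the suffixes are binary ones):
#   format_bytes(2000)     # -> '1.95KiB'
#   format_bytes(1536000)  # -> '1.46MiB'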
2082
2083
2084 def lookup_unit_table(unit_table, s):
2085 units_re = '|'.join(re.escape(u) for u in unit_table)
2086 m = re.match(
2087 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2088 if not m:
2089 return None
2090 num_str = m.group('num').replace(',', '.')
2091 mult = unit_table[m.group('unit')]
2092 return int(float(num_str) * mult)
2093
2094
2095 def parse_filesize(s):
2096 if s is None:
2097 return None
2098
2099 # The lower-case forms are of course incorrect and unofficial,
2100 # but we support those too
2101 _UNIT_TABLE = {
2102 'B': 1,
2103 'b': 1,
2104 'bytes': 1,
2105 'KiB': 1024,
2106 'KB': 1000,
2107 'kB': 1024,
2108 'Kb': 1000,
2109 'kb': 1000,
2110 'kilobytes': 1000,
2111 'kibibytes': 1024,
2112 'MiB': 1024 ** 2,
2113 'MB': 1000 ** 2,
2114 'mB': 1024 ** 2,
2115 'Mb': 1000 ** 2,
2116 'mb': 1000 ** 2,
2117 'megabytes': 1000 ** 2,
2118 'mebibytes': 1024 ** 2,
2119 'GiB': 1024 ** 3,
2120 'GB': 1000 ** 3,
2121 'gB': 1024 ** 3,
2122 'Gb': 1000 ** 3,
2123 'gb': 1000 ** 3,
2124 'gigabytes': 1000 ** 3,
2125 'gibibytes': 1024 ** 3,
2126 'TiB': 1024 ** 4,
2127 'TB': 1000 ** 4,
2128 'tB': 1024 ** 4,
2129 'Tb': 1000 ** 4,
2130 'tb': 1000 ** 4,
2131 'terabytes': 1000 ** 4,
2132 'tebibytes': 1024 ** 4,
2133 'PiB': 1024 ** 5,
2134 'PB': 1000 ** 5,
2135 'pB': 1024 ** 5,
2136 'Pb': 1000 ** 5,
2137 'pb': 1000 ** 5,
2138 'petabytes': 1000 ** 5,
2139 'pebibytes': 1024 ** 5,
2140 'EiB': 1024 ** 6,
2141 'EB': 1000 ** 6,
2142 'eB': 1024 ** 6,
2143 'Eb': 1000 ** 6,
2144 'eb': 1000 ** 6,
2145 'exabytes': 1000 ** 6,
2146 'exbibytes': 1024 ** 6,
2147 'ZiB': 1024 ** 7,
2148 'ZB': 1000 ** 7,
2149 'zB': 1024 ** 7,
2150 'Zb': 1000 ** 7,
2151 'zb': 1000 ** 7,
2152 'zettabytes': 1000 ** 7,
2153 'zebibytes': 1024 ** 7,
2154 'YiB': 1024 ** 8,
2155 'YB': 1000 ** 8,
2156 'yB': 1024 ** 8,
2157 'Yb': 1000 ** 8,
2158 'yb': 1000 ** 8,
2159 'yottabytes': 1000 ** 8,
2160 'yobibytes': 1024 ** 8,
2161 }
2162
2163 return lookup_unit_table(_UNIT_TABLE, s)
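
# Illustrative examples (decimal vs binary units; comma accepted as decimal separator):
#   parse_filesize('5 MB')     # -> 5000000
#   parse_filesize('1.2MiB')   # -> 1258291
#   parse_filesize('1,24 KB')  # -> 1240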
2164
2165
2166 def parse_count(s):
2167 if s is None:
2168 return None
2169
2170 s = re.sub(r'^[^\d]+\s', '', s).strip()
2171
2172 if re.match(r'^[\d,.]+$', s):
2173 return str_to_int(s)
2174
2175 _UNIT_TABLE = {
2176 'k': 1000,
2177 'K': 1000,
2178 'm': 1000 ** 2,
2179 'M': 1000 ** 2,
2180 'kk': 1000 ** 2,
2181 'KK': 1000 ** 2,
2182 'b': 1000 ** 3,
2183 'B': 1000 ** 3,
2184 }
2185
2186 ret = lookup_unit_table(_UNIT_TABLE, s)
2187 if ret is not None:
2188 return ret
2189
2190 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2191 if mobj:
2192 return str_to_int(mobj.group(1))
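
# Illustrative examples:
#   parse_count('1.8M')         # -> 1800000
#   parse_count('1,234 views')  # -> 1234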
2193
2194
2195 def parse_resolution(s, *, lenient=False):
2196 if s is None:
2197 return {}
2198
2199 if lenient:
2200 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2201 else:
2202 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2203 if mobj:
2204 return {
2205 'width': int(mobj.group('w')),
2206 'height': int(mobj.group('h')),
2207 }
2208
2209 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2210 if mobj:
2211 return {'height': int(mobj.group(1))}
2212
2213 mobj = re.search(r'\b([48])[kK]\b', s)
2214 if mobj:
2215 return {'height': int(mobj.group(1)) * 540}
2216
2217 return {}
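
# Illustrative examples:
#   parse_resolution('1920x1080')  # -> {'width': 1920, 'height': 1080}
#   parse_resolution('720p')       # -> {'height': 720}
#   parse_resolution('4k')         # -> {'height': 2160}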
2218
2219
2220 def parse_bitrate(s):
2221 if not isinstance(s, compat_str):
2222 return
2223 mobj = re.search(r'\b(\d+)\s*kbps', s)
2224 if mobj:
2225 return int(mobj.group(1))
2226
2227
2228 def month_by_name(name, lang='en'):
2229 """ Return the number of a month by its (locale-independent) name """
2230
2231 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2232
2233 try:
2234 return month_names.index(name) + 1
2235 except ValueError:
2236 return None
2237
2238
2239 def month_by_abbreviation(abbrev):
2240 """ Return the number of a month by its (locale-independent) English
2241 abbreviation """
2242
2243 try:
2244 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2245 except ValueError:
2246 return None
2247
2248
2249 def fix_xml_ampersands(xml_str):
2250 """Replace all the '&' by '&amp;' in XML"""
2251 return re.sub(
2252 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2253 '&amp;',
2254 xml_str)
2255
2256
2257 def setproctitle(title):
2258 assert isinstance(title, compat_str)
2259
2260 # ctypes in Jython is not complete
2261 # http://bugs.jython.org/issue2148
2262 if sys.platform.startswith('java'):
2263 return
2264
2265 try:
2266 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2267 except OSError:
2268 return
2269 except TypeError:
2270 # LoadLibrary in Windows Python 2.7.13 only expects
2271 # a bytestring, but since unicode_literals turns
2272 # every string into a unicode string, it fails.
2273 return
2274 title_bytes = title.encode('utf-8')
2275 buf = ctypes.create_string_buffer(len(title_bytes))
2276 buf.value = title_bytes
2277 try:
2278 libc.prctl(15, buf, 0, 0, 0)
2279 except AttributeError:
2280 return # Strange libc, just skip this
2281
2282
2283 def remove_start(s, start):
2284 return s[len(start):] if s is not None and s.startswith(start) else s
2285
2286
2287 def remove_end(s, end):
2288 return s[:-len(end)] if s is not None and s.endswith(end) else s
2289
2290
2291 def remove_quotes(s):
2292 if s is None or len(s) < 2:
2293 return s
2294 for quote in ('"', "'", ):
2295 if s[0] == quote and s[-1] == quote:
2296 return s[1:-1]
2297 return s
2298
2299
2300 def get_domain(url):
2301 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2302 return domain.group('domain') if domain else None
2303
2304
2305 def url_basename(url):
2306 path = compat_urlparse.urlparse(url).path
2307 return path.strip('/').split('/')[-1]
2308
2309
2310 def base_url(url):
2311 return re.match(r'https?://[^?#&]+/', url).group()
2312
2313
2314 def urljoin(base, path):
2315 if isinstance(path, bytes):
2316 path = path.decode('utf-8')
2317 if not isinstance(path, compat_str) or not path:
2318 return None
2319 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2320 return path
2321 if isinstance(base, bytes):
2322 base = base.decode('utf-8')
2323 if not isinstance(base, compat_str) or not re.match(
2324 r'^(?:https?:)?//', base):
2325 return None
2326 return compat_urlparse.urljoin(base, path)
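
# Illustrative examples (protocol-relative paths are returned unchanged):
#   urljoin('https://example.com/a/', 'b/c')                  # -> 'https://example.com/a/b/c'
#   urljoin('https://example.com/a/', '//cdn.example.com/x')  # -> '//cdn.example.com/x'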
2327
2328
2329 class HEADRequest(compat_urllib_request.Request):
2330 def get_method(self):
2331 return 'HEAD'
2332
2333
2334 class PUTRequest(compat_urllib_request.Request):
2335 def get_method(self):
2336 return 'PUT'
2337
2338
2339 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2340 if get_attr and v is not None:
2341 v = getattr(v, get_attr, None)
2342 try:
2343 return int(v) * invscale // scale
2344 except (ValueError, TypeError, OverflowError):
2345 return default
2346
2347
2348 def str_or_none(v, default=None):
2349 return default if v is None else compat_str(v)
2350
2351
2352 def str_to_int(int_str):
2353 """ A more relaxed version of int_or_none """
2354 if isinstance(int_str, int):
2355 return int_str
2356 elif isinstance(int_str, compat_str):
2357 int_str = re.sub(r'[,\.\+]', '', int_str)
2358 return int_or_none(int_str)
2359
2360
2361 def float_or_none(v, scale=1, invscale=1, default=None):
2362 if v is None:
2363 return default
2364 try:
2365 return float(v) * invscale / scale
2366 except (ValueError, TypeError):
2367 return default
2368
2369
2370 def bool_or_none(v, default=None):
2371 return v if isinstance(v, bool) else default
2372
2373
2374 def strip_or_none(v, default=None):
2375 return v.strip() if isinstance(v, compat_str) else default
2376
2377
2378 def url_or_none(url):
2379 if not url or not isinstance(url, compat_str):
2380 return None
2381 url = url.strip()
2382 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2383
2384
2385 def request_to_url(req):
2386 if isinstance(req, compat_urllib_request.Request):
2387 return req.get_full_url()
2388 else:
2389 return req
2390
2391
2392 def strftime_or_none(timestamp, date_format, default=None):
2393 datetime_object = None
2394 try:
2395 if isinstance(timestamp, (int, float)): # unix timestamp
2396 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2397 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2398 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2399 return datetime_object.strftime(date_format)
2400 except (ValueError, TypeError, AttributeError):
2401 return default
2402
2403
2404 def parse_duration(s):
2405 if not isinstance(s, str):
2406 return None
2407 s = s.strip()
2408 if not s:
2409 return None
2410
2411 days, hours, mins, secs, ms = [None] * 5
2412 m = re.match(r'''(?x)
2413 (?P<before_secs>
2414 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2415 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2416 (?P<ms>[.:][0-9]+)?Z?$
2417 ''', s)
2418 if m:
2419 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2420 else:
2421 m = re.match(
2422 r'''(?ix)(?:P?
2423 (?:
2424 [0-9]+\s*y(?:ears?)?,?\s*
2425 )?
2426 (?:
2427 [0-9]+\s*m(?:onths?)?,?\s*
2428 )?
2429 (?:
2430 [0-9]+\s*w(?:eeks?)?,?\s*
2431 )?
2432 (?:
2433 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2434 )?
2435 T)?
2436 (?:
2437 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2438 )?
2439 (?:
2440 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2441 )?
2442 (?:
2443 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2444 )?Z?$''', s)
2445 if m:
2446 days, hours, mins, secs, ms = m.groups()
2447 else:
2448 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2449 if m:
2450 hours, mins = m.groups()
2451 else:
2452 return None
2453
2454 if ms:
2455 ms = ms.replace(':', '.')
2456 return sum(float(part or 0) * mult for part, mult in (
2457 (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))
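
# Illustrative examples (result is in seconds, as a float):
#   parse_duration('9:54:00')  # -> 35640.0
#   parse_duration('PT1H30M')  # -> 5400.0
#   parse_duration('3 min')    # -> 180.0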
2458
2459
2460 def prepend_extension(filename, ext, expected_real_ext=None):
2461 name, real_ext = os.path.splitext(filename)
2462 return (
2463 f'{name}.{ext}{real_ext}'
2464 if not expected_real_ext or real_ext[1:] == expected_real_ext
2465 else f'{filename}.{ext}')
2466
2467
2468 def replace_extension(filename, ext, expected_real_ext=None):
2469 name, real_ext = os.path.splitext(filename)
2470 return '{}.{}'.format(
2471 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2472 ext)
2473
2474
2475 def check_executable(exe, args=[]):
2476 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2477 args can be a list of arguments for a short output (like -version) """
2478 try:
2479 Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
2480 except OSError:
2481 return False
2482 return exe
2483
2484
2485 def _get_exe_version_output(exe, args, *, to_screen=None):
2486 if to_screen:
2487 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2488 try:
2489 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2490 # SIGTTOU if yt-dlp is run in the background.
2491 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2492 out, _ = Popen(
2493 [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2494 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
2495 except OSError:
2496 return False
2497 if isinstance(out, bytes): # Popen without text mode yields bytes
2498 out = out.decode('ascii', 'ignore')
2499 return out
2500
2501
2502 def detect_exe_version(output, version_re=None, unrecognized='present'):
2503 assert isinstance(output, compat_str)
2504 if version_re is None:
2505 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2506 m = re.search(version_re, output)
2507 if m:
2508 return m.group(1)
2509 else:
2510 return unrecognized
2511
2512
2513 def get_exe_version(exe, args=['--version'],
2514 version_re=None, unrecognized='present'):
2515 """ Returns the version of the specified executable,
2516 or False if the executable is not present """
2517 out = _get_exe_version_output(exe, args)
2518 return detect_exe_version(out, version_re, unrecognized) if out else False
2519
2520
2521 class LazyList(collections.abc.Sequence):
2522 ''' Lazy immutable list from an iterable
2523 Note that slices of a LazyList are lists and not LazyLists'''
2524
2525 class IndexError(IndexError):
2526 pass
2527
2528 def __init__(self, iterable, *, reverse=False, _cache=None):
2529 self.__iterable = iter(iterable)
2530 self.__cache = [] if _cache is None else _cache
2531 self.__reversed = reverse
2532
2533 def __iter__(self):
2534 if self.__reversed:
2535 # We need to consume the entire iterable to iterate in reverse
2536 yield from self.exhaust()
2537 return
2538 yield from self.__cache
2539 for item in self.__iterable:
2540 self.__cache.append(item)
2541 yield item
2542
2543 def __exhaust(self):
2544 self.__cache.extend(self.__iterable)
2545 # Discard the emptied iterable to make it pickle-able
2546 self.__iterable = []
2547 return self.__cache
2548
2549 def exhaust(self):
2550 ''' Evaluate the entire iterable '''
2551 return self.__exhaust()[::-1 if self.__reversed else 1]
2552
2553 @staticmethod
2554 def __reverse_index(x):
2555 return None if x is None else -(x + 1)
2556
2557 def __getitem__(self, idx):
2558 if isinstance(idx, slice):
2559 if self.__reversed:
2560 idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
2561 start, stop, step = idx.start, idx.stop, idx.step or 1
2562 elif isinstance(idx, int):
2563 if self.__reversed:
2564 idx = self.__reverse_index(idx)
2565 start, stop, step = idx, idx, 0
2566 else:
2567 raise TypeError('indices must be integers or slices')
2568 if ((start or 0) < 0 or (stop or 0) < 0
2569 or (start is None and step < 0)
2570 or (stop is None and step > 0)):
2571 # We need to consume the entire iterable to be able to slice from the end
2572 # Obviously, never use this with infinite iterables
2573 self.__exhaust()
2574 try:
2575 return self.__cache[idx]
2576 except IndexError as e:
2577 raise self.IndexError(e) from e
2578 n = max(start or 0, stop or 0) - len(self.__cache) + 1
2579 if n > 0:
2580 self.__cache.extend(itertools.islice(self.__iterable, n))
2581 try:
2582 return self.__cache[idx]
2583 except IndexError as e:
2584 raise self.IndexError(e) from e
2585
2586 def __bool__(self):
2587 try:
2588 self[-1] if self.__reversed else self[0]
2589 except self.IndexError:
2590 return False
2591 return True
2592
2593 def __len__(self):
2594 self.__exhaust()
2595 return len(self.__cache)
2596
2597 def __reversed__(self):
2598 return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)
2599
2600 def __copy__(self):
2601 return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)
2602
2603 def __repr__(self):
2604 # repr and str should mimic a list. So we exhaust the iterable
2605 return repr(self.exhaust())
2606
2607 def __str__(self):
2608 return repr(self.exhaust())
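
# Illustrative usage: items are pulled from the iterable only as needed,
# so even an infinite generator can be sliced:
#   LazyList(itertools.count())[:5]  # -> [0, 1, 2, 3, 4]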
2609
2610
2611 class PagedList:
2612
2613 class IndexError(IndexError):
2614 pass
2615
2616 def __len__(self):
2617 # This is only useful for tests
2618 return len(self.getslice())
2619
2620 def __init__(self, pagefunc, pagesize, use_cache=True):
2621 self._pagefunc = pagefunc
2622 self._pagesize = pagesize
2623 self._pagecount = float('inf')
2624 self._use_cache = use_cache
2625 self._cache = {}
2626
2627 def getpage(self, pagenum):
2628 page_results = self._cache.get(pagenum)
2629 if page_results is None:
2630 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2631 if self._use_cache:
2632 self._cache[pagenum] = page_results
2633 return page_results
2634
2635 def getslice(self, start=0, end=None):
2636 return list(self._getslice(start, end))
2637
2638 def _getslice(self, start, end):
2639 raise NotImplementedError('This method must be implemented by subclasses')
2640
2641 def __getitem__(self, idx):
2642 assert self._use_cache, 'Indexing PagedList requires cache'
2643 if not isinstance(idx, int) or idx < 0:
2644 raise TypeError('indices must be non-negative integers')
2645 entries = self.getslice(idx, idx + 1)
2646 if not entries:
2647 raise self.IndexError()
2648 return entries[0]
2649
2650
2651 class OnDemandPagedList(PagedList):
2652 """Download pages until a page with fewer than the maximum number of results"""
2653
2654 def _getslice(self, start, end):
2655 for pagenum in itertools.count(start // self._pagesize):
2656 firstid = pagenum * self._pagesize
2657 nextfirstid = pagenum * self._pagesize + self._pagesize
2658 if start >= nextfirstid:
2659 continue
2660
2661 startv = (
2662 start % self._pagesize
2663 if firstid <= start < nextfirstid
2664 else 0)
2665 endv = (
2666 ((end - 1) % self._pagesize) + 1
2667 if (end is not None and firstid <= end <= nextfirstid)
2668 else None)
2669
2670 try:
2671 page_results = self.getpage(pagenum)
2672 except Exception:
2673 self._pagecount = pagenum - 1
2674 raise
2675 if startv != 0 or endv is not None:
2676 page_results = page_results[startv:endv]
2677 yield from page_results
2678
2679 # A little optimization - if the current page is not "full", i.e. does
2680 # not contain page_size videos, then we can assume that this page
2681 # is the last one - there are no more ids on further pages -
2682 # so there is no need to query again.
2683 if len(page_results) + startv < self._pagesize:
2684 break
2685
2686 # If we got the whole page, but the next page is not interesting,
2687 # break out early as well
2688 if end == nextfirstid:
2689 break
2690
2691
2692 class InAdvancePagedList(PagedList):
2693 """PagedList with total number of pages known in advance"""
2694
2695 def __init__(self, pagefunc, pagecount, pagesize):
2696 PagedList.__init__(self, pagefunc, pagesize, True)
2697 self._pagecount = pagecount
2698
2699 def _getslice(self, start, end):
2700 start_page = start // self._pagesize
2701 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2702 skip_elems = start - start_page * self._pagesize
2703 only_more = None if end is None else end - start
2704 for pagenum in range(start_page, end_page):
2705 page_results = self.getpage(pagenum)
2706 if skip_elems:
2707 page_results = page_results[skip_elems:]
2708 skip_elems = None
2709 if only_more is not None:
2710 if len(page_results) < only_more:
2711 only_more -= len(page_results)
2712 else:
2713 yield from page_results[:only_more]
2714 break
2715 yield from page_results
2716
2717
2718 def uppercase_escape(s):
2719 unicode_escape = codecs.getdecoder('unicode_escape')
2720 return re.sub(
2721 r'\\U[0-9a-fA-F]{8}',
2722 lambda m: unicode_escape(m.group(0))[0],
2723 s)
2724
2725
2726 def lowercase_escape(s):
2727 unicode_escape = codecs.getdecoder('unicode_escape')
2728 return re.sub(
2729 r'\\u[0-9a-fA-F]{4}',
2730 lambda m: unicode_escape(m.group(0))[0],
2731 s)
2732
2733
2734 def escape_rfc3986(s):
2735 """Escape non-ASCII characters as suggested by RFC 3986"""
2736 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2737
2738
2739 def escape_url(url):
2740 """Escape URL as suggested by RFC 3986"""
2741 url_parsed = compat_urllib_parse_urlparse(url)
2742 return url_parsed._replace(
2743 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2744 path=escape_rfc3986(url_parsed.path),
2745 params=escape_rfc3986(url_parsed.params),
2746 query=escape_rfc3986(url_parsed.query),
2747 fragment=escape_rfc3986(url_parsed.fragment)
2748 ).geturl()
2749
2750
2751 def parse_qs(url):
2752 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2753
2754
2755 def read_batch_urls(batch_fd):
2756 def fixup(url):
2757 if not isinstance(url, compat_str):
2758 url = url.decode('utf-8', 'replace')
2759 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2760 for bom in BOM_UTF8:
2761 if url.startswith(bom):
2762 url = url[len(bom):]
2763 url = url.lstrip()
2764 if not url or url.startswith(('#', ';', ']')):
2765 return False
2766 # "#" cannot be stripped out since it is part of the URI
2767 # However, it can be safely stripped out when it follows whitespace
2768 return re.split(r'\s#', url, 1)[0].rstrip()
2769
2770 with contextlib.closing(batch_fd) as fd:
2771 return [url for url in map(fixup, fd) if url]
2772
2773
2774 def urlencode_postdata(*args, **kargs):
2775 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2776
2777
2778 def update_url_query(url, query):
2779 if not query:
2780 return url
2781 parsed_url = compat_urlparse.urlparse(url)
2782 qs = compat_parse_qs(parsed_url.query)
2783 qs.update(query)
2784 return compat_urlparse.urlunparse(parsed_url._replace(
2785 query=compat_urllib_parse_urlencode(qs, True)))
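
# For example (illustrative):
#   update_url_query('http://example.com/path', {'quality': ['HD'], 'format': ['mp4']})
#   # -> 'http://example.com/path?quality=HD&format=mp4'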
2786
2787
2788 def update_Request(req, url=None, data=None, headers={}, query={}):
2789 req_headers = req.headers.copy()
2790 req_headers.update(headers)
2791 req_data = data or req.data
2792 req_url = update_url_query(url or req.get_full_url(), query)
2793 req_get_method = req.get_method()
2794 if req_get_method == 'HEAD':
2795 req_type = HEADRequest
2796 elif req_get_method == 'PUT':
2797 req_type = PUTRequest
2798 else:
2799 req_type = compat_urllib_request.Request
2800 new_req = req_type(
2801 req_url, data=req_data, headers=req_headers,
2802 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2803 if hasattr(req, 'timeout'):
2804 new_req.timeout = req.timeout
2805 return new_req
2806
2807
2808 def _multipart_encode_impl(data, boundary):
2809 content_type = 'multipart/form-data; boundary=%s' % boundary
2810
2811 out = b''
2812 for k, v in data.items():
2813 out += b'--' + boundary.encode('ascii') + b'\r\n'
2814 if isinstance(k, compat_str):
2815 k = k.encode('utf-8')
2816 if isinstance(v, compat_str):
2817 v = v.encode('utf-8')
2818 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2819 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2820 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2821 if boundary.encode('ascii') in content:
2822 raise ValueError('Boundary overlaps with data')
2823 out += content
2824
2825 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2826
2827 return out, content_type
2828
2829
2830 def multipart_encode(data, boundary=None):
2831 '''
2832 Encode a dict to RFC 7578-compliant form-data
2833
2834 data:
2835 A dict where keys and values can be either Unicode or bytes-like
2836 objects.
2837 boundary:
2838 If specified, a Unicode object used as the boundary. Otherwise
2839 a random boundary is generated.
2840
2841 Reference: https://tools.ietf.org/html/rfc7578
2842 '''
2843 has_specified_boundary = boundary is not None
2844
2845 while True:
2846 if boundary is None:
2847 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2848
2849 try:
2850 out, content_type = _multipart_encode_impl(data, boundary)
2851 break
2852 except ValueError:
2853 if has_specified_boundary:
2854 raise
2855 boundary = None
2856
2857 return out, content_type
2858
2859
2860 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
2861 for val in map(d.get, variadic(key_or_keys)):
2862 if val is not None and (val or not skip_false_values):
2863 return val
2864 return default
2865
2866
2867 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
2868 for f in funcs:
2869 try:
2870 val = f(*args, **kwargs)
2871 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
2872 pass
2873 else:
2874 if expected_type is None or isinstance(val, expected_type):
2875 return val
2876
2877
2878 def try_get(src, getter, expected_type=None):
2879 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
2880
2881
2882 def filter_dict(dct, cndn=lambda _, v: v is not None):
2883 return {k: v for k, v in dct.items() if cndn(k, v)}
2884
2885
2886 def merge_dicts(*dicts):
2887 merged = {}
2888 for a_dict in dicts:
2889 for k, v in a_dict.items():
2890 if (v is not None and k not in merged
2891 or isinstance(v, str) and merged[k] == ''):
2892 merged[k] = v
2893 return merged
2894
2895
2896 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2897 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2898
2899
2900 US_RATINGS = {
2901 'G': 0,
2902 'PG': 10,
2903 'PG-13': 13,
2904 'R': 16,
2905 'NC': 18,
2906 }
2907
2908
2909 TV_PARENTAL_GUIDELINES = {
2910 'TV-Y': 0,
2911 'TV-Y7': 7,
2912 'TV-G': 0,
2913 'TV-PG': 0,
2914 'TV-14': 14,
2915 'TV-MA': 17,
2916 }
2917
2918
2919 def parse_age_limit(s):
2920 # isinstance(False, int) is True. So type() must be used instead
2921 if type(s) is int:
2922 return s if 0 <= s <= 21 else None
2923 elif not isinstance(s, str):
2924 return None
2925 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
2926 if m:
2927 return int(m.group('age'))
2928 s = s.upper()
2929 if s in US_RATINGS:
2930 return US_RATINGS[s]
2931 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
2932 if m:
2933 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
2934 return None
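
# Illustrative examples:
#   parse_age_limit('18+')    # -> 18
#   parse_age_limit('PG-13')  # -> 13
#   parse_age_limit('TV-MA')  # -> 17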
2935
2936
2937 def strip_jsonp(code):
2938 return re.sub(
2939 r'''(?sx)^
2940 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
2941 (?:\s*&&\s*(?P=func_name))?
2942 \s*\(\s*(?P<callback_data>.*)\);?
2943 \s*?(?://[^\n]*)*$''',
2944 r'\g<callback_data>', code)
2945
2946
2947 def js_to_json(code, vars={}):
2948 # vars is a dict of var, val pairs to substitute
2949 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
2950 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
2951 INTEGER_TABLE = (
2952 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
2953 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
2954 )
2955
2956 def fix_kv(m):
2957 v = m.group(0)
2958 if v in ('true', 'false', 'null'):
2959 return v
2960 elif v in ('undefined', 'void 0'):
2961 return 'null'
2962 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
2963 return ""
2964
2965 if v[0] in ("'", '"'):
2966 v = re.sub(r'(?s)\\.|"', lambda m: {
2967 '"': '\\"',
2968 "\\'": "'",
2969 '\\\n': '',
2970 '\\x': '\\u00',
2971 }.get(m.group(0), m.group(0)), v[1:-1])
2972 else:
2973 for regex, base in INTEGER_TABLE:
2974 im = re.match(regex, v)
2975 if im:
2976 i = int(im.group(1), base)
2977 return '"%d":' % i if v.endswith(':') else '%d' % i
2978
2979 if v in vars:
2980 return vars[v]
2981
2982 return '"%s"' % v
2983
2984 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
2985
2986 return re.sub(r'''(?sx)
2987 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2988 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
2989 {comment}|,(?={skip}[\]}}])|
2990 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
2991 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
2992 [0-9]+(?={skip}:)|
2993 !+
2994 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
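
# For example (illustrative):
#   js_to_json("{abc: 1, 'def': undefined}")  # -> '{"abc": 1, "def": null}'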
2995
2996
2997 def qualities(quality_ids):
2998 """ Get a numeric quality value out of a list of possible values """
2999 def q(qid):
3000 try:
3001 return quality_ids.index(qid)
3002 except ValueError:
3003 return -1
3004 return q
3005
3006
3007 POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist')
3008
3009
3010 DEFAULT_OUTTMPL = {
3011 'default': '%(title)s [%(id)s].%(ext)s',
3012 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3013 }
3014 OUTTMPL_TYPES = {
3015 'chapter': None,
3016 'subtitle': None,
3017 'thumbnail': None,
3018 'description': 'description',
3019 'annotation': 'annotations.xml',
3020 'infojson': 'info.json',
3021 'link': None,
3022 'pl_video': None,
3023 'pl_thumbnail': None,
3024 'pl_description': 'description',
3025 'pl_infojson': 'info.json',
3026 }
3027
3028 # As of [1] format syntax is:
3029 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3030 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3031 STR_FORMAT_RE_TMPL = r'''(?x)
3032 (?<!%)(?P<prefix>(?:%%)*)
3033 %
3034 (?P<has_key>\((?P<key>{0})\))?
3035 (?P<format>
3036 (?P<conversion>[#0\-+ ]+)?
3037 (?P<min_width>\d+)?
3038 (?P<precision>\.\d+)?
3039 (?P<len_mod>[hlL])? # unused in python
3040 {1} # conversion type
3041 )
3042 '''
3043
3044
3045 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3046
3047
3048 def limit_length(s, length):
3049 """ Add ellipses to overly long strings """
3050 if s is None:
3051 return None
3052 ELLIPSES = '...'
3053 if len(s) > length:
3054 return s[:length - len(ELLIPSES)] + ELLIPSES
3055 return s
3056
3057
3058 def version_tuple(v):
3059 return tuple(int(e) for e in re.split(r'[-.]', v))
3060
3061
3062 def is_outdated_version(version, limit, assume_new=True):
3063 if not version:
3064 return not assume_new
3065 try:
3066 return version_tuple(version) < version_tuple(limit)
3067 except ValueError:
3068 return not assume_new
3069
3070
3071 def ytdl_is_updateable():
3072 """ Returns whether yt-dlp can be updated with -U """
3073
3074 from .update import is_non_updateable
3075
3076 return not is_non_updateable()
3077
3078
3079 def args_to_str(args):
3080 # Get a short string representation for a subprocess command
3081 return ' '.join(compat_shlex_quote(a) for a in args)
3082
3083
3084 def error_to_compat_str(err):
3085 return str(err)
3086
3087
3088 def error_to_str(err):
3089 return f'{type(err).__name__}: {err}'
3090
3091
3092 def mimetype2ext(mt):
3093 if mt is None:
3094 return None
3095
3096 mt, _, params = mt.partition(';')
3097 mt = mt.strip()
3098
3099 FULL_MAP = {
3100 'audio/mp4': 'm4a',
3101 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3102 # it's the most popular one
3103 'audio/mpeg': 'mp3',
3104 'audio/x-wav': 'wav',
3105 'audio/wav': 'wav',
3106 'audio/wave': 'wav',
3107 }
3108
3109 ext = FULL_MAP.get(mt)
3110 if ext is not None:
3111 return ext
3112
3113 SUBTYPE_MAP = {
3114 '3gpp': '3gp',
3115 'smptett+xml': 'tt',
3116 'ttaf+xml': 'dfxp',
3117 'ttml+xml': 'ttml',
3118 'x-flv': 'flv',
3119 'x-mp4-fragmented': 'mp4',
3120 'x-ms-sami': 'sami',
3121 'x-ms-wmv': 'wmv',
3122 'mpegurl': 'm3u8',
3123 'x-mpegurl': 'm3u8',
3124 'vnd.apple.mpegurl': 'm3u8',
3125 'dash+xml': 'mpd',
3126 'f4m+xml': 'f4m',
3127 'hds+xml': 'f4m',
3128 'vnd.ms-sstr+xml': 'ism',
3129 'quicktime': 'mov',
3130 'mp2t': 'ts',
3131 'x-wav': 'wav',
3132 'filmstrip+json': 'fs',
3133 'svg+xml': 'svg',
3134 }
3135
3136 _, _, subtype = mt.rpartition('/')
3137 ext = SUBTYPE_MAP.get(subtype.lower())
3138 if ext is not None:
3139 return ext
3140
3141 SUFFIX_MAP = {
3142 'json': 'json',
3143 'xml': 'xml',
3144 'zip': 'zip',
3145 'gzip': 'gz',
3146 }
3147
3148 _, _, suffix = subtype.partition('+')
3149 ext = SUFFIX_MAP.get(suffix)
3150 if ext is not None:
3151 return ext
3152
3153 return subtype.replace('+', '.')
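
# Illustrative examples:
#   mimetype2ext('audio/mp4')              # -> 'm4a' (from FULL_MAP)
#   mimetype2ext('application/x-mpegurl')  # -> 'm3u8' (from SUBTYPE_MAP)
#   mimetype2ext('application/dash+xml')   # -> 'mpd'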
3154
3155
3156 def ext2mimetype(ext_or_url):
3157 if not ext_or_url:
3158 return None
3159 if '.' not in ext_or_url:
3160 ext_or_url = f'file.{ext_or_url}'
3161 return mimetypes.guess_type(ext_or_url)[0]
3162
3163
3164 def parse_codecs(codecs_str):
3165 # http://tools.ietf.org/html/rfc6381
3166 if not codecs_str:
3167 return {}
3168 split_codecs = list(filter(None, map(
3169 str.strip, codecs_str.strip().strip(',').split(','))))
3170 vcodec, acodec, tcodec, hdr = None, None, None, None
3171 for full_codec in split_codecs:
3172 parts = full_codec.split('.')
3173 codec = parts[0].replace('0', '')
3174 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3175 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3176 if not vcodec:
3177 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3178 if codec in ('dvh1', 'dvhe'):
3179 hdr = 'DV'
3180 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3181 hdr = 'HDR10'
3182 elif full_codec.replace('0', '').startswith('vp9.2'):
3183 hdr = 'HDR10'
3184 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3185 if not acodec:
3186 acodec = full_codec
3187 elif codec in ('stpp', 'wvtt',):
3188 if not tcodec:
3189 tcodec = full_codec
3190 else:
3191 write_string(f'WARNING: Unknown codec {full_codec}\n')
3192 if vcodec or acodec or tcodec:
3193 return {
3194 'vcodec': vcodec or 'none',
3195 'acodec': acodec or 'none',
3196 'dynamic_range': hdr,
3197 **({'tcodec': tcodec} if tcodec is not None else {}),
3198 }
3199 elif len(split_codecs) == 2:
3200 return {
3201 'vcodec': split_codecs[0],
3202 'acodec': split_codecs[1],
3203 }
3204 return {}
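
# For example (illustrative):
#   parse_codecs('avc1.42E01E, mp4a.40.2')
#   # -> {'vcodec': 'avc1.42E01E', 'acodec': 'mp4a.40.2', 'dynamic_range': None}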
3205
3206
3207 def urlhandle_detect_ext(url_handle):
3208 getheader = url_handle.headers.get
3209
3210 cd = getheader('Content-Disposition')
3211 if cd:
3212 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3213 if m:
3214 e = determine_ext(m.group('filename'), default_ext=None)
3215 if e:
3216 return e
3217
3218 return mimetype2ext(getheader('Content-Type'))
3219
3220
3221 def encode_data_uri(data, mime_type):
3222 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3223
3224
3225 def age_restricted(content_limit, age_limit):
3226 """ Returns True iff the content should be blocked """
3227
3228 if age_limit is None: # No limit set
3229 return False
3230 if content_limit is None:
3231 return False # Content available for everyone
3232 return age_limit < content_limit
3233
3234
3235 def is_html(first_bytes):
3236 """ Detect whether a file contains HTML by examining its first bytes. """
3237
3238 BOMS = [
3239 (b'\xef\xbb\xbf', 'utf-8'),
3240 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3241 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3242 (b'\xff\xfe', 'utf-16-le'),
3243 (b'\xfe\xff', 'utf-16-be'),
3244 ]
3245 for bom, enc in BOMS:
3246 if first_bytes.startswith(bom):
3247 s = first_bytes[len(bom):].decode(enc, 'replace')
3248 break
3249 else:
3250 s = first_bytes.decode('utf-8', 'replace')
3251
3252 return re.match(r'^\s*<', s)
3253
3254
3255 def determine_protocol(info_dict):
3256 protocol = info_dict.get('protocol')
3257 if protocol is not None:
3258 return protocol
3259
3260 url = sanitize_url(info_dict['url'])
3261 if url.startswith('rtmp'):
3262 return 'rtmp'
3263 elif url.startswith('mms'):
3264 return 'mms'
3265 elif url.startswith('rtsp'):
3266 return 'rtsp'
3267
3268 ext = determine_ext(url)
3269 if ext == 'm3u8':
3270 return 'm3u8'
3271 elif ext == 'f4m':
3272 return 'f4m'
3273
3274 return compat_urllib_parse_urlparse(url).scheme
3275
3276
3277 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3278 """ Render a list of rows, each as a list of values.
3279 Text after a \t will be right aligned """
3280 def width(string):
3281 return len(remove_terminal_sequences(string).replace('\t', ''))
3282
3283 def get_max_lens(table):
3284 return [max(width(str(v)) for v in col) for col in zip(*table)]
3285
3286 def filter_using_list(row, filterArray):
3287 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3288
3289 max_lens = get_max_lens(data) if hide_empty else []
3290 header_row = filter_using_list(header_row, max_lens)
3291 data = [filter_using_list(row, max_lens) for row in data]
3292
3293 table = [header_row] + data
3294 max_lens = get_max_lens(table)
3295 extra_gap += 1
3296 if delim:
3297 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3298 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3299 for row in table:
3300 for pos, text in enumerate(map(str, row)):
3301 if '\t' in text:
3302 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3303 else:
3304 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3305 ret = '\n'.join(''.join(row).rstrip() for row in table)
3306 return ret
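
# For example (illustrative):
#   render_table(['id', 'name'], [['1', 'foo'], ['22', 'bar']])
#   # -> 'id name\n1  foo\n22 bar'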
3307
3308
3309 def _match_one(filter_part, dct, incomplete):
3310 # TODO: Generalize code with YoutubeDL._build_format_filter
3311 STRING_OPERATORS = {
3312 '*=': operator.contains,
3313 '^=': lambda attr, value: attr.startswith(value),
3314 '$=': lambda attr, value: attr.endswith(value),
3315 '~=': lambda attr, value: re.search(value, attr),
3316 }
3317 COMPARISON_OPERATORS = {
3318 **STRING_OPERATORS,
3319 '<=': operator.le, # "<=" must be defined above "<"
3320 '<': operator.lt,
3321 '>=': operator.ge,
3322 '>': operator.gt,
3323 '=': operator.eq,
3324 }
3325
3326 if isinstance(incomplete, bool):
3327 is_incomplete = lambda _: incomplete
3328 else:
3329 is_incomplete = lambda k: k in incomplete
3330
3331 operator_rex = re.compile(r'''(?x)\s*
3332 (?P<key>[a-z_]+)
3333 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3334 (?:
3335 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3336 (?P<strval>.+?)
3337 )
3338 \s*$
3339 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3340 m = operator_rex.search(filter_part)
3341 if m:
3342 m = m.groupdict()
3343 unnegated_op = COMPARISON_OPERATORS[m['op']]
3344 if m['negation']:
3345 op = lambda attr, value: not unnegated_op(attr, value)
3346 else:
3347 op = unnegated_op
3348 comparison_value = m['quotedstrval'] or m['strval']  # the pattern above defines no 'intval' group
3349 if m['quote']:
3350 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3351 actual_value = dct.get(m['key'])
3352 numeric_comparison = None
3353 if isinstance(actual_value, (int, float)):
3354 # If the original field is a string and the matching comparison value is
3355 # a number, we should respect the origin of the original field
3356 # and process the comparison value as a string (see
3357 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3358 try:
3359 numeric_comparison = int(comparison_value)
3360 except ValueError:
3361 numeric_comparison = parse_filesize(comparison_value)
3362 if numeric_comparison is None:
3363 numeric_comparison = parse_filesize(f'{comparison_value}B')
3364 if numeric_comparison is None:
3365 numeric_comparison = parse_duration(comparison_value)
3366 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3367 raise ValueError('Operator %s only supports string values!' % m['op'])
3368 if actual_value is None:
3369 return is_incomplete(m['key']) or m['none_inclusive']
3370 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3371
3372 UNARY_OPERATORS = {
3373 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3374 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3375 }
3376 operator_rex = re.compile(r'''(?x)\s*
3377 (?P<op>%s)\s*(?P<key>[a-z_]+)
3378 \s*$
3379 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3380 m = operator_rex.search(filter_part)
3381 if m:
3382 op = UNARY_OPERATORS[m.group('op')]
3383 actual_value = dct.get(m.group('key'))
3384 if is_incomplete(m.group('key')) and actual_value is None:
3385 return True
3386 return op(actual_value)
3387
3388 raise ValueError('Invalid filter part %r' % filter_part)
3389
3390
3391 def match_str(filter_str, dct, incomplete=False):
3392 """ Filter a dictionary with a simple string syntax.
3393 @returns Whether the filter passes
3394 @param incomplete Set of keys that is expected to be missing from dct.
3395 Can be True/False to indicate all/none of the keys may be missing.
3396 All conditions on incomplete keys pass if the key is missing
3397 """
3398 return all(
3399 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3400 for filter_part in re.split(r'(?<!\\)&', filter_str))
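
# Illustrative examples ('&' separates conditions that must all hold):
#   match_str('like_count > 100 & title~=yt-dlp',
#             {'like_count': 190, 'title': 'about yt-dlp'})  # -> True
#   match_str('duration < 30', {'duration': 30})             # -> False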
3401
3402
3403 def match_filter_func(filters):
3404 if not filters:
3405 return None
3406 filters = variadic(filters)
3407
3408 def _match_func(info_dict, *args, **kwargs):
3409 if any(match_str(f, info_dict, *args, **kwargs) for f in filters):
3410 return None
3411 else:
3412 video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3413 filter_str = ') | ('.join(map(str.strip, filters))
3414 return f'{video_title} does not pass filter ({filter_str}), skipping ..'
3415 return _match_func
3416
3417
3418 def parse_dfxp_time_expr(time_expr):
3419 if not time_expr:
3420 return
3421
3422 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3423 if mobj:
3424 return float(mobj.group('time_offset'))
3425
3426 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3427 if mobj:
3428 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3429
3430
3431 def srt_subtitles_timecode(seconds):
3432 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3433
3434
3435 def ass_subtitles_timecode(seconds):
3436 time = timetuple_from_msec(seconds * 1000)
3437 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3438
3439
3440 def dfxp2srt(dfxp_data):
3441 '''
3442 @param dfxp_data A bytes-like object containing DFXP data
3443 @returns A unicode object containing converted SRT data
3444 '''
3445 LEGACY_NAMESPACES = (
3446 (b'http://www.w3.org/ns/ttml', [
3447 b'http://www.w3.org/2004/11/ttaf1',
3448 b'http://www.w3.org/2006/04/ttaf1',
3449 b'http://www.w3.org/2006/10/ttaf1',
3450 ]),
3451 (b'http://www.w3.org/ns/ttml#styling', [
3452 b'http://www.w3.org/ns/ttml#style',
3453 ]),
3454 )
3455
3456 SUPPORTED_STYLING = [
3457 'color',
3458 'fontFamily',
3459 'fontSize',
3460 'fontStyle',
3461 'fontWeight',
3462 'textDecoration'
3463 ]
3464
3465 _x = functools.partial(xpath_with_ns, ns_map={
3466 'xml': 'http://www.w3.org/XML/1998/namespace',
3467 'ttml': 'http://www.w3.org/ns/ttml',
3468 'tts': 'http://www.w3.org/ns/ttml#styling',
3469 })
3470
3471 styles = {}
3472 default_style = {}
3473
3474 class TTMLPElementParser:
3475 _out = ''
3476 _unclosed_elements = []
3477 _applied_styles = []
3478
3479 def start(self, tag, attrib):
3480 if tag in (_x('ttml:br'), 'br'):
3481 self._out += '\n'
3482 else:
3483 unclosed_elements = []
3484 style = {}
3485 element_style_id = attrib.get('style')
3486 if default_style:
3487 style.update(default_style)
3488 if element_style_id:
3489 style.update(styles.get(element_style_id, {}))
3490 for prop in SUPPORTED_STYLING:
3491 prop_val = attrib.get(_x('tts:' + prop))
3492 if prop_val:
3493 style[prop] = prop_val
3494 if style:
3495 font = ''
3496 for k, v in sorted(style.items()):
3497 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3498 continue
3499 if k == 'color':
3500 font += ' color="%s"' % v
3501 elif k == 'fontSize':
3502 font += ' size="%s"' % v
3503 elif k == 'fontFamily':
3504 font += ' face="%s"' % v
3505 elif k == 'fontWeight' and v == 'bold':
3506 self._out += '<b>'
3507 unclosed_elements.append('b')
3508 elif k == 'fontStyle' and v == 'italic':
3509 self._out += '<i>'
3510 unclosed_elements.append('i')
3511 elif k == 'textDecoration' and v == 'underline':
3512 self._out += '<u>'
3513 unclosed_elements.append('u')
3514 if font:
3515 self._out += '<font' + font + '>'
3516 unclosed_elements.append('font')
3517 applied_style = {}
3518 if self._applied_styles:
3519 applied_style.update(self._applied_styles[-1])
3520 applied_style.update(style)
3521 self._applied_styles.append(applied_style)
3522 self._unclosed_elements.append(unclosed_elements)
3523
3524 def end(self, tag):
3525 if tag not in (_x('ttml:br'), 'br'):
3526 unclosed_elements = self._unclosed_elements.pop()
3527 for element in reversed(unclosed_elements):
3528 self._out += '</%s>' % element
3529 if unclosed_elements and self._applied_styles:
3530 self._applied_styles.pop()
3531
3532 def data(self, data):
3533 self._out += data
3534
3535 def close(self):
3536 return self._out.strip()
3537
3538 def parse_node(node):
3539 target = TTMLPElementParser()
3540 parser = xml.etree.ElementTree.XMLParser(target=target)
3541 parser.feed(xml.etree.ElementTree.tostring(node))
3542 return parser.close()
3543
3544 for k, v in LEGACY_NAMESPACES:
3545 for ns in v:
3546 dfxp_data = dfxp_data.replace(ns, k)
3547
3548 dfxp = compat_etree_fromstring(dfxp_data)
3549 out = []
3550 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3551
3552 if not paras:
3553 raise ValueError('Invalid dfxp/TTML subtitle')
3554
3555 repeat = False
3556 while True:
3557 for style in dfxp.findall(_x('.//ttml:style')):
3558 style_id = style.get('id') or style.get(_x('xml:id'))
3559 if not style_id:
3560 continue
3561 parent_style_id = style.get('style')
3562 if parent_style_id:
3563 if parent_style_id not in styles:
3564 repeat = True
3565 continue
3566 styles[style_id] = styles[parent_style_id].copy()
3567 for prop in SUPPORTED_STYLING:
3568 prop_val = style.get(_x('tts:' + prop))
3569 if prop_val:
3570 styles.setdefault(style_id, {})[prop] = prop_val
3571 if repeat:
3572 repeat = False
3573 else:
3574 break
3575
3576 for p in ('body', 'div'):
3577 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3578 if ele is None:
3579 continue
3580 style = styles.get(ele.get('style'))
3581 if not style:
3582 continue
3583 default_style.update(style)
3584
3585 for para, index in zip(paras, itertools.count(1)):
3586 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3587 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3588 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3589 if begin_time is None:
3590 continue
3591 if not end_time:
3592 if not dur:
3593 continue
3594 end_time = begin_time + dur
3595 out.append('%d\n%s --> %s\n%s\n\n' % (
3596 index,
3597 srt_subtitles_timecode(begin_time),
3598 srt_subtitles_timecode(end_time),
3599 parse_node(para)))
3600
3601 return ''.join(out)
3602
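# Illustrative sketch (not from the original source): feeding a minimal TTML
# document through the enclosing DFXP/TTML-to-SRT converter (dfxp2srt). The
# input bytes are a made-up example.
#   >>> dfxp2srt(b'<tt xmlns="http://www.w3.org/ns/ttml"><body>'
#   ...          b'<p begin="0s" end="1.5s">Hello</p></body></tt>')
#   '1\n00:00:00,000 --> 00:00:01,500\nHello\n\n'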
3603
3604 def cli_option(params, command_option, param):
3605 param = params.get(param)
3606 if param is not None: # stringify any non-None value, so e.g. 0 is not passed through as an int
3607 param = compat_str(param)
3608 return [command_option, param] if param is not None else []
3609
3610
3611 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3612 param = params.get(param)
3613 if param is None:
3614 return []
3615 assert isinstance(param, bool)
3616 if separator:
3617 return [command_option + separator + (true_value if param else false_value)]
3618 return [command_option, true_value if param else false_value]
3619
3620
3621 def cli_valueless_option(params, command_option, param, expected_value=True):
3622 param = params.get(param)
3623 return [command_option] if param == expected_value else []
3624
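# Illustrative sketches (hypothetical params dicts) of the three cli_* helpers:
#   >>> cli_option({'proxy': 'http://127.0.0.1:3128'}, '--proxy', 'proxy')
#   ['--proxy', 'http://127.0.0.1:3128']
#   >>> cli_bool_option({'nocheckcertificate': True}, '--check-certificate', 'nocheckcertificate', 'false', 'true')
#   ['--check-certificate', 'false']
#   >>> cli_valueless_option({'downloadarchive': True}, '--no-archive', 'downloadarchive', False)
#   []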
3625
3626 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3627 if isinstance(argdict, (list, tuple)): # for backward compatibility
3628 if use_compat:
3629 return argdict
3630 else:
3631 argdict = None
3632 if argdict is None:
3633 return default
3634 assert isinstance(argdict, dict)
3635
3636 assert isinstance(keys, (list, tuple))
3637 for key_list in keys:
3638 arg_list = list(filter(
3639 lambda x: x is not None,
3640 [argdict.get(key.lower()) for key in variadic(key_list)]))
3641 if arg_list:
3642 return [arg for args in arg_list for arg in args]
3643 return default
3644
3645
3646 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3647 main_key, exe = main_key.lower(), exe.lower()
3648 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3649 keys = [f'{root_key}{k}' for k in (keys or [''])]
3650 if root_key in keys:
3651 if main_key != exe:
3652 keys.append((main_key, exe))
3653 keys.append('default')
3654 else:
3655 use_compat = False
3656 return cli_configuration_args(argdict, keys, default, use_compat)
3657
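# Illustrative sketch (hypothetical argdict) of how the key list is resolved:
#   >>> _configuration_args('downloader', {'default': ['-v']}, 'aria2c')
#   ['-v']
# Here root_key is 'downloader+aria2c'; since neither it nor the
# ('downloader', 'aria2c') pair is present in argdict, the 'default' entry wins.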
3658
3659 class ISO639Utils:
3660 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3661 _lang_map = {
3662 'aa': 'aar',
3663 'ab': 'abk',
3664 'ae': 'ave',
3665 'af': 'afr',
3666 'ak': 'aka',
3667 'am': 'amh',
3668 'an': 'arg',
3669 'ar': 'ara',
3670 'as': 'asm',
3671 'av': 'ava',
3672 'ay': 'aym',
3673 'az': 'aze',
3674 'ba': 'bak',
3675 'be': 'bel',
3676 'bg': 'bul',
3677 'bh': 'bih',
3678 'bi': 'bis',
3679 'bm': 'bam',
3680 'bn': 'ben',
3681 'bo': 'bod',
3682 'br': 'bre',
3683 'bs': 'bos',
3684 'ca': 'cat',
3685 'ce': 'che',
3686 'ch': 'cha',
3687 'co': 'cos',
3688 'cr': 'cre',
3689 'cs': 'ces',
3690 'cu': 'chu',
3691 'cv': 'chv',
3692 'cy': 'cym',
3693 'da': 'dan',
3694 'de': 'deu',
3695 'dv': 'div',
3696 'dz': 'dzo',
3697 'ee': 'ewe',
3698 'el': 'ell',
3699 'en': 'eng',
3700 'eo': 'epo',
3701 'es': 'spa',
3702 'et': 'est',
3703 'eu': 'eus',
3704 'fa': 'fas',
3705 'ff': 'ful',
3706 'fi': 'fin',
3707 'fj': 'fij',
3708 'fo': 'fao',
3709 'fr': 'fra',
3710 'fy': 'fry',
3711 'ga': 'gle',
3712 'gd': 'gla',
3713 'gl': 'glg',
3714 'gn': 'grn',
3715 'gu': 'guj',
3716 'gv': 'glv',
3717 'ha': 'hau',
3718 'he': 'heb',
3719 'iw': 'heb', # Replaced by he in 1989 revision
3720 'hi': 'hin',
3721 'ho': 'hmo',
3722 'hr': 'hrv',
3723 'ht': 'hat',
3724 'hu': 'hun',
3725 'hy': 'hye',
3726 'hz': 'her',
3727 'ia': 'ina',
3728 'id': 'ind',
3729 'in': 'ind', # Replaced by id in 1989 revision
3730 'ie': 'ile',
3731 'ig': 'ibo',
3732 'ii': 'iii',
3733 'ik': 'ipk',
3734 'io': 'ido',
3735 'is': 'isl',
3736 'it': 'ita',
3737 'iu': 'iku',
3738 'ja': 'jpn',
3739 'jv': 'jav',
3740 'ka': 'kat',
3741 'kg': 'kon',
3742 'ki': 'kik',
3743 'kj': 'kua',
3744 'kk': 'kaz',
3745 'kl': 'kal',
3746 'km': 'khm',
3747 'kn': 'kan',
3748 'ko': 'kor',
3749 'kr': 'kau',
3750 'ks': 'kas',
3751 'ku': 'kur',
3752 'kv': 'kom',
3753 'kw': 'cor',
3754 'ky': 'kir',
3755 'la': 'lat',
3756 'lb': 'ltz',
3757 'lg': 'lug',
3758 'li': 'lim',
3759 'ln': 'lin',
3760 'lo': 'lao',
3761 'lt': 'lit',
3762 'lu': 'lub',
3763 'lv': 'lav',
3764 'mg': 'mlg',
3765 'mh': 'mah',
3766 'mi': 'mri',
3767 'mk': 'mkd',
3768 'ml': 'mal',
3769 'mn': 'mon',
3770 'mr': 'mar',
3771 'ms': 'msa',
3772 'mt': 'mlt',
3773 'my': 'mya',
3774 'na': 'nau',
3775 'nb': 'nob',
3776 'nd': 'nde',
3777 'ne': 'nep',
3778 'ng': 'ndo',
3779 'nl': 'nld',
3780 'nn': 'nno',
3781 'no': 'nor',
3782 'nr': 'nbl',
3783 'nv': 'nav',
3784 'ny': 'nya',
3785 'oc': 'oci',
3786 'oj': 'oji',
3787 'om': 'orm',
3788 'or': 'ori',
3789 'os': 'oss',
3790 'pa': 'pan',
3791 'pi': 'pli',
3792 'pl': 'pol',
3793 'ps': 'pus',
3794 'pt': 'por',
3795 'qu': 'que',
3796 'rm': 'roh',
3797 'rn': 'run',
3798 'ro': 'ron',
3799 'ru': 'rus',
3800 'rw': 'kin',
3801 'sa': 'san',
3802 'sc': 'srd',
3803 'sd': 'snd',
3804 'se': 'sme',
3805 'sg': 'sag',
3806 'si': 'sin',
3807 'sk': 'slk',
3808 'sl': 'slv',
3809 'sm': 'smo',
3810 'sn': 'sna',
3811 'so': 'som',
3812 'sq': 'sqi',
3813 'sr': 'srp',
3814 'ss': 'ssw',
3815 'st': 'sot',
3816 'su': 'sun',
3817 'sv': 'swe',
3818 'sw': 'swa',
3819 'ta': 'tam',
3820 'te': 'tel',
3821 'tg': 'tgk',
3822 'th': 'tha',
3823 'ti': 'tir',
3824 'tk': 'tuk',
3825 'tl': 'tgl',
3826 'tn': 'tsn',
3827 'to': 'ton',
3828 'tr': 'tur',
3829 'ts': 'tso',
3830 'tt': 'tat',
3831 'tw': 'twi',
3832 'ty': 'tah',
3833 'ug': 'uig',
3834 'uk': 'ukr',
3835 'ur': 'urd',
3836 'uz': 'uzb',
3837 've': 'ven',
3838 'vi': 'vie',
3839 'vo': 'vol',
3840 'wa': 'wln',
3841 'wo': 'wol',
3842 'xh': 'xho',
3843 'yi': 'yid',
3844 'ji': 'yid', # Replaced by yi in 1989 revision
3845 'yo': 'yor',
3846 'za': 'zha',
3847 'zh': 'zho',
3848 'zu': 'zul',
3849 }
3850
3851 @classmethod
3852 def short2long(cls, code):
3853 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3854 return cls._lang_map.get(code[:2])
3855
3856 @classmethod
3857 def long2short(cls, code):
3858 """Convert language code from ISO 639-2/T to ISO 639-1"""
3859 for short_name, long_name in cls._lang_map.items():
3860 if long_name == code:
3861 return short_name
3862
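# Illustrative round trip:
#   >>> ISO639Utils.short2long('en')
#   'eng'
#   >>> ISO639Utils.long2short('deu')
#   'de'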
3863
3864 class ISO3166Utils:
3865 # From http://data.okfn.org/data/core/country-list
3866 _country_map = {
3867 'AF': 'Afghanistan',
3868 'AX': 'Åland Islands',
3869 'AL': 'Albania',
3870 'DZ': 'Algeria',
3871 'AS': 'American Samoa',
3872 'AD': 'Andorra',
3873 'AO': 'Angola',
3874 'AI': 'Anguilla',
3875 'AQ': 'Antarctica',
3876 'AG': 'Antigua and Barbuda',
3877 'AR': 'Argentina',
3878 'AM': 'Armenia',
3879 'AW': 'Aruba',
3880 'AU': 'Australia',
3881 'AT': 'Austria',
3882 'AZ': 'Azerbaijan',
3883 'BS': 'Bahamas',
3884 'BH': 'Bahrain',
3885 'BD': 'Bangladesh',
3886 'BB': 'Barbados',
3887 'BY': 'Belarus',
3888 'BE': 'Belgium',
3889 'BZ': 'Belize',
3890 'BJ': 'Benin',
3891 'BM': 'Bermuda',
3892 'BT': 'Bhutan',
3893 'BO': 'Bolivia, Plurinational State of',
3894 'BQ': 'Bonaire, Sint Eustatius and Saba',
3895 'BA': 'Bosnia and Herzegovina',
3896 'BW': 'Botswana',
3897 'BV': 'Bouvet Island',
3898 'BR': 'Brazil',
3899 'IO': 'British Indian Ocean Territory',
3900 'BN': 'Brunei Darussalam',
3901 'BG': 'Bulgaria',
3902 'BF': 'Burkina Faso',
3903 'BI': 'Burundi',
3904 'KH': 'Cambodia',
3905 'CM': 'Cameroon',
3906 'CA': 'Canada',
3907 'CV': 'Cape Verde',
3908 'KY': 'Cayman Islands',
3909 'CF': 'Central African Republic',
3910 'TD': 'Chad',
3911 'CL': 'Chile',
3912 'CN': 'China',
3913 'CX': 'Christmas Island',
3914 'CC': 'Cocos (Keeling) Islands',
3915 'CO': 'Colombia',
3916 'KM': 'Comoros',
3917 'CG': 'Congo',
3918 'CD': 'Congo, the Democratic Republic of the',
3919 'CK': 'Cook Islands',
3920 'CR': 'Costa Rica',
3921 'CI': 'Côte d\'Ivoire',
3922 'HR': 'Croatia',
3923 'CU': 'Cuba',
3924 'CW': 'Curaçao',
3925 'CY': 'Cyprus',
3926 'CZ': 'Czech Republic',
3927 'DK': 'Denmark',
3928 'DJ': 'Djibouti',
3929 'DM': 'Dominica',
3930 'DO': 'Dominican Republic',
3931 'EC': 'Ecuador',
3932 'EG': 'Egypt',
3933 'SV': 'El Salvador',
3934 'GQ': 'Equatorial Guinea',
3935 'ER': 'Eritrea',
3936 'EE': 'Estonia',
3937 'ET': 'Ethiopia',
3938 'FK': 'Falkland Islands (Malvinas)',
3939 'FO': 'Faroe Islands',
3940 'FJ': 'Fiji',
3941 'FI': 'Finland',
3942 'FR': 'France',
3943 'GF': 'French Guiana',
3944 'PF': 'French Polynesia',
3945 'TF': 'French Southern Territories',
3946 'GA': 'Gabon',
3947 'GM': 'Gambia',
3948 'GE': 'Georgia',
3949 'DE': 'Germany',
3950 'GH': 'Ghana',
3951 'GI': 'Gibraltar',
3952 'GR': 'Greece',
3953 'GL': 'Greenland',
3954 'GD': 'Grenada',
3955 'GP': 'Guadeloupe',
3956 'GU': 'Guam',
3957 'GT': 'Guatemala',
3958 'GG': 'Guernsey',
3959 'GN': 'Guinea',
3960 'GW': 'Guinea-Bissau',
3961 'GY': 'Guyana',
3962 'HT': 'Haiti',
3963 'HM': 'Heard Island and McDonald Islands',
3964 'VA': 'Holy See (Vatican City State)',
3965 'HN': 'Honduras',
3966 'HK': 'Hong Kong',
3967 'HU': 'Hungary',
3968 'IS': 'Iceland',
3969 'IN': 'India',
3970 'ID': 'Indonesia',
3971 'IR': 'Iran, Islamic Republic of',
3972 'IQ': 'Iraq',
3973 'IE': 'Ireland',
3974 'IM': 'Isle of Man',
3975 'IL': 'Israel',
3976 'IT': 'Italy',
3977 'JM': 'Jamaica',
3978 'JP': 'Japan',
3979 'JE': 'Jersey',
3980 'JO': 'Jordan',
3981 'KZ': 'Kazakhstan',
3982 'KE': 'Kenya',
3983 'KI': 'Kiribati',
3984 'KP': 'Korea, Democratic People\'s Republic of',
3985 'KR': 'Korea, Republic of',
3986 'KW': 'Kuwait',
3987 'KG': 'Kyrgyzstan',
3988 'LA': 'Lao People\'s Democratic Republic',
3989 'LV': 'Latvia',
3990 'LB': 'Lebanon',
3991 'LS': 'Lesotho',
3992 'LR': 'Liberia',
3993 'LY': 'Libya',
3994 'LI': 'Liechtenstein',
3995 'LT': 'Lithuania',
3996 'LU': 'Luxembourg',
3997 'MO': 'Macao',
3998 'MK': 'Macedonia, the Former Yugoslav Republic of',
3999 'MG': 'Madagascar',
4000 'MW': 'Malawi',
4001 'MY': 'Malaysia',
4002 'MV': 'Maldives',
4003 'ML': 'Mali',
4004 'MT': 'Malta',
4005 'MH': 'Marshall Islands',
4006 'MQ': 'Martinique',
4007 'MR': 'Mauritania',
4008 'MU': 'Mauritius',
4009 'YT': 'Mayotte',
4010 'MX': 'Mexico',
4011 'FM': 'Micronesia, Federated States of',
4012 'MD': 'Moldova, Republic of',
4013 'MC': 'Monaco',
4014 'MN': 'Mongolia',
4015 'ME': 'Montenegro',
4016 'MS': 'Montserrat',
4017 'MA': 'Morocco',
4018 'MZ': 'Mozambique',
4019 'MM': 'Myanmar',
4020 'NA': 'Namibia',
4021 'NR': 'Nauru',
4022 'NP': 'Nepal',
4023 'NL': 'Netherlands',
4024 'NC': 'New Caledonia',
4025 'NZ': 'New Zealand',
4026 'NI': 'Nicaragua',
4027 'NE': 'Niger',
4028 'NG': 'Nigeria',
4029 'NU': 'Niue',
4030 'NF': 'Norfolk Island',
4031 'MP': 'Northern Mariana Islands',
4032 'NO': 'Norway',
4033 'OM': 'Oman',
4034 'PK': 'Pakistan',
4035 'PW': 'Palau',
4036 'PS': 'Palestine, State of',
4037 'PA': 'Panama',
4038 'PG': 'Papua New Guinea',
4039 'PY': 'Paraguay',
4040 'PE': 'Peru',
4041 'PH': 'Philippines',
4042 'PN': 'Pitcairn',
4043 'PL': 'Poland',
4044 'PT': 'Portugal',
4045 'PR': 'Puerto Rico',
4046 'QA': 'Qatar',
4047 'RE': 'Réunion',
4048 'RO': 'Romania',
4049 'RU': 'Russian Federation',
4050 'RW': 'Rwanda',
4051 'BL': 'Saint Barthélemy',
4052 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4053 'KN': 'Saint Kitts and Nevis',
4054 'LC': 'Saint Lucia',
4055 'MF': 'Saint Martin (French part)',
4056 'PM': 'Saint Pierre and Miquelon',
4057 'VC': 'Saint Vincent and the Grenadines',
4058 'WS': 'Samoa',
4059 'SM': 'San Marino',
4060 'ST': 'Sao Tome and Principe',
4061 'SA': 'Saudi Arabia',
4062 'SN': 'Senegal',
4063 'RS': 'Serbia',
4064 'SC': 'Seychelles',
4065 'SL': 'Sierra Leone',
4066 'SG': 'Singapore',
4067 'SX': 'Sint Maarten (Dutch part)',
4068 'SK': 'Slovakia',
4069 'SI': 'Slovenia',
4070 'SB': 'Solomon Islands',
4071 'SO': 'Somalia',
4072 'ZA': 'South Africa',
4073 'GS': 'South Georgia and the South Sandwich Islands',
4074 'SS': 'South Sudan',
4075 'ES': 'Spain',
4076 'LK': 'Sri Lanka',
4077 'SD': 'Sudan',
4078 'SR': 'Suriname',
4079 'SJ': 'Svalbard and Jan Mayen',
4080 'SZ': 'Swaziland',
4081 'SE': 'Sweden',
4082 'CH': 'Switzerland',
4083 'SY': 'Syrian Arab Republic',
4084 'TW': 'Taiwan, Province of China',
4085 'TJ': 'Tajikistan',
4086 'TZ': 'Tanzania, United Republic of',
4087 'TH': 'Thailand',
4088 'TL': 'Timor-Leste',
4089 'TG': 'Togo',
4090 'TK': 'Tokelau',
4091 'TO': 'Tonga',
4092 'TT': 'Trinidad and Tobago',
4093 'TN': 'Tunisia',
4094 'TR': 'Turkey',
4095 'TM': 'Turkmenistan',
4096 'TC': 'Turks and Caicos Islands',
4097 'TV': 'Tuvalu',
4098 'UG': 'Uganda',
4099 'UA': 'Ukraine',
4100 'AE': 'United Arab Emirates',
4101 'GB': 'United Kingdom',
4102 'US': 'United States',
4103 'UM': 'United States Minor Outlying Islands',
4104 'UY': 'Uruguay',
4105 'UZ': 'Uzbekistan',
4106 'VU': 'Vanuatu',
4107 'VE': 'Venezuela, Bolivarian Republic of',
4108 'VN': 'Viet Nam',
4109 'VG': 'Virgin Islands, British',
4110 'VI': 'Virgin Islands, U.S.',
4111 'WF': 'Wallis and Futuna',
4112 'EH': 'Western Sahara',
4113 'YE': 'Yemen',
4114 'ZM': 'Zambia',
4115 'ZW': 'Zimbabwe',
4116 }
4117
4118 @classmethod
4119 def short2full(cls, code):
4120 """Convert an ISO 3166-2 country code to the corresponding full name"""
4121 return cls._country_map.get(code.upper())
4122
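# Illustrative lookup (the code is upper-cased before the lookup):
#   >>> ISO3166Utils.short2full('de')
#   'Germany'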
4123
4124 class GeoUtils:
4125 # Major IPv4 address blocks per country
4126 _country_ip_map = {
4127 'AD': '46.172.224.0/19',
4128 'AE': '94.200.0.0/13',
4129 'AF': '149.54.0.0/17',
4130 'AG': '209.59.64.0/18',
4131 'AI': '204.14.248.0/21',
4132 'AL': '46.99.0.0/16',
4133 'AM': '46.70.0.0/15',
4134 'AO': '105.168.0.0/13',
4135 'AP': '182.50.184.0/21',
4136 'AQ': '23.154.160.0/24',
4137 'AR': '181.0.0.0/12',
4138 'AS': '202.70.112.0/20',
4139 'AT': '77.116.0.0/14',
4140 'AU': '1.128.0.0/11',
4141 'AW': '181.41.0.0/18',
4142 'AX': '185.217.4.0/22',
4143 'AZ': '5.197.0.0/16',
4144 'BA': '31.176.128.0/17',
4145 'BB': '65.48.128.0/17',
4146 'BD': '114.130.0.0/16',
4147 'BE': '57.0.0.0/8',
4148 'BF': '102.178.0.0/15',
4149 'BG': '95.42.0.0/15',
4150 'BH': '37.131.0.0/17',
4151 'BI': '154.117.192.0/18',
4152 'BJ': '137.255.0.0/16',
4153 'BL': '185.212.72.0/23',
4154 'BM': '196.12.64.0/18',
4155 'BN': '156.31.0.0/16',
4156 'BO': '161.56.0.0/16',
4157 'BQ': '161.0.80.0/20',
4158 'BR': '191.128.0.0/12',
4159 'BS': '24.51.64.0/18',
4160 'BT': '119.2.96.0/19',
4161 'BW': '168.167.0.0/16',
4162 'BY': '178.120.0.0/13',
4163 'BZ': '179.42.192.0/18',
4164 'CA': '99.224.0.0/11',
4165 'CD': '41.243.0.0/16',
4166 'CF': '197.242.176.0/21',
4167 'CG': '160.113.0.0/16',
4168 'CH': '85.0.0.0/13',
4169 'CI': '102.136.0.0/14',
4170 'CK': '202.65.32.0/19',
4171 'CL': '152.172.0.0/14',
4172 'CM': '102.244.0.0/14',
4173 'CN': '36.128.0.0/10',
4174 'CO': '181.240.0.0/12',
4175 'CR': '201.192.0.0/12',
4176 'CU': '152.206.0.0/15',
4177 'CV': '165.90.96.0/19',
4178 'CW': '190.88.128.0/17',
4179 'CY': '31.153.0.0/16',
4180 'CZ': '88.100.0.0/14',
4181 'DE': '53.0.0.0/8',
4182 'DJ': '197.241.0.0/17',
4183 'DK': '87.48.0.0/12',
4184 'DM': '192.243.48.0/20',
4185 'DO': '152.166.0.0/15',
4186 'DZ': '41.96.0.0/12',
4187 'EC': '186.68.0.0/15',
4188 'EE': '90.190.0.0/15',
4189 'EG': '156.160.0.0/11',
4190 'ER': '196.200.96.0/20',
4191 'ES': '88.0.0.0/11',
4192 'ET': '196.188.0.0/14',
4193 'EU': '2.16.0.0/13',
4194 'FI': '91.152.0.0/13',
4195 'FJ': '144.120.0.0/16',
4196 'FK': '80.73.208.0/21',
4197 'FM': '119.252.112.0/20',
4198 'FO': '88.85.32.0/19',
4199 'FR': '90.0.0.0/9',
4200 'GA': '41.158.0.0/15',
4201 'GB': '25.0.0.0/8',
4202 'GD': '74.122.88.0/21',
4203 'GE': '31.146.0.0/16',
4204 'GF': '161.22.64.0/18',
4205 'GG': '62.68.160.0/19',
4206 'GH': '154.160.0.0/12',
4207 'GI': '95.164.0.0/16',
4208 'GL': '88.83.0.0/19',
4209 'GM': '160.182.0.0/15',
4210 'GN': '197.149.192.0/18',
4211 'GP': '104.250.0.0/19',
4212 'GQ': '105.235.224.0/20',
4213 'GR': '94.64.0.0/13',
4214 'GT': '168.234.0.0/16',
4215 'GU': '168.123.0.0/16',
4216 'GW': '197.214.80.0/20',
4217 'GY': '181.41.64.0/18',
4218 'HK': '113.252.0.0/14',
4219 'HN': '181.210.0.0/16',
4220 'HR': '93.136.0.0/13',
4221 'HT': '148.102.128.0/17',
4222 'HU': '84.0.0.0/14',
4223 'ID': '39.192.0.0/10',
4224 'IE': '87.32.0.0/12',
4225 'IL': '79.176.0.0/13',
4226 'IM': '5.62.80.0/20',
4227 'IN': '117.192.0.0/10',
4228 'IO': '203.83.48.0/21',
4229 'IQ': '37.236.0.0/14',
4230 'IR': '2.176.0.0/12',
4231 'IS': '82.221.0.0/16',
4232 'IT': '79.0.0.0/10',
4233 'JE': '87.244.64.0/18',
4234 'JM': '72.27.0.0/17',
4235 'JO': '176.29.0.0/16',
4236 'JP': '133.0.0.0/8',
4237 'KE': '105.48.0.0/12',
4238 'KG': '158.181.128.0/17',
4239 'KH': '36.37.128.0/17',
4240 'KI': '103.25.140.0/22',
4241 'KM': '197.255.224.0/20',
4242 'KN': '198.167.192.0/19',
4243 'KP': '175.45.176.0/22',
4244 'KR': '175.192.0.0/10',
4245 'KW': '37.36.0.0/14',
4246 'KY': '64.96.0.0/15',
4247 'KZ': '2.72.0.0/13',
4248 'LA': '115.84.64.0/18',
4249 'LB': '178.135.0.0/16',
4250 'LC': '24.92.144.0/20',
4251 'LI': '82.117.0.0/19',
4252 'LK': '112.134.0.0/15',
4253 'LR': '102.183.0.0/16',
4254 'LS': '129.232.0.0/17',
4255 'LT': '78.56.0.0/13',
4256 'LU': '188.42.0.0/16',
4257 'LV': '46.109.0.0/16',
4258 'LY': '41.252.0.0/14',
4259 'MA': '105.128.0.0/11',
4260 'MC': '88.209.64.0/18',
4261 'MD': '37.246.0.0/16',
4262 'ME': '178.175.0.0/17',
4263 'MF': '74.112.232.0/21',
4264 'MG': '154.126.0.0/17',
4265 'MH': '117.103.88.0/21',
4266 'MK': '77.28.0.0/15',
4267 'ML': '154.118.128.0/18',
4268 'MM': '37.111.0.0/17',
4269 'MN': '49.0.128.0/17',
4270 'MO': '60.246.0.0/16',
4271 'MP': '202.88.64.0/20',
4272 'MQ': '109.203.224.0/19',
4273 'MR': '41.188.64.0/18',
4274 'MS': '208.90.112.0/22',
4275 'MT': '46.11.0.0/16',
4276 'MU': '105.16.0.0/12',
4277 'MV': '27.114.128.0/18',
4278 'MW': '102.70.0.0/15',
4279 'MX': '187.192.0.0/11',
4280 'MY': '175.136.0.0/13',
4281 'MZ': '197.218.0.0/15',
4282 'NA': '41.182.0.0/16',
4283 'NC': '101.101.0.0/18',
4284 'NE': '197.214.0.0/18',
4285 'NF': '203.17.240.0/22',
4286 'NG': '105.112.0.0/12',
4287 'NI': '186.76.0.0/15',
4288 'NL': '145.96.0.0/11',
4289 'NO': '84.208.0.0/13',
4290 'NP': '36.252.0.0/15',
4291 'NR': '203.98.224.0/19',
4292 'NU': '49.156.48.0/22',
4293 'NZ': '49.224.0.0/14',
4294 'OM': '5.36.0.0/15',
4295 'PA': '186.72.0.0/15',
4296 'PE': '186.160.0.0/14',
4297 'PF': '123.50.64.0/18',
4298 'PG': '124.240.192.0/19',
4299 'PH': '49.144.0.0/13',
4300 'PK': '39.32.0.0/11',
4301 'PL': '83.0.0.0/11',
4302 'PM': '70.36.0.0/20',
4303 'PR': '66.50.0.0/16',
4304 'PS': '188.161.0.0/16',
4305 'PT': '85.240.0.0/13',
4306 'PW': '202.124.224.0/20',
4307 'PY': '181.120.0.0/14',
4308 'QA': '37.210.0.0/15',
4309 'RE': '102.35.0.0/16',
4310 'RO': '79.112.0.0/13',
4311 'RS': '93.86.0.0/15',
4312 'RU': '5.136.0.0/13',
4313 'RW': '41.186.0.0/16',
4314 'SA': '188.48.0.0/13',
4315 'SB': '202.1.160.0/19',
4316 'SC': '154.192.0.0/11',
4317 'SD': '102.120.0.0/13',
4318 'SE': '78.64.0.0/12',
4319 'SG': '8.128.0.0/10',
4320 'SI': '188.196.0.0/14',
4321 'SK': '78.98.0.0/15',
4322 'SL': '102.143.0.0/17',
4323 'SM': '89.186.32.0/19',
4324 'SN': '41.82.0.0/15',
4325 'SO': '154.115.192.0/18',
4326 'SR': '186.179.128.0/17',
4327 'SS': '105.235.208.0/21',
4328 'ST': '197.159.160.0/19',
4329 'SV': '168.243.0.0/16',
4330 'SX': '190.102.0.0/20',
4331 'SY': '5.0.0.0/16',
4332 'SZ': '41.84.224.0/19',
4333 'TC': '65.255.48.0/20',
4334 'TD': '154.68.128.0/19',
4335 'TG': '196.168.0.0/14',
4336 'TH': '171.96.0.0/13',
4337 'TJ': '85.9.128.0/18',
4338 'TK': '27.96.24.0/21',
4339 'TL': '180.189.160.0/20',
4340 'TM': '95.85.96.0/19',
4341 'TN': '197.0.0.0/11',
4342 'TO': '175.176.144.0/21',
4343 'TR': '78.160.0.0/11',
4344 'TT': '186.44.0.0/15',
4345 'TV': '202.2.96.0/19',
4346 'TW': '120.96.0.0/11',
4347 'TZ': '156.156.0.0/14',
4348 'UA': '37.52.0.0/14',
4349 'UG': '102.80.0.0/13',
4350 'US': '6.0.0.0/8',
4351 'UY': '167.56.0.0/13',
4352 'UZ': '84.54.64.0/18',
4353 'VA': '212.77.0.0/19',
4354 'VC': '207.191.240.0/21',
4355 'VE': '186.88.0.0/13',
4356 'VG': '66.81.192.0/20',
4357 'VI': '146.226.0.0/16',
4358 'VN': '14.160.0.0/11',
4359 'VU': '202.80.32.0/20',
4360 'WF': '117.20.32.0/21',
4361 'WS': '202.4.32.0/19',
4362 'YE': '134.35.0.0/16',
4363 'YT': '41.242.116.0/22',
4364 'ZA': '41.0.0.0/11',
4365 'ZM': '102.144.0.0/13',
4366 'ZW': '102.177.192.0/18',
4367 }
4368
4369 @classmethod
4370 def random_ipv4(cls, code_or_block):
4371 if len(code_or_block) == 2:
4372 block = cls._country_ip_map.get(code_or_block.upper())
4373 if not block:
4374 return None
4375 else:
4376 block = code_or_block
4377 addr, preflen = block.split('/')
4378 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4379 addr_max = addr_min | (0xffffffff >> int(preflen))
4380 return compat_str(socket.inet_ntoa(
4381 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4382
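# Worked example of the CIDR arithmetic above, for the 'US' block 6.0.0.0/8:
# addr_min = 0x06000000 and the host mask is 0xffffffff >> 8 = 0x00ffffff,
# so addr_max = 0x06ffffff and the result is a random address in
# 6.0.0.0-6.255.255.255. An explicit block such as '139.59.0.0/16' may be
# passed instead of a two-letter country code.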
4383
4384 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4385 def __init__(self, proxies=None):
4386 # Set default handlers
4387 for type in ('http', 'https'):
4388 setattr(self, '%s_open' % type,
4389 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4390 meth(r, proxy, type))
4391 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4392
4393 def proxy_open(self, req, proxy, type):
4394 req_proxy = req.headers.get('Ytdl-request-proxy')
4395 if req_proxy is not None:
4396 proxy = req_proxy
4397 del req.headers['Ytdl-request-proxy']
4398
4399 if proxy == '__noproxy__':
4400 return None # No Proxy
4401 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4402 req.add_header('Ytdl-socks-proxy', proxy)
4403 # yt-dlp's http/https handlers take care of wrapping the socket with SOCKS
4404 return None
4405 return compat_urllib_request.ProxyHandler.proxy_open(
4406 self, req, proxy, type)
4407
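# Illustrative sketch (hypothetical proxy URLs): install the handler globally,
# then override the proxy for a single request via the private header.
#   >>> opener = compat_urllib_request.build_opener(
#   ...     PerRequestProxyHandler({'http': 'http://proxy.example:3128'}))
#   >>> req = compat_urllib_request.Request('http://example.com')
#   >>> req.add_header('Ytdl-request-proxy', '__noproxy__')  # bypass the proxy for this request only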
4408
4409 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4410 # released into Public Domain
4411 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4412
4413 def long_to_bytes(n, blocksize=0):
4414 """long_to_bytes(n:long, blocksize:int) : string
4415 Convert a long integer to a byte string.
4416
4417 If optional blocksize is given and greater than zero, pad the front of the
4418 byte string with binary zeros so that the length is a multiple of
4419 blocksize.
4420 """
4421 # after much testing, this algorithm was deemed to be the fastest
4422 s = b''
4423 n = int(n)
4424 while n > 0:
4425 s = compat_struct_pack('>I', n & 0xffffffff) + s
4426 n = n >> 32
4427 # strip off leading zeros
4428 for i in range(len(s)):
4429 if s[i] != b'\000'[0]:
4430 break
4431 else:
4432 # only happens when n == 0
4433 s = b'\000'
4434 i = 0
4435 s = s[i:]
4436 # add back some pad bytes. this could be done more efficiently w.r.t. the
4437 # de-padding being done above, but sigh...
4438 if blocksize > 0 and len(s) % blocksize:
4439 s = (blocksize - len(s) % blocksize) * b'\000' + s
4440 return s
4441
4442
4443 def bytes_to_long(s):
4444 """bytes_to_long(string) : long
4445 Convert a byte string to a long integer.
4446
4447 This is (essentially) the inverse of long_to_bytes().
4448 """
4449 acc = 0
4450 length = len(s)
4451 if length % 4:
4452 extra = (4 - length % 4)
4453 s = b'\000' * extra + s
4454 length = length + extra
4455 for i in range(0, length, 4):
4456 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4457 return acc
4458
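# Illustrative round trip:
#   >>> bytes_to_long(b'\x01\x00')
#   256
#   >>> long_to_bytes(256)
#   b'\x01\x00'
#   >>> long_to_bytes(256, blocksize=4)
#   b'\x00\x00\x01\x00'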
4459
4460 def ohdave_rsa_encrypt(data, exponent, modulus):
4461 '''
4462 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4463
4464 Input:
4465 data: data to encrypt, bytes-like object
4466 exponent, modulus: parameter e and N of RSA algorithm, both integer
4467 Output: hex string of encrypted data
4468
4469 Limitation: supports one block encryption only
4470 '''
4471
4472 payload = int(binascii.hexlify(data[::-1]), 16)
4473 encrypted = pow(payload, exponent, modulus)
4474 return '%x' % encrypted
4475
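# Illustrative sketch with toy RSA parameters (e=17, N=3233=61*53; real sites
# use a full-size modulus). The data is reversed, hexlified and interpreted as
# an integer before the modular exponentiation:
#   >>> ohdave_rsa_encrypt(b'\x02', 17, 3233)  # pow(2, 17, 3233) == 1752
#   '6d8'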
4476
4477 def pkcs1pad(data, length):
4478 """
4479 Padding input data with PKCS#1 scheme
4480
4481 @param {int[]} data input data
4482 @param {int} length target length
4483 @returns {int[]} padded data
4484 """
4485 if len(data) > length - 11:
4486 raise ValueError('Input data too long for PKCS#1 padding')
4487
4488 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)] # PKCS#1 v1.5 requires nonzero padding bytes
4489 return [0, 2] + pseudo_random + [0] + data
4490
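# Illustrative sketch: padding a single byte to a 16-byte block. Only the
# layout is deterministic; the filler bytes in the middle are random.
#   >>> padded = pkcs1pad([0x41], 16)
#   >>> padded[:2], padded[-2:], len(padded)
#   ([0, 2], [0, 65], 16)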
4491
4492 def encode_base_n(num, n, table=None):
4493 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4494 if not table:
4495 table = FULL_TABLE[:n]
4496
4497 if n > len(table):
4498 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4499
4500 if num == 0:
4501 return table[0]
4502
4503 ret = ''
4504 while num:
4505 ret = table[num % n] + ret
4506 num = num // n
4507 return ret
4508
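# Illustrative conversions:
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> encode_base_n(0, 2)
#   '0'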
4509
4510 def decode_packed_codes(code):
4511 mobj = re.search(PACKED_CODES_RE, code)
4512 obfuscated_code, base, count, symbols = mobj.groups()
4513 base = int(base)
4514 count = int(count)
4515 symbols = symbols.split('|')
4516 symbol_table = {}
4517
4518 while count:
4519 count -= 1
4520 base_n_count = encode_base_n(count, base)
4521 symbol_table[base_n_count] = symbols[count] or base_n_count
4522
4523 return re.sub(
4524 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4525 obfuscated_code)
4526
4527
4528 def caesar(s, alphabet, shift):
4529 if shift == 0:
4530 return s
4531 l = len(alphabet)
4532 return ''.join(
4533 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4534 for c in s)
4535
4536
4537 def rot47(s):
4538 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4539
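# Illustrative sketches: caesar() shifts only characters present in the given
# alphabet, and rot47 is its own inverse (shift 47 over a 94-character alphabet).
#   >>> caesar('ab-c', 'abcdefghijklmnopqrstuvwxyz', 1)
#   'bc-d'
#   >>> rot47(rot47('secret')) == 'secret'
#   True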
4540
4541 def parse_m3u8_attributes(attrib):
4542 info = {}
4543 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4544 if val.startswith('"'):
4545 val = val[1:-1]
4546 info[key] = val
4547 return info
4548
4549
4550 def urshift(val, n):
4551 return val >> n if val >= 0 else (val + 0x100000000) >> n
4552
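# urshift() emulates JavaScript's unsigned right shift (>>>) for 32-bit values:
#   >>> urshift(-1, 1)  # (-1 + 0x100000000) >> 1 == 0xffffffff >> 1
#   2147483647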
4553
4554 # Based on png2str() written by @gdkchan and improved by @yokrysty
4555 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4556 def decode_png(png_data):
4557 # Reference: https://www.w3.org/TR/PNG/
4558 header = png_data[8:]
4559
4560 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4561 raise OSError('Not a valid PNG file.')
4562
4563 int_map = {1: '>B', 2: '>H', 4: '>I'}
4564 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4565
4566 chunks = []
4567
4568 while header:
4569 length = unpack_integer(header[:4])
4570 header = header[4:]
4571
4572 chunk_type = header[:4]
4573 header = header[4:]
4574
4575 chunk_data = header[:length]
4576 header = header[length:]
4577
4578 header = header[4:] # Skip CRC
4579
4580 chunks.append({
4581 'type': chunk_type,
4582 'length': length,
4583 'data': chunk_data
4584 })
4585
4586 ihdr = chunks[0]['data']
4587
4588 width = unpack_integer(ihdr[:4])
4589 height = unpack_integer(ihdr[4:8])
4590
4591 idat = b''
4592
4593 for chunk in chunks:
4594 if chunk['type'] == b'IDAT':
4595 idat += chunk['data']
4596
4597 if not idat:
4598 raise OSError('Unable to read PNG data.')
4599
4600 decompressed_data = bytearray(zlib.decompress(idat))
4601
4602 stride = width * 3
4603 pixels = []
4604
4605 def _get_pixel(idx):
4606 x = idx % stride
4607 y = idx // stride
4608 return pixels[y][x]
4609
4610 for y in range(height):
4611 basePos = y * (1 + stride)
4612 filter_type = decompressed_data[basePos]
4613
4614 current_row = []
4615
4616 pixels.append(current_row)
4617
4618 for x in range(stride):
4619 color = decompressed_data[1 + basePos + x]
4620 basex = y * stride + x
4621 left = 0
4622 up = 0
4623
4624 if x > 2:
4625 left = _get_pixel(basex - 3)
4626 if y > 0:
4627 up = _get_pixel(basex - stride)
4628
4629 if filter_type == 1: # Sub
4630 color = (color + left) & 0xff
4631 elif filter_type == 2: # Up
4632 color = (color + up) & 0xff
4633 elif filter_type == 3: # Average
4634 color = (color + ((left + up) >> 1)) & 0xff
4635 elif filter_type == 4: # Paeth
4636 a = left
4637 b = up
4638 c = 0
4639
4640 if x > 2 and y > 0:
4641 c = _get_pixel(basex - stride - 3)
4642
4643 p = a + b - c
4644
4645 pa = abs(p - a)
4646 pb = abs(p - b)
4647 pc = abs(p - c)
4648
4649 if pa <= pb and pa <= pc:
4650 color = (color + a) & 0xff
4651 elif pb <= pc:
4652 color = (color + b) & 0xff
4653 else:
4654 color = (color + c) & 0xff
4655
4656 current_row.append(color)
4657
4658 return width, height, pixels
4659
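# Illustrative usage sketch (assumes `png_bytes` holds a complete 8-bit RGB
# PNG, the only layout this decoder handles; each row is a flat byte list of
# length width * 3):
#   >>> width, height, pixels = decode_png(png_bytes)
#   >>> r, g, b = pixels[0][0:3]  # channel bytes of the top-left pixel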
4660
4661 def write_xattr(path, key, value):
4662 # This mess below finds the best xattr tool for the job
4663 try:
4664 # try the pyxattr module...
4665 import xattr
4666
4667 if hasattr(xattr, 'set'): # pyxattr
4668 # Unicode arguments are not supported in python-pyxattr until
4669 # version 0.5.0
4670 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4671 pyxattr_required_version = '0.5.0'
4672 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
4673 # TODO: fallback to CLI tools
4674 raise XAttrUnavailableError(
4675 'python-pyxattr is detected but is too old. '
4676 'yt-dlp requires %s or above while your version is %s. '
4677 'Falling back to other xattr implementations' % (
4678 pyxattr_required_version, xattr.__version__))
4679
4680 setxattr = xattr.set
4681 else: # xattr
4682 setxattr = xattr.setxattr
4683
4684 try:
4685 setxattr(path, key, value)
4686 except OSError as e:
4687 raise XAttrMetadataError(e.errno, e.strerror)
4688
4689 except ImportError:
4690 if compat_os_name == 'nt':
4691 # Write xattrs to NTFS Alternate Data Streams:
4692 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4693 assert ':' not in key
4694 assert os.path.exists(path)
4695
4696 ads_fn = path + ':' + key
4697 try:
4698 with open(ads_fn, 'wb') as f:
4699 f.write(value)
4700 except OSError as e:
4701 raise XAttrMetadataError(e.errno, e.strerror)
4702 else:
4703 user_has_setfattr = check_executable('setfattr', ['--version'])
4704 user_has_xattr = check_executable('xattr', ['-h'])
4705
4706 if user_has_setfattr or user_has_xattr:
4707
4708 value = value.decode('utf-8')
4709 if user_has_setfattr:
4710 executable = 'setfattr'
4711 opts = ['-n', key, '-v', value]
4712 elif user_has_xattr:
4713 executable = 'xattr'
4714 opts = ['-w', key, value]
4715
4716 cmd = ([encodeFilename(executable, True)]
4717 + [encodeArgument(o) for o in opts]
4718 + [encodeFilename(path, True)])
4719
4720 try:
4721 p = Popen(
4722 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4723 except OSError as e:
4724 raise XAttrMetadataError(e.errno, e.strerror)
4725 stdout, stderr = p.communicate_or_kill()
4726 stderr = stderr.decode('utf-8', 'replace')
4727 if p.returncode != 0:
4728 raise XAttrMetadataError(p.returncode, stderr)
4729
4730 else:
4731 # On Unix, but couldn't find pyxattr, setfattr, or xattr.
4732 if sys.platform.startswith('linux'):
4733 raise XAttrUnavailableError(
4734 "Couldn't find a tool to set the xattrs. "
4735 "Install either the python 'pyxattr' or 'xattr' "
4736 "modules, or the GNU 'attr' package "
4737 "(which contains the 'setfattr' tool).")
4738 else:
4739 raise XAttrUnavailableError(
4740 "Couldn't find a tool to set the xattrs. "
4741 "Install either the python 'xattr' module, "
4742 "or the 'xattr' binary.")
4743
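# Illustrative call (hypothetical path; on Linux, user xattrs live in the
# 'user.' namespace, while on Windows the value lands in an NTFS alternate
# data stream named after the key):
#   >>> write_xattr('video.mp4', 'user.xdg.referrer.url', b'https://example.com/')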
4744
4745 def random_birthday(year_field, month_field, day_field):
4746 start_date = datetime.date(1950, 1, 1)
4747 end_date = datetime.date(1995, 12, 31)
4748 offset = random.randint(0, (end_date - start_date).days)
4749 random_date = start_date + datetime.timedelta(offset)
4750 return {
4751 year_field: str(random_date.year),
4752 month_field: str(random_date.month),
4753 day_field: str(random_date.day),
4754 }
4755
4756
4757 # Templates for internet shortcut files, which are plain text files.
4758 DOT_URL_LINK_TEMPLATE = '''\
4759 [InternetShortcut]
4760 URL=%(url)s
4761 '''
4762
4763 DOT_WEBLOC_LINK_TEMPLATE = '''\
4764 <?xml version="1.0" encoding="UTF-8"?>
4765 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4766 <plist version="1.0">
4767 <dict>
4768 \t<key>URL</key>
4769 \t<string>%(url)s</string>
4770 </dict>
4771 </plist>
4772 '''
4773
4774 DOT_DESKTOP_LINK_TEMPLATE = '''\
4775 [Desktop Entry]
4776 Encoding=UTF-8
4777 Name=%(filename)s
4778 Type=Link
4779 URL=%(url)s
4780 Icon=text-html
4781 '''
4782
4783 LINK_TEMPLATES = {
4784 'url': DOT_URL_LINK_TEMPLATE,
4785 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4786 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4787 }
4788
4789
4790 def iri_to_uri(iri):
4791 """
4792 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4793
4794 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4795 """
4796
4797 iri_parts = compat_urllib_parse_urlparse(iri)
4798
4799 if '[' in iri_parts.netloc:
4800 raise ValueError('IPv6 URIs are not yet supported.')
4801 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4802
4803 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4804
4805 net_location = ''
4806 if iri_parts.username:
4807 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
4808 if iri_parts.password is not None:
4809 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
4810 net_location += '@'
4811
4812 net_location += iri_parts.hostname.encode('idna').decode('utf-8') # Punycode for Unicode hostnames.
4813 # The 'idna' encoding produces ASCII text.
4814 if iri_parts.port is not None and iri_parts.port != 80:
4815 net_location += ':' + str(iri_parts.port)
4816
4817 return urllib.parse.urlunparse(
4818 (iri_parts.scheme,
4819 net_location,
4820
4821 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
4822
4823 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
4824 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
4825
4826 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
4827 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
4828
4829 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4830
4831 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4832
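# Illustrative conversion (hypothetical IRI): the Unicode hostname is
# punycoded while the path is percent-encoded as UTF-8.
#   >>> iri_to_uri('https://müller.example/päth')
#   'https://xn--mller-kva.example/p%C3%A4th'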
4833
4834 def to_high_limit_path(path):
4835 if sys.platform in ['win32', 'cygwin']:
4836 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
4837 return '\\\\?\\' + os.path.abspath(path)
4838
4839 return path
4840
4841
4842 def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
4843 val = traverse_obj(obj, *variadic(field))
4844 if val in ignore:
4845 return default
4846 return template % (func(val) if func else val)
4847
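# Illustrative sketches:
#   >>> format_field({'height': 1080}, 'height', '%sp')
#   '1080p'
#   >>> format_field({'height': None}, 'height', '%sp', default='unknown')
#   'unknown'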
4848
4849 def clean_podcast_url(url):
4850 return re.sub(r'''(?x)
4851 (?:
4852 (?:
4853 chtbl\.com/track|
4854 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
4855 play\.podtrac\.com
4856 )/[^/]+|
4857 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4858 flex\.acast\.com|
4859 pd(?:
4860 cn\.co| # https://podcorn.com/analytics-prefix/
4861 st\.fm # https://podsights.com/docs/
4862 )/e
4863 )/''', '', url)
4864
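# Illustrative sketch (hypothetical URL): the measurement prefix is stripped,
# leaving the direct media URL.
#   >>> clean_podcast_url('https://chtbl.com/track/12345/traffic.example.com/ep1.mp3')
#   'https://traffic.example.com/ep1.mp3'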
4865
4866 _HEX_TABLE = '0123456789abcdef'
4867
4868
4869 def random_uuidv4():
4870 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4871
4872
4873 def make_dir(path, to_screen=None):
4874 try:
4875 dn = os.path.dirname(path)
4876 if dn and not os.path.exists(dn):
4877 os.makedirs(dn)
4878 return True
4879 except OSError as err:
4880 if callable(to_screen):
4881 to_screen('unable to create directory ' + error_to_compat_str(err))
4882 return False
4883
4884
4885 def get_executable_path():
4886 from zipimport import zipimporter
4887 if hasattr(sys, 'frozen'): # Running from PyInstaller
4888 path = os.path.dirname(sys.executable)
4889 elif isinstance(__loader__, zipimporter): # Running from ZIP
4890 path = os.path.join(os.path.dirname(__file__), '../..')
4891 else:
4892 path = os.path.join(os.path.dirname(__file__), '..')
4893 return os.path.abspath(path)
4894
4895
4896 def load_plugins(name, suffix, namespace):
4897 classes = {}
4898 with contextlib.suppress(FileNotFoundError):
4899 plugins_spec = importlib.util.spec_from_file_location(
4900 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
4901 plugins = importlib.util.module_from_spec(plugins_spec)
4902 sys.modules[plugins_spec.name] = plugins
4903 plugins_spec.loader.exec_module(plugins)
4904 for name in dir(plugins):
4905 if name in namespace:
4906 continue
4907 if not name.endswith(suffix):
4908 continue
4909 klass = getattr(plugins, name)
4910 classes[name] = namespace[name] = klass
4911 return classes
4912
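# Illustrative sketch mirroring how the extractor loader consumes this helper
# (plugin extractor classes follow the '*IE' naming convention):
#   >>> _PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals())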
4913
4914 def traverse_obj(
4915 obj, *path_list, default=None, expected_type=None, get_all=True,
4916 casesense=True, is_user_input=False, traverse_string=False):
4917 ''' Traverse nested list/dict/tuple
4918 @param path_list A list of paths which are checked one by one.
4919 Each path is a list of keys where each key is a:
4920 - None: Do nothing
4921 - string: A dictionary key
4922 - int: An index into a list
4923 - tuple: A list of keys all of which will be traversed
4924 - Ellipsis: Fetch all values in the object
4925 - Function: Takes the key and value as arguments
4926 and returns whether the key matches or not
4927 @param default Default value to return
4928 @param expected_type Only accept final value of this type (Can also be any callable)
4929 @param get_all Return all the values obtained from a path or only the first one
4930 @param casesense Whether to consider dictionary keys as case sensitive
4931 @param is_user_input Whether the keys are generated from user input. If True,
4932 strings are converted to int/slice if necessary
4933 @param traverse_string Whether to traverse inside strings. If True, any
4934 non-compatible object will also be converted into a string
4935 # TODO: Write tests
4936 '''
4937 if not casesense:
4938 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
4939 path_list = (map(_lower, variadic(path)) for path in path_list)
4940
4941 def _traverse_obj(obj, path, _current_depth=0):
4942 nonlocal depth
4943 path = tuple(variadic(path))
4944 for i, key in enumerate(path):
4945 if None in (key, obj):
4946 return obj
4947 if isinstance(key, (list, tuple)):
4948 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
4949 key = ...
4950 if key is ...:
4951 obj = (obj.values() if isinstance(obj, dict)
4952 else obj if isinstance(obj, (list, tuple, LazyList))
4953 else str(obj) if traverse_string else [])
4954 _current_depth += 1
4955 depth = max(depth, _current_depth)
4956 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
4957 elif callable(key):
4958 if isinstance(obj, (list, tuple, LazyList)):
4959 obj = enumerate(obj)
4960 elif isinstance(obj, dict):
4961 obj = obj.items()
4962 else:
4963 if not traverse_string:
4964 return None
4965 obj = str(obj)
4966 _current_depth += 1
4967 depth = max(depth, _current_depth)
4968 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
4969 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
4970 obj = (obj.get(key) if casesense or (key in obj)
4971 else next((v for k, v in obj.items() if _lower(k) == key), None))
4972 else:
4973 if is_user_input:
4974 key = (int_or_none(key) if ':' not in key
4975 else slice(*map(int_or_none, key.split(':'))))
4976 if key == slice(None):
4977 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
4978 if not isinstance(key, (int, slice)):
4979 return None
4980 if not isinstance(obj, (list, tuple, LazyList)):
4981 if not traverse_string:
4982 return None
4983 obj = str(obj)
4984 try:
4985 obj = obj[key]
4986 except IndexError:
4987 return None
4988 return obj
4989
4990 if isinstance(expected_type, type):
4991 type_test = lambda val: val if isinstance(val, expected_type) else None
4992 elif expected_type is not None:
4993 type_test = expected_type
4994 else:
4995 type_test = lambda val: val
4996
4997 for path in path_list:
4998 depth = 0
4999 val = _traverse_obj(obj, path)
5000 if val is not None:
5001 if depth:
5002 for _ in range(depth - 1):
5003 val = itertools.chain.from_iterable(v for v in val if v is not None)
5004 val = [v for v in map(type_test, val) if v is not None]
5005 if val:
5006 return val if get_all else val[0]
5007 else:
5008 val = type_test(val)
5009 if val is not None:
5010 return val
5011 return default
5012
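# Illustrative sketches of the path mini-language (see the docstring above):
#   >>> d = {'formats': [{'url': 'https://a.example'}, {'url': 'https://b.example'}, {}]}
#   >>> traverse_obj(d, ('formats', 0, 'url'))
#   'https://a.example'
#   >>> traverse_obj(d, ('formats', ..., 'url'))
#   ['https://a.example', 'https://b.example']
#   >>> traverse_obj(d, ('formats', ..., 'url'), get_all=False)
#   'https://a.example'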
5013
5014 def traverse_dict(dictn, keys, casesense=True):
5015 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5016 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5017 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5018
5019
5020 def get_first(obj, keys, **kwargs):
5021 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5022
5023
5024 def variadic(x, allowed_types=(str, bytes, dict)):
5025 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5026
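# Illustrative behaviour: strings, bytes and dicts are wrapped, other
# iterables are passed through unchanged.
#   >>> variadic('spam')
#   ('spam',)
#   >>> variadic(['spam', 'eggs'])
#   ['spam', 'eggs']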
5027
5028 def decode_base(value, digits):
5029 # Convert the given base-x string to an integer
5030 table = {char: index for index, char in enumerate(digits)}
5031 result = 0
5032 base = len(digits)
5033 for char in value: # 'char', not 'chr', to avoid shadowing the builtin
5034 result *= base
5035 result += table[char]
5036 return result
5037
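# Illustrative inverse of encode_base_n above:
#   >>> decode_base('ff', '0123456789abcdef')
#   255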
5038
5039 def time_seconds(**kwargs):
5040 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5041 return t.timestamp()
5042
5043
5044 # create a JSON Web Signature (jws) with HS256 algorithm
5045 # the resulting format is in JWS Compact Serialization
5046 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5047 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5048 def jwt_encode_hs256(payload_data, key, headers={}):
5049 header_data = {
5050 'alg': 'HS256',
5051 'typ': 'JWT',
5052 }
5053 if headers:
5054 header_data.update(headers)
5055 header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
5056 payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
5057 h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
5058 signature_b64 = base64.b64encode(h.digest())
5059 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5060 return token
5061
5062
5063 # Can be extended in the future to verify the signature, parse the header and return the algorithm used if it's not HS256
5064 def jwt_decode_hs256(jwt):
5065 header_b64, payload_b64, signature_b64 = jwt.split('.')
5066 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5067 return payload_data
5068
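# Illustrative round trip (hypothetical payload and key). Note that
# jwt_encode_hs256 returns bytes while jwt_decode_hs256 expects a str:
#   >>> token = jwt_encode_hs256({'user': 'test'}, 'secret')
#   >>> jwt_decode_hs256(token.decode('utf-8'))
#   {'user': 'test'}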
5069
5070 def supports_terminal_sequences(stream):
5071 if compat_os_name == 'nt':
5072 from .compat import WINDOWS_VT_MODE # Must be imported locally
5073 if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
5074 return False
5075 elif not os.getenv('TERM'):
5076 return False
5077 try:
5078 return stream.isatty()
5079 except BaseException:
5080 return False
5081
5082
5083 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5084
5085
5086 def remove_terminal_sequences(string):
5087 return _terminal_sequences_re.sub('', string)
5088
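# Illustrative sketch: stripping ANSI SGR color codes.
#   >>> remove_terminal_sequences('\033[0;31mred\033[0m')
#   'red'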
5089
5090 def number_of_digits(number):
5091 return len('%d' % number)
5092
5093
5094 def join_nonempty(*values, delim='-', from_dict=None):
5095 if from_dict is not None:
5096 values = map(from_dict.get, values)
5097 return delim.join(map(str, filter(None, values)))
5098
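# Illustrative sketches (note that filter(None) also drops 0 and ''):
#   >>> join_nonempty('mp4', None, 'dash')
#   'mp4-dash'
#   >>> join_nonempty('width', 'height', from_dict={'width': 1920, 'height': 1080}, delim='x')
#   '1920x1080'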
5099
5100 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5101 """
5102 Find the largest format dimensions in terms of video width and, for each thumbnail:
5103 * Modify the URL: Match the width with the provided regex and replace it with the largest format width
5104 * Update dimensions
5105
5106 This function is useful with video services that scale the provided thumbnails on demand
5107 """
5108 _keys = ('width', 'height')
5109 max_dimensions = max(
5110 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5111 default=(0, 0))
5112 if not max_dimensions[0]:
5113 return thumbnails
5114 return [
5115 merge_dicts(
5116 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5117 dict(zip(_keys, max_dimensions)), thumbnail)
5118 for thumbnail in thumbnails
5119 ]
5120
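# Illustrative sketch (hypothetical thumbnail service that encodes the width
# in the URL path):
#   >>> scale_thumbnails_to_max_format_width(
#   ...     [{'width': 1920, 'height': 1080}],
#   ...     [{'url': 'https://img.example/320/thumb.jpg'}], r'(?<=/)\d+(?=/)')
#   [{'url': 'https://img.example/1920/thumb.jpg', 'width': 1920, 'height': 1080}]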
5121
5122 def parse_http_range(range):
5123 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5124 if not range:
5125 return None, None, None
5126 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5127 if not crg:
5128 return None, None, None
5129 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5130
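# Illustrative parses of the two header forms:
#   >>> parse_http_range('bytes=0-499')
#   (0, 499, None)
#   >>> parse_http_range('bytes 500-999/1234')
#   (500, 999, 1234)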
5131
5132 class Config:
5133 own_args = None
5134 filename = None
5135 __initialized = False
5136
5137 def __init__(self, parser, label=None):
5138 self._parser, self.label = parser, label
5139 self._loaded_paths, self.configs = set(), []
5140
5141 def init(self, args=None, filename=None):
5142 assert not self.__initialized
5143 directory = ''
5144 if filename:
5145 location = os.path.realpath(filename)
5146 directory = os.path.dirname(location)
5147 if location in self._loaded_paths:
5148 return False
5149 self._loaded_paths.add(location)
5150
5151 self.__initialized = True
5152 self.own_args, self.filename = args, filename
5153 for location in self._parser.parse_args(args)[0].config_locations or []:
5154 location = os.path.join(directory, expand_path(location))
5155 if os.path.isdir(location):
5156 location = os.path.join(location, 'yt-dlp.conf')
5157 if not os.path.exists(location):
5158 self._parser.error(f'config location {location} does not exist')
5159 self.append_config(self.read_file(location), location)
5160 return True
5161
5162 def __str__(self):
5163 label = join_nonempty(
5164 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5165 delim=' ')
5166 return join_nonempty(
5167 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5168 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5169 delim='\n')
5170
5171 @staticmethod
5172 def read_file(filename, default=[]):
5173 try:
5174 optionf = open(filename)
5175 except OSError:
5176 return default # silently skip if file is not present
5177 try:
5178 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5179 contents = optionf.read()
5180 res = shlex.split(contents, comments=True)
5181 finally:
5182 optionf.close()
5183 return res
5184
5185 @staticmethod
5186 def hide_login_info(opts):
5187 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5188 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5189
5190 def _scrub_eq(o):
5191 m = eqre.match(o)
5192 if m:
5193 return m.group('key') + '=PRIVATE'
5194 else:
5195 return o
5196
5197 opts = list(map(_scrub_eq, opts))
5198 for idx, opt in enumerate(opts):
5199 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5200 opts[idx + 1] = 'PRIVATE'
5201 return opts
5202
5203 def append_config(self, *args, label=None):
5204 config = type(self)(self._parser, label)
5205 config._loaded_paths = self._loaded_paths
5206 if config.init(*args):
5207 self.configs.append(config)
5208
5209 @property
5210 def all_args(self):
5211 for config in reversed(self.configs):
5212 yield from config.all_args
5213 yield from self.own_args or []
5214
5215 def parse_args(self):
5216 return self._parser.parse_args(self.all_args)
5217
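# Illustrative sketch of the credential scrubbing applied when configs are printed:
#   >>> Config.hide_login_info(['-u', 'name', '--password=secret'])
#   ['-u', 'PRIVATE', '--password=PRIVATE']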
5218
5219 class WebSocketsWrapper:
5220 """Wraps websockets module to use in non-async scopes"""
5221 pool = None
5222
5223 def __init__(self, url, headers=None, connect=True):
5224 self.loop = asyncio.events.new_event_loop()
5225 # XXX: "loop" is deprecated
5226 self.conn = websockets.connect(
5227 url, extra_headers=headers, ping_interval=None,
5228 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5229 if connect:
5230 self.__enter__()
5231 atexit.register(self.__exit__, None, None, None)
5232
5233 def __enter__(self):
5234 if not self.pool:
5235 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5236 return self
5237
5238 def send(self, *args):
5239 self.run_with_loop(self.pool.send(*args), self.loop)
5240
5241 def recv(self, *args):
5242 return self.run_with_loop(self.pool.recv(*args), self.loop)
5243
5244 def __exit__(self, type, value, traceback):
5245 try:
5246 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5247 finally:
5248 self._cancel_all_tasks(self.loop) # cancel pending tasks first; a closed loop cannot run them
5249 self.loop.close()
5250
5251 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5252 # For contributors: if any new library that uses asyncio needs to run in a non-async context, move these functions out of this class
5253 @staticmethod
5254 def run_with_loop(main, loop):
5255 if not asyncio.coroutines.iscoroutine(main):
5256 raise ValueError(f'a coroutine was expected, got {main!r}')
5257
5258 try:
5259 return loop.run_until_complete(main)
5260 finally:
5261 loop.run_until_complete(loop.shutdown_asyncgens())
5262 if hasattr(loop, 'shutdown_default_executor'):
5263 loop.run_until_complete(loop.shutdown_default_executor())
5264
5265 @staticmethod
5266 def _cancel_all_tasks(loop):
5267 to_cancel = asyncio.tasks.all_tasks(loop)
5268
5269 if not to_cancel:
5270 return
5271
5272 for task in to_cancel:
5273 task.cancel()
5274
5275 # XXX: "loop" is removed in python 3.10+
5276 loop.run_until_complete(
5277 asyncio.tasks.gather(*to_cancel, loop=loop, return_exceptions=True))
5278
5279 for task in to_cancel:
5280 if task.cancelled():
5281 continue
5282 if task.exception() is not None:
5283 loop.call_exception_handler({
5284 'message': 'unhandled exception during asyncio.run() shutdown',
5285 'exception': task.exception(),
5286 'task': task,
5287 })
5288
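# Illustrative usage sketch (hypothetical endpoint): the wrapper drives the
# async websockets API from synchronous code.
#   >>> ws = WebSocketsWrapper('wss://ws.example/live', headers={'Origin': 'https://example.com'})
#   >>> ws.send('{"op": "subscribe"}')
#   >>> message = ws.recv()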
5289
5290 def merge_headers(*dicts):
5291 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5292 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
5293
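# Illustrative merge: keys are normalized via str.title(), so they compare
# case-insensitively, and later dicts win.
#   >>> merge_headers({'user-agent': 'A', 'accept': '*/*'}, {'User-Agent': 'B'})
#   {'User-Agent': 'B', 'Accept': '*/*'}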
5294
5295 class classproperty:
5296 def __init__(self, f):
5297 self.f = f
5298
5299 def __get__(self, _, cls):
5300 return self.f(cls)
5301
5302
5303 def Namespace(**kwargs):
5304 return collections.namedtuple('Namespace', kwargs)(**kwargs)
5305
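# Illustrative sketches of the two helpers above (hypothetical class/fields):
#   >>> class MyIE:
#   ...     @classproperty
#   ...     def ie_key(cls):
#   ...         return cls.__name__[:-2]
#   >>> MyIE.ie_key
#   'My'
#   >>> ns = Namespace(AUDIO='audio', VIDEO='video')
#   >>> ns.AUDIO
#   'audio'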
5306
5307 # Deprecated
5308 has_certifi = bool(certifi)
5309 has_websockets = bool(websockets)