#!/usr/bin/env python3
import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.header
import email.utils
import errno
import functools
import gzip
import hashlib
import hmac
import importlib.util
import io
import itertools
import json
import locale
import math
import mimetypes
import operator
import os
import platform
import random
import re
import shlex
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import traceback
import urllib.parse
import xml.etree.ElementTree
import zlib

from .compat import (
    compat_brotli,
    compat_chr,
    compat_cookiejar,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_http_client,
    compat_HTTPError,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_quote,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse_unquote_plus,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_request,
    compat_urlparse,
    compat_websockets,
)
from .socks import ProxyType, sockssocket

try:
    import certifi
    has_certifi = True
except ImportError:
    has_certifi = False


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if compat_brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    tf = tempfile.NamedTemporaryFile(
        prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn),
        suffix='.tmp', delete=False, mode='w', encoding='utf-8')

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        try:
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        except OSError:
            pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


def find_xpath_attr(node, xpath, key, val=None):
    """ Find the xpath xpath[@key=val] """
    assert re.match(r'^[a-zA-Z_-]+$', key)
    expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']")
    return node.find(expr)

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
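
# Illustrative usage of xpath_with_ns (a sketch; the namespace URL below is
# hypothetical, not taken from this module):
#   >>> xpath_with_ns('media:content/media:title', {'media': 'http://example.com/ns'})
#   '{http://example.com/ns}content/{http://example.com/ns}title'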


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(xpath)

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = f'{xpath}[@{key}]' if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_html_by_id(id, html):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None
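
# Illustrative usage (a sketch with made-up markup): the class only needs to be
# one of the element's classes:
#   >>> get_element_by_class('foo', '<div class="foo bar">nice</div>')
#   'nice'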


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = rf'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q)
        '''

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
503 """
504 def find_or_raise(haystack, needle, exc):
505 try:
506 return haystack.index(needle)
507 except ValueError:
508 raise exc
509 closing_tag = f'</{tag}>'
510 whole_start = find_or_raise(
511 html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
512 content_start = find_or_raise(
513 html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
514 content_start += whole_start + 1
515 with HTMLBreakOnClosingTagParser() as parser:
516 parser.feed(html[whole_start:content_start])
517 if not parser.tagstack or parser.tagstack[0] != tag:
518 raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
519 offset = content_start
520 while offset < len(html):
521 next_closing_tag_start = find_or_raise(
522 html[offset:], closing_tag,
523 compat_HTMLParseError(f'closing {tag} tag not found'))
524 next_closing_tag_end = next_closing_tag_start + len(closing_tag)
525 try:
526 parser.feed(html[offset:offset + next_closing_tag_end])
527 offset += next_closing_tag_end
528 except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
529 return html[content_start:offset + next_closing_tag_start], \
530 html[whole_start:offset + next_closing_tag_end]
531 raise compat_HTMLParseError('unexpected end of html')
532
533
534 class HTMLAttributeParser(compat_HTMLParser):
535 """Trivial HTML parser to gather the attributes for a single element"""
536
537 def __init__(self):
538 self.attrs = {}
539 compat_HTMLParser.__init__(self)
540
541 def handle_starttag(self, tag, attrs):
542 self.attrs = dict(attrs)
543
544
545 class HTMLListAttrsParser(compat_HTMLParser):
546 """HTML parser to gather the attributes for the elements of a list"""
547
548 def __init__(self):
549 compat_HTMLParser.__init__(self)
550 self.items = []
551 self._level = 0
552
553 def handle_starttag(self, tag, attrs):
554 if tag == 'li' and self._level == 0:
555 self.items.append(dict(attrs))
556 self._level += 1
557
558 def handle_endtag(self, tag):
559 self._level -= 1
560
561
562 def extract_attributes(html_element):
563 """Given a string for an HTML element such as
564 <el
565 a="foo" B="bar" c="&98;az" d=boz
566 empty= noval entity="&amp;"
567 sq='"' dq="'"
568 >
569 Decode and return a dictionary of attributes.
570 {
571 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
572 'empty': '', 'noval': None, 'entity': '&',
573 'sq': '"', 'dq': '\''
574 }.
575 """
576 parser = HTMLAttributeParser()
577 try:
578 parser.feed(html_element)
579 parser.close()
580 # Older Python may throw HTMLParseError in case of malformed HTML
581 except compat_HTMLParseError:
582 pass
583 return parser.attrs
584
585
586 def parse_list(webpage):
587 """Given a string for an series of HTML <li> elements,
588 return a dictionary of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
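
# Illustrative usage (a sketch): tags are stripped, <br> becomes a newline and
# entities are decoded:
#   >>> clean_html('<p>a<br/>b &amp; c</p>')
#   'a\nb & c'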


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    if filename == '-':
        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)

    for attempt in range(2):
        try:
            try:
                if sys.platform == 'win32':
                    # FIXME: An exclusive lock also locks the file from being read.
                    # Since windows locks are mandatory, don't lock the file on windows (for now).
                    # Ref: https://github.com/yt-dlp/yt-dlp/issues/3124
                    raise LockingUnsupportedError()
                stream = locked_file(filename, open_mode, block=False).__enter__()
            except LockingUnsupportedError:
                stream = open(filename, open_mode)
            return (stream, filename)
        except OSError as err:
            if attempt or err.errno in (errno.EACCES,):
                raise
            old_filename, filename = filename, sanitize_path(filename)
            if old_filename == filename:
                raise


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
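
# Illustrative usage (a sketch): the epoch in RFC 2822 form maps to timestamp 0,
# and unparseable input yields None:
#   >>> timeconvert('Thu, 01 Jan 1970 00:00:00 +0000')
#   0
#   >>> timeconvert('not a date') is None
#   True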


def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT):
    """Sanitizes a string so it could be used as part of a filename.
    @param restricted   Use a stricter subset of allowed characters
    @param is_id        Whether this is an ID that should be kept unchanged if possible.
                        If unset, yt-dlp's new sanitization rules are in effect
    """
    if s == '':
        return ''

    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return '\0 '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '\0_\0-' if restricted else '\0 \0-'
        elif char in '\\/|*<>':
            return '\0_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or ord(char) > 127):
            return '\0_'
        return char

    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)  # Handle timestamps
    result = ''.join(map(replace_insane, s))
    if is_id is NO_DEFAULT:
        result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result)  # Remove repeated substitute chars
        STRIP_RE = '(?:\0.|[ _-])*'
        result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result)  # Remove substitute chars from start/end
    result = result.replace('\0', '') or '_'

    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
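
# Illustrative usage (a sketch, traced by hand against the rules above): colons
# in timestamps become underscores, and in restricted mode accents are folded
# while runs of non-ASCII characters collapse to a single '_':
#   >>> sanitize_filename('New World record at 0:12:34')
#   'New World record at 0_12_34'
#   >>> sanitize_filename('aäb中国的c', restricted=True)
#   'aab_c'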


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
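
# Illustrative usage (a sketch with made-up URLs):
#   >>> sanitize_url('//example.com/video')
#   'http://example.com/video'
#   >>> sanitize_url('httpss://example.com/video')
#   'https://example.com/video'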


def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
    return url, 'Basic ' + auth_payload.decode('utf-8')
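
# Illustrative usage (a sketch; the credentials are made up): the userinfo is
# stripped from the URL and returned as a ready-to-use Authorization value:
#   >>> extract_basic_auth('http://user:pass@example.com/x')
#   ('http://example.com/x', 'Basic dXNlcjpwYXNz')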


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
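
# Illustrative usage (a sketch): order is preserved, later duplicates dropped:
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]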


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        return process_communicate_or_kill(self, *args, **kwargs)


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    assert type(s) == str
    return s


def decodeFilename(b, for_subprocess=False):
    return b


def encodeArgument(s):
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return s if isinstance(s, str) else s.decode('ascii')


def decodeArgument(b):
    return b


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
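
# Illustrative usage (a sketch): leading zero fields are omitted, and msec=True
# appends milliseconds:
#   >>> formatSeconds(3661)
#   '1:01:01'
#   >>> formatSeconds(61.5, msec=True)
#   '1:01.500'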


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        try:
            ssl_context.load_verify_locations(cadata=cert)
        except ssl.SSLError:
            pass


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        if has_certifi and 'no-certifi' not in params.get('compat_opts', []):
            context.load_verify_locations(cafile=certifi.where())
        else:
            try:
                context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
            except ssl.SSLError:
                # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
                if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                    # Create a new context to discard any certificates that were already loaded
                    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
                    context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
                    for storename in ('CA', 'ROOT'):
                        _ssl_load_windows_store_certs(context, storename)
                context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp/issues?q= , '
           'filling out the appropriate issue template. '
           'Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super().__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super().__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super().__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super().__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes')
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super().__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise OSError(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except OSError as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise OSError('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        hc.source_address = (source_address, 0)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'}
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return compat_brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        if 'Accept-encoding' not in req.headers:
            req.add_header('Accept-encoding', ', '.join(SUPPORTED_ENCODINGS))

        req.headers = handle_youtubedl_headers(req.headers)

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except OSError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except OSError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = compat_urllib_request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure:
                    secure = 'TRUE'
                else:
                    secure = 'FALSE'
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                else:
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                else:
                    expires = ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS}
        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True)


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                          # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                        # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|               # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                         # optional space
                (?P<sign>\+|-)                               # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})   # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        date_format = f'%Y-%m-%d{delimiter}%H:%M:%S'
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        pass
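
# Illustrative usage (a sketch): the timezone offset is applied before the
# conversion to a UNIX timestamp:
#   >>> parse_iso8601('1970-01-01T00:00:00Z')
#   0
#   >>> parse_iso8601('1970-01-01T01:00:00+01:00')
#   0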


def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
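
# Illustrative usage (a sketch): various human-readable forms normalize to
# YYYYMMDD:
#   >>> unified_strdate('December 21, 2010')
#   '20101221'
#   >>> unified_strdate('1968-12-10')
#   '19681210'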


def unified_timestamp(date_str, day_first=True):
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600


def determine_ext(url, default_ext='unknown_video'):
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
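
# Illustrative usage (a sketch with made-up URLs): a known extension survives a
# trailing '/?query', and unrecognizable URLs fall back to default_ext:
#   >>> determine_ext('http://example.com/foo/bar.mp4/?download')
#   'mp4'
#   >>> determine_ext('http://example.com/watch')
#   'unknown_video'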
1732
1733
1734 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1735 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1736
1737
1738 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1739 """
1740 Return a datetime object from a string in the format YYYYMMDD or
1741 (now|today|yesterday|date)[+-][0-9]+(microsecond|second|minute|hour|day|week|month|year)(s)?
1742
1743 format: string date format used to parse date_str into a datetime object
1744 precision: round the time portion of a datetime object.
1745 auto|microsecond|second|minute|hour|day.
1746 auto: round to the unit provided in date_str (if applicable).
1747 """
1748 auto_precision = False
1749 if precision == 'auto':
1750 auto_precision = True
1751 precision = 'microsecond'
1752 today = datetime_round(datetime.datetime.utcnow(), precision)
1753 if date_str in ('now', 'today'):
1754 return today
1755 if date_str == 'yesterday':
1756 return today - datetime.timedelta(days=1)
1757 match = re.match(
1758 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
1759 date_str)
1760 if match is not None:
1761 start_time = datetime_from_str(match.group('start'), precision, format)
1762 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1763 unit = match.group('unit')
1764 if unit == 'month' or unit == 'year':
1765 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1766 unit = 'day'
1767 else:
1768 if unit == 'week':
1769 unit = 'day'
1770 time *= 7
1771 delta = datetime.timedelta(**{unit + 's': time})
1772 new_date = start_time + delta
1773 if auto_precision:
1774 return datetime_round(new_date, unit)
1775 return new_date
1776
1777 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1778
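# Illustrative use of datetime_from_str (comment-only sketch; relative
# results depend on the current UTC time, so no exact values are shown):
#
#   datetime_from_str('now-1week')   # one week ago, rounded to the day
#   datetime_from_str('today+2months', precision='day')
#   datetime_from_str('20200229')    # datetime.datetime(2020, 2, 29, 0, 0)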
1779
1780 def date_from_str(date_str, format='%Y%m%d', strict=False):
1781 """
1782 Return a datetime object from a string in the format YYYYMMDD or
1783 (now|today|yesterday|date)[+-][0-9]+(microsecond|second|minute|hour|day|week|month|year)(s)?
1784
1785 If "strict", only (now|today)[+-][0-9]+(day|week|month|year)(s)? is allowed
1786
1787 format: string date format used to parse date_str into a datetime object
1788 """
1789 if strict and not re.fullmatch(r'\d{8}|(now|today)[+-]\d+(day|week|month|year)(s)?', date_str):
1790 raise ValueError(f'Invalid date format {date_str}')
1791 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1792
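# Illustrative use of date_from_str (comment-only sketch):
#
#   date_from_str('now-3days', strict=True)       # the date three days ago
#   date_from_str('yesterday-1day', strict=True)  # raises ValueError in strict mode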
1793
1794 def datetime_add_months(dt, months):
1795 """Increment/Decrement a datetime object by months."""
1796 month = dt.month + months - 1
1797 year = dt.year + month // 12
1798 month = month % 12 + 1
1799 day = min(dt.day, calendar.monthrange(year, month)[1])
1800 return dt.replace(year, month, day)
1801
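# Illustrative use of datetime_add_months (comment-only sketch); note how the
# day is clamped to the length of the target month:
#
#   datetime_add_months(datetime.datetime(2020, 1, 31), 1)   # 2020-02-29 00:00
#   datetime_add_months(datetime.datetime(2020, 3, 15), -3)  # 2019-12-15 00:00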
1802
1803 def datetime_round(dt, precision='day'):
1804 """
1805 Round a datetime object's time to a specific precision
1806 """
1807 if precision == 'microsecond':
1808 return dt
1809
1810 unit_seconds = {
1811 'day': 86400,
1812 'hour': 3600,
1813 'minute': 60,
1814 'second': 1,
1815 }
1816 roundto = lambda x, n: ((x + n / 2) // n) * n
1817 timestamp = calendar.timegm(dt.timetuple())
1818 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1819
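# Illustrative use of datetime_round (comment-only sketch); values are
# rounded to the nearest unit, not truncated:
#
#   datetime_round(datetime.datetime(2020, 1, 1, 13, 0), 'day')   # 2020-01-02 00:00
#   datetime_round(datetime.datetime(2020, 1, 1, 11, 59), 'day')  # 2020-01-01 00:00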
1820
1821 def hyphenate_date(date_str):
1822 """
1823 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1824 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1825 if match is not None:
1826 return '-'.join(match.groups())
1827 else:
1828 return date_str
1829
1830
1831 class DateRange:
1832 """Represents a time interval between two dates"""
1833
1834 def __init__(self, start=None, end=None):
1835 """start and end must be strings in the format accepted by date"""
1836 if start is not None:
1837 self.start = date_from_str(start, strict=True)
1838 else:
1839 self.start = datetime.datetime.min.date()
1840 if end is not None:
1841 self.end = date_from_str(end, strict=True)
1842 else:
1843 self.end = datetime.datetime.max.date()
1844 if self.start > self.end:
1845 raise ValueError('Date range: "%s": the start date must be before the end date' % self)
1846
1847 @classmethod
1848 def day(cls, day):
1849 """Returns a range that only contains the given day"""
1850 return cls(day, day)
1851
1852 def __contains__(self, date):
1853 """Check if the date is in the range"""
1854 if not isinstance(date, datetime.date):
1855 date = date_from_str(date)
1856 return self.start <= date <= self.end
1857
1858 def __str__(self):
1859 return f'{self.start.isoformat()} - {self.end.isoformat()}'
1860
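# Illustrative use of DateRange (comment-only sketch):
#
#   '20200101' in DateRange('20191231', '20200102')     # True
#   datetime.date(2021, 5, 1) in DateRange('20210101')  # True (no end date)
#   DateRange.day('20200101')  # a range containing only 2020-01-01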
1861
1862 def platform_name():
1863 """ Returns the platform name as a compat_str """
1864 res = platform.platform()
1865 if isinstance(res, bytes):
1866 res = res.decode(preferredencoding())
1867
1868 assert isinstance(res, compat_str)
1869 return res
1870
1871
1872 def get_windows_version():
1873 ''' Get Windows version. None if it's not running on Windows '''
1874 if compat_os_name == 'nt':
1875 return version_tuple(platform.win32_ver()[1])
1876 else:
1877 return None
1878
1879
1880 def write_string(s, out=None, encoding=None):
1881 if out is None:
1882 out = sys.stderr
1883 assert type(s) == compat_str
1884
1885 if 'b' in getattr(out, 'mode', ''):
1886 byt = s.encode(encoding or preferredencoding(), 'ignore')
1887 out.write(byt)
1888 elif hasattr(out, 'buffer'):
1889 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1890 byt = s.encode(enc, 'ignore')
1891 out.buffer.write(byt)
1892 else:
1893 out.write(s)
1894 out.flush()
1895
1896
1897 def bytes_to_intlist(bs):
1898 if not bs:
1899 return []
1900 if isinstance(bs[0], int): # Python 3
1901 return list(bs)
1902 else:
1903 return [ord(c) for c in bs]
1904
1905
1906 def intlist_to_bytes(xs):
1907 if not xs:
1908 return b''
1909 return compat_struct_pack('%dB' % len(xs), *xs)
1910
1911
1912 class LockingUnsupportedError(IOError):
1913 msg = 'File locking is not supported on this platform'
1914
1915 def __init__(self):
1916 super().__init__(self.msg)
1917
1918
1919 # Cross-platform file locking
1920 if sys.platform == 'win32':
1921 import ctypes.wintypes
1922 import msvcrt
1923
1924 class OVERLAPPED(ctypes.Structure):
1925 _fields_ = [
1926 ('Internal', ctypes.wintypes.LPVOID),
1927 ('InternalHigh', ctypes.wintypes.LPVOID),
1928 ('Offset', ctypes.wintypes.DWORD),
1929 ('OffsetHigh', ctypes.wintypes.DWORD),
1930 ('hEvent', ctypes.wintypes.HANDLE),
1931 ]
1932
1933 kernel32 = ctypes.windll.kernel32
1934 LockFileEx = kernel32.LockFileEx
1935 LockFileEx.argtypes = [
1936 ctypes.wintypes.HANDLE, # hFile
1937 ctypes.wintypes.DWORD, # dwFlags
1938 ctypes.wintypes.DWORD, # dwReserved
1939 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1940 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1941 ctypes.POINTER(OVERLAPPED) # Overlapped
1942 ]
1943 LockFileEx.restype = ctypes.wintypes.BOOL
1944 UnlockFileEx = kernel32.UnlockFileEx
1945 UnlockFileEx.argtypes = [
1946 ctypes.wintypes.HANDLE, # hFile
1947 ctypes.wintypes.DWORD, # dwReserved
1948 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1949 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1950 ctypes.POINTER(OVERLAPPED) # Overlapped
1951 ]
1952 UnlockFileEx.restype = ctypes.wintypes.BOOL
1953 whole_low = 0xffffffff
1954 whole_high = 0x7fffffff
1955
1956 def _lock_file(f, exclusive, block):
1957 overlapped = OVERLAPPED()
1958 overlapped.Offset = 0
1959 overlapped.OffsetHigh = 0
1960 overlapped.hEvent = 0
1961 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1962
1963 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
1964 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
1965 0, whole_low, whole_high, f._lock_file_overlapped_p):
1966 raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())
1967
1968 def _unlock_file(f):
1969 assert f._lock_file_overlapped_p
1970 handle = msvcrt.get_osfhandle(f.fileno())
1971 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
1972 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1973
1974 else:
1975 try:
1976 import fcntl
1977
1978 def _lock_file(f, exclusive, block):
1979 flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
1980 if not block:
1981 flags |= fcntl.LOCK_NB
1982 try:
1983 fcntl.flock(f, flags)
1984 except BlockingIOError:
1985 raise
1986 except OSError: # AOSP does not have flock()
1987 fcntl.lockf(f, flags)
1988
1989 def _unlock_file(f):
1990 try:
1991 fcntl.flock(f, fcntl.LOCK_UN)
1992 except OSError:
1993 fcntl.lockf(f, fcntl.LOCK_UN)
1994
1995 except ImportError:
1996
1997 def _lock_file(f, exclusive, block):
1998 raise LockingUnsupportedError()
1999
2000 def _unlock_file(f):
2001 raise LockingUnsupportedError()
2002
2003
2004 class locked_file:
2005 locked = False
2006
2007 def __init__(self, filename, mode, block=True, encoding=None):
2008 if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}:
2009 raise NotImplementedError(mode)
2010 self.mode, self.block = mode, block
2011
2012 writable = any(f in mode for f in 'wax+')
2013 readable = any(f in mode for f in 'r+')
2014 flags = functools.reduce(operator.ior, (
2015 getattr(os, 'O_CLOEXEC', 0), # UNIX only
2016 getattr(os, 'O_BINARY', 0), # Windows only
2017 getattr(os, 'O_NOINHERIT', 0), # Windows only
2018 os.O_CREAT if writable else 0, # O_TRUNC only after locking
2019 os.O_APPEND if 'a' in mode else 0,
2020 os.O_EXCL if 'x' in mode else 0,
2021 os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY,
2022 ))
2023
2024 self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding)
2025
2026 def __enter__(self):
2027 exclusive = 'r' not in self.mode
2028 try:
2029 _lock_file(self.f, exclusive, self.block)
2030 self.locked = True
2031 except OSError:
2032 self.f.close()
2033 raise
2034 if 'w' in self.mode:
2035 self.f.truncate()
2036 return self
2037
2038 def unlock(self):
2039 if not self.locked:
2040 return
2041 try:
2042 _unlock_file(self.f)
2043 finally:
2044 self.locked = False
2045
2046 def __exit__(self, *_):
2047 try:
2048 self.unlock()
2049 finally:
2050 self.f.close()
2051
2052 open = __enter__
2053 close = __exit__
2054
2055 def __getattr__(self, attr):
2056 return getattr(self.f, attr)
2057
2058 def __iter__(self):
2059 return iter(self.f)
2060
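# Typical use of locked_file (comment-only sketch; 'archive.txt' is a
# hypothetical filename):
#
#   # raises in __enter__ if another process already holds the lock,
#   # since block=False requests a non-blocking lock
#   with locked_file('archive.txt', 'a', block=False) as f:
#       f.write('some entry\n')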
2061
2062 def get_filesystem_encoding():
2063 encoding = sys.getfilesystemencoding()
2064 return encoding if encoding is not None else 'utf-8'
2065
2066
2067 def shell_quote(args):
2068 quoted_args = []
2069 encoding = get_filesystem_encoding()
2070 for a in args:
2071 if isinstance(a, bytes):
2072 # We may get a filename encoded with 'encodeFilename'
2073 a = a.decode(encoding)
2074 quoted_args.append(compat_shlex_quote(a))
2075 return ' '.join(quoted_args)
2076
2077
2078 def smuggle_url(url, data):
2079 """ Pass additional data in a URL for internal use. """
2080
2081 url, idata = unsmuggle_url(url, {})
2082 data.update(idata)
2083 sdata = compat_urllib_parse_urlencode(
2084 {'__youtubedl_smuggle': json.dumps(data)})
2085 return url + '#' + sdata
2086
2087
2088 def unsmuggle_url(smug_url, default=None):
2089 if '#__youtubedl_smuggle' not in smug_url:
2090 return smug_url, default
2091 url, _, sdata = smug_url.rpartition('#')
2092 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2093 data = json.loads(jsond)
2094 return url, data
2095
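# Illustrative round trip (comment-only sketch; the 'referer' key is only an
# example of internal data):
#
#   url = smuggle_url('http://example.com/video', {'referer': 'http://example.com'})
#   unsmuggle_url(url)  # ('http://example.com/video', {'referer': 'http://example.com'})
#   unsmuggle_url('http://example.com/plain', {})  # ('http://example.com/plain', {})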
2096
2097 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2098 """ Formats numbers with decimal sufixes like K, M, etc """
2099 num, factor = float_or_none(num), float(factor)
2100 if num is None or num < 0:
2101 return None
2102 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2103 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2104 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2105 if factor == 1024:
2106 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2107 converted = num / (factor ** exponent)
2108 return fmt % (converted, suffix)
2109
2110
2111 def format_bytes(bytes):
2112 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2113
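# Illustrative use (comment-only sketch):
#
#   format_decimal_suffix(123456, '%.1f%s')  # '123.5k'
#   format_bytes(1000)                       # '1000.00B'
#   format_bytes(None)                       # 'N/A'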
2114
2115 def lookup_unit_table(unit_table, s):
2116 units_re = '|'.join(re.escape(u) for u in unit_table)
2117 m = re.match(
2118 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2119 if not m:
2120 return None
2121 num_str = m.group('num').replace(',', '.')
2122 mult = unit_table[m.group('unit')]
2123 return int(float(num_str) * mult)
2124
2125
2126 def parse_filesize(s):
2127 if s is None:
2128 return None
2129
2130 # The lower-case forms are of course incorrect and unofficial,
2131 # but we support those too
2132 _UNIT_TABLE = {
2133 'B': 1,
2134 'b': 1,
2135 'bytes': 1,
2136 'KiB': 1024,
2137 'KB': 1000,
2138 'kB': 1024,
2139 'Kb': 1000,
2140 'kb': 1000,
2141 'kilobytes': 1000,
2142 'kibibytes': 1024,
2143 'MiB': 1024 ** 2,
2144 'MB': 1000 ** 2,
2145 'mB': 1024 ** 2,
2146 'Mb': 1000 ** 2,
2147 'mb': 1000 ** 2,
2148 'megabytes': 1000 ** 2,
2149 'mebibytes': 1024 ** 2,
2150 'GiB': 1024 ** 3,
2151 'GB': 1000 ** 3,
2152 'gB': 1024 ** 3,
2153 'Gb': 1000 ** 3,
2154 'gb': 1000 ** 3,
2155 'gigabytes': 1000 ** 3,
2156 'gibibytes': 1024 ** 3,
2157 'TiB': 1024 ** 4,
2158 'TB': 1000 ** 4,
2159 'tB': 1024 ** 4,
2160 'Tb': 1000 ** 4,
2161 'tb': 1000 ** 4,
2162 'terabytes': 1000 ** 4,
2163 'tebibytes': 1024 ** 4,
2164 'PiB': 1024 ** 5,
2165 'PB': 1000 ** 5,
2166 'pB': 1024 ** 5,
2167 'Pb': 1000 ** 5,
2168 'pb': 1000 ** 5,
2169 'petabytes': 1000 ** 5,
2170 'pebibytes': 1024 ** 5,
2171 'EiB': 1024 ** 6,
2172 'EB': 1000 ** 6,
2173 'eB': 1024 ** 6,
2174 'Eb': 1000 ** 6,
2175 'eb': 1000 ** 6,
2176 'exabytes': 1000 ** 6,
2177 'exbibytes': 1024 ** 6,
2178 'ZiB': 1024 ** 7,
2179 'ZB': 1000 ** 7,
2180 'zB': 1024 ** 7,
2181 'Zb': 1000 ** 7,
2182 'zb': 1000 ** 7,
2183 'zettabytes': 1000 ** 7,
2184 'zebibytes': 1024 ** 7,
2185 'YiB': 1024 ** 8,
2186 'YB': 1000 ** 8,
2187 'yB': 1024 ** 8,
2188 'Yb': 1000 ** 8,
2189 'yb': 1000 ** 8,
2190 'yottabytes': 1000 ** 8,
2191 'yobibytes': 1024 ** 8,
2192 }
2193
2194 return lookup_unit_table(_UNIT_TABLE, s)
2195
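# Illustrative use of parse_filesize (comment-only sketch):
#
#   parse_filesize('1.5GB')   # 1500000000 (decimal unit)
#   parse_filesize('512KiB')  # 524288 (binary unit)
#   parse_filesize('foo')     # None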
2196
2197 def parse_count(s):
2198 if s is None:
2199 return None
2200
2201 s = re.sub(r'^[^\d]+\s', '', s).strip()
2202
2203 if re.match(r'^[\d,.]+$', s):
2204 return str_to_int(s)
2205
2206 _UNIT_TABLE = {
2207 'k': 1000,
2208 'K': 1000,
2209 'm': 1000 ** 2,
2210 'M': 1000 ** 2,
2211 'kk': 1000 ** 2,
2212 'KK': 1000 ** 2,
2213 'b': 1000 ** 3,
2214 'B': 1000 ** 3,
2215 }
2216
2217 ret = lookup_unit_table(_UNIT_TABLE, s)
2218 if ret is not None:
2219 return ret
2220
2221 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2222 if mobj:
2223 return str_to_int(mobj.group(1))
2224
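# Illustrative use of parse_count (comment-only sketch):
#
#   parse_count('1.1M')         # 1100000
#   parse_count('1,234 views')  # 1234
#   parse_count('no views')     # None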
2225
2226 def parse_resolution(s, *, lenient=False):
2227 if s is None:
2228 return {}
2229
2230 if lenient:
2231 mobj = re.search(r'(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)', s)
2232 else:
2233 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2234 if mobj:
2235 return {
2236 'width': int(mobj.group('w')),
2237 'height': int(mobj.group('h')),
2238 }
2239
2240 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2241 if mobj:
2242 return {'height': int(mobj.group(1))}
2243
2244 mobj = re.search(r'\b([48])[kK]\b', s)
2245 if mobj:
2246 return {'height': int(mobj.group(1)) * 540}
2247
2248 return {}
2249
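# Illustrative use of parse_resolution (comment-only sketch):
#
#   parse_resolution('1920x1080')  # {'width': 1920, 'height': 1080}
#   parse_resolution('720p')       # {'height': 720}
#   parse_resolution('4k')         # {'height': 2160}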
2250
2251 def parse_bitrate(s):
2252 if not isinstance(s, compat_str):
2253 return
2254 mobj = re.search(r'\b(\d+)\s*kbps', s)
2255 if mobj:
2256 return int(mobj.group(1))
2257
2258
2259 def month_by_name(name, lang='en'):
2260 """ Return the number of a month by (locale-independently) English name """
2261
2262 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2263
2264 try:
2265 return month_names.index(name) + 1
2266 except ValueError:
2267 return None
2268
2269
2270 def month_by_abbreviation(abbrev):
2271 """ Return the number of a month by (locale-independently) English
2272 abbreviations """
2273
2274 try:
2275 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2276 except ValueError:
2277 return None
2278
2279
2280 def fix_xml_ampersands(xml_str):
2281 """Replace all the '&' by '&amp;' in XML"""
2282 return re.sub(
2283 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2284 '&amp;',
2285 xml_str)
2286
2287
2288 def setproctitle(title):
2289 assert isinstance(title, compat_str)
2290
2291 # ctypes in Jython is not complete
2292 # http://bugs.jython.org/issue2148
2293 if sys.platform.startswith('java'):
2294 return
2295
2296 try:
2297 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2298 except OSError:
2299 return
2300 except TypeError:
2301 # LoadLibrary in Windows Python 2.7.13 only expects
2302 # a bytestring, but since unicode_literals turns
2303 # every string into a unicode string, it fails.
2304 return
2305 title_bytes = title.encode('utf-8')
2306 buf = ctypes.create_string_buffer(len(title_bytes))
2307 buf.value = title_bytes
2308 try:
2309 libc.prctl(15, buf, 0, 0, 0)
2310 except AttributeError:
2311 return # Strange libc, just skip this
2312
2313
2314 def remove_start(s, start):
2315 return s[len(start):] if s is not None and s.startswith(start) else s
2316
2317
2318 def remove_end(s, end):
2319 return s[:-len(end)] if s is not None and s.endswith(end) else s
2320
2321
2322 def remove_quotes(s):
2323 if s is None or len(s) < 2:
2324 return s
2325 for quote in ('"', "'", ):
2326 if s[0] == quote and s[-1] == quote:
2327 return s[1:-1]
2328 return s
2329
2330
2331 def get_domain(url):
2332 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2333 return domain.group('domain') if domain else None
2334
2335
2336 def url_basename(url):
2337 path = compat_urlparse.urlparse(url).path
2338 return path.strip('/').split('/')[-1]
2339
2340
2341 def base_url(url):
2342 return re.match(r'https?://[^?#&]+/', url).group()
2343
2344
2345 def urljoin(base, path):
2346 if isinstance(path, bytes):
2347 path = path.decode('utf-8')
2348 if not isinstance(path, compat_str) or not path:
2349 return None
2350 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2351 return path
2352 if isinstance(base, bytes):
2353 base = base.decode('utf-8')
2354 if not isinstance(base, compat_str) or not re.match(
2355 r'^(?:https?:)?//', base):
2356 return None
2357 return compat_urlparse.urljoin(base, path)
2358
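# Illustrative use of urljoin (comment-only sketch):
#
#   urljoin('https://example.com/a/', 'b.mp4')                 # 'https://example.com/a/b.mp4'
#   urljoin('https://example.com', '//cdn.example.com/c.mp4')  # '//cdn.example.com/c.mp4'
#   urljoin(None, 'b.mp4')                                     # None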
2359
2360 class HEADRequest(compat_urllib_request.Request):
2361 def get_method(self):
2362 return 'HEAD'
2363
2364
2365 class PUTRequest(compat_urllib_request.Request):
2366 def get_method(self):
2367 return 'PUT'
2368
2369
2370 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2371 if get_attr and v is not None:
2372 v = getattr(v, get_attr, None)
2373 try:
2374 return int(v) * invscale // scale
2375 except (ValueError, TypeError, OverflowError):
2376 return default
2377
2378
2379 def str_or_none(v, default=None):
2380 return default if v is None else compat_str(v)
2381
2382
2383 def str_to_int(int_str):
2384 """ A more relaxed version of int_or_none """
2385 if isinstance(int_str, int):
2386 return int_str
2387 elif isinstance(int_str, compat_str):
2388 int_str = re.sub(r'[,\.\+]', '', int_str)
2389 return int_or_none(int_str)
2390
2391
2392 def float_or_none(v, scale=1, invscale=1, default=None):
2393 if v is None:
2394 return default
2395 try:
2396 return float(v) * invscale / scale
2397 except (ValueError, TypeError):
2398 return default
2399
2400
2401 def bool_or_none(v, default=None):
2402 return v if isinstance(v, bool) else default
2403
2404
2405 def strip_or_none(v, default=None):
2406 return v.strip() if isinstance(v, compat_str) else default
2407
2408
2409 def url_or_none(url):
2410 if not url or not isinstance(url, compat_str):
2411 return None
2412 url = url.strip()
2413 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2414
2415
2416 def request_to_url(req):
2417 if isinstance(req, compat_urllib_request.Request):
2418 return req.get_full_url()
2419 else:
2420 return req
2421
2422
2423 def strftime_or_none(timestamp, date_format, default=None):
2424 datetime_object = None
2425 try:
2426 if isinstance(timestamp, (int, float)): # unix timestamp
2427 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2428 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2429 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2430 return datetime_object.strftime(date_format)
2431 except (ValueError, TypeError, AttributeError):
2432 return default
2433
2434
2435 def parse_duration(s):
2436 if not isinstance(s, str):
2437 return None
2438 s = s.strip()
2439 if not s:
2440 return None
2441
2442 days, hours, mins, secs, ms = [None] * 5
2443 m = re.match(r'''(?x)
2444 (?P<before_secs>
2445 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2446 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2447 (?P<ms>[.:][0-9]+)?Z?$
2448 ''', s)
2449 if m:
2450 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2451 else:
2452 m = re.match(
2453 r'''(?ix)(?:P?
2454 (?:
2455 [0-9]+\s*y(?:ears?)?,?\s*
2456 )?
2457 (?:
2458 [0-9]+\s*m(?:onths?)?,?\s*
2459 )?
2460 (?:
2461 [0-9]+\s*w(?:eeks?)?,?\s*
2462 )?
2463 (?:
2464 (?P<days>[0-9]+)\s*d(?:ays?)?,?\s*
2465 )?
2466 T)?
2467 (?:
2468 (?P<hours>[0-9]+)\s*h(?:ours?)?,?\s*
2469 )?
2470 (?:
2471 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?,?\s*
2472 )?
2473 (?:
2474 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2475 )?Z?$''', s)
2476 if m:
2477 days, hours, mins, secs, ms = m.groups()
2478 else:
2479 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2480 if m:
2481 hours, mins = m.groups()
2482 else:
2483 return None
2484
2485 duration = 0
2486 if secs:
2487 duration += float(secs)
2488 if mins:
2489 duration += float(mins) * 60
2490 if hours:
2491 duration += float(hours) * 60 * 60
2492 if days:
2493 duration += float(days) * 24 * 60 * 60
2494 if ms:
2495 duration += float(ms.replace(':', '.'))
2496 return duration
2497
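# Illustrative use of parse_duration (comment-only sketch):
#
#   parse_duration('1:30')      # 90.0
#   parse_duration('02:03:04')  # 7384.0
#   parse_duration('PT1H30M')   # 5400.0 (ISO 8601 style)
#   parse_duration('3 min')     # 180.0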
2498
2499 def prepend_extension(filename, ext, expected_real_ext=None):
2500 name, real_ext = os.path.splitext(filename)
2501 return (
2502 f'{name}.{ext}{real_ext}'
2503 if not expected_real_ext or real_ext[1:] == expected_real_ext
2504 else f'{filename}.{ext}')
2505
2506
2507 def replace_extension(filename, ext, expected_real_ext=None):
2508 name, real_ext = os.path.splitext(filename)
2509 return '{}.{}'.format(
2510 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2511 ext)
2512
2513
2514 def check_executable(exe, args=[]):
2515 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2516 args can be a list of arguments for a short output (like -version) """
2517 try:
2518 Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
2519 except OSError:
2520 return False
2521 return exe
2522
2523
2524 def _get_exe_version_output(exe, args, *, to_screen=None):
2525 if to_screen:
2526 to_screen(f'Checking exe version: {shell_quote([exe] + args)}')
2527 try:
2528 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2529 # SIGTTOU if yt-dlp is run in the background.
2530 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2531 out, _ = Popen(
2532 [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2533 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
2534 except OSError:
2535 return False
2536 if isinstance(out, bytes): # Popen output is bytes when no encoding is specified
2537 out = out.decode('ascii', 'ignore')
2538 return out
2539
2540
2541 def detect_exe_version(output, version_re=None, unrecognized='present'):
2542 assert isinstance(output, compat_str)
2543 if version_re is None:
2544 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2545 m = re.search(version_re, output)
2546 if m:
2547 return m.group(1)
2548 else:
2549 return unrecognized
2550
2551
2552 def get_exe_version(exe, args=['--version'],
2553 version_re=None, unrecognized='present'):
2554 """ Returns the version of the specified executable,
2555 or False if the executable is not present """
2556 out = _get_exe_version_output(exe, args)
2557 return detect_exe_version(out, version_re, unrecognized) if out else False
2558
2559
2560 class LazyList(collections.abc.Sequence):
2561 ''' Lazy immutable list from an iterable
2562 Note that slices of a LazyList are lists and not LazyList'''
2563
2564 class IndexError(IndexError):
2565 pass
2566
2567 def __init__(self, iterable, *, reverse=False, _cache=None):
2568 self.__iterable = iter(iterable)
2569 self.__cache = [] if _cache is None else _cache
2570 self.__reversed = reverse
2571
2572 def __iter__(self):
2573 if self.__reversed:
2574 # We need to consume the entire iterable to iterate in reverse
2575 yield from self.exhaust()
2576 return
2577 yield from self.__cache
2578 for item in self.__iterable:
2579 self.__cache.append(item)
2580 yield item
2581
2582 def __exhaust(self):
2583 self.__cache.extend(self.__iterable)
2584 # Discard the emptied iterable to make it pickle-able
2585 self.__iterable = []
2586 return self.__cache
2587
2588 def exhaust(self):
2589 ''' Evaluate the entire iterable '''
2590 return self.__exhaust()[::-1 if self.__reversed else 1]
2591
2592 @staticmethod
2593 def __reverse_index(x):
2594 return None if x is None else -(x + 1)
2595
2596 def __getitem__(self, idx):
2597 if isinstance(idx, slice):
2598 if self.__reversed:
2599 idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
2600 start, stop, step = idx.start, idx.stop, idx.step or 1
2601 elif isinstance(idx, int):
2602 if self.__reversed:
2603 idx = self.__reverse_index(idx)
2604 start, stop, step = idx, idx, 0
2605 else:
2606 raise TypeError('indices must be integers or slices')
2607 if ((start or 0) < 0 or (stop or 0) < 0
2608 or (start is None and step < 0)
2609 or (stop is None and step > 0)):
2610 # We need to consume the entire iterable to be able to slice from the end
2611 # Obviously, never use this with infinite iterables
2612 self.__exhaust()
2613 try:
2614 return self.__cache[idx]
2615 except IndexError as e:
2616 raise self.IndexError(e) from e
2617 n = max(start or 0, stop or 0) - len(self.__cache) + 1
2618 if n > 0:
2619 self.__cache.extend(itertools.islice(self.__iterable, n))
2620 try:
2621 return self.__cache[idx]
2622 except IndexError as e:
2623 raise self.IndexError(e) from e
2624
2625 def __bool__(self):
2626 try:
2627 self[-1] if self.__reversed else self[0]
2628 except self.IndexError:
2629 return False
2630 return True
2631
2632 def __len__(self):
2633 self.__exhaust()
2634 return len(self.__cache)
2635
2636 def __reversed__(self):
2637 return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)
2638
2639 def __copy__(self):
2640 return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)
2641
2642 def __repr__(self):
2643 # repr and str should mimic a list. So we exhaust the iterable
2644 return repr(self.exhaust())
2645
2646 def __str__(self):
2647 return repr(self.exhaust())
2648
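# Illustrative use of LazyList (comment-only sketch); items are pulled from
# the iterable only as they are needed:
#
#   l = LazyList(itertools.count())  # safe as long as it is never exhausted
#   l[5]   # 5 - consumes only the first six items
#   l[:3]  # [0, 1, 2] - slices are plain lists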
2649
2650 class PagedList:
2651
2652 class IndexError(IndexError):
2653 pass
2654
2655 def __len__(self):
2656 # This is only useful for tests
2657 return len(self.getslice())
2658
2659 def __init__(self, pagefunc, pagesize, use_cache=True):
2660 self._pagefunc = pagefunc
2661 self._pagesize = pagesize
2662 self._pagecount = float('inf')
2663 self._use_cache = use_cache
2664 self._cache = {}
2665
2666 def getpage(self, pagenum):
2667 page_results = self._cache.get(pagenum)
2668 if page_results is None:
2669 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2670 if self._use_cache:
2671 self._cache[pagenum] = page_results
2672 return page_results
2673
2674 def getslice(self, start=0, end=None):
2675 return list(self._getslice(start, end))
2676
2677 def _getslice(self, start, end):
2678 raise NotImplementedError('This method must be implemented by subclasses')
2679
2680 def __getitem__(self, idx):
2681 assert self._use_cache, 'Indexing PagedList requires cache'
2682 if not isinstance(idx, int) or idx < 0:
2683 raise TypeError('indices must be non-negative integers')
2684 entries = self.getslice(idx, idx + 1)
2685 if not entries:
2686 raise self.IndexError()
2687 return entries[0]
2688
2689
2690 class OnDemandPagedList(PagedList):
2691 """Download pages until a page with less than maximum results"""
2692
2693 def _getslice(self, start, end):
2694 for pagenum in itertools.count(start // self._pagesize):
2695 firstid = pagenum * self._pagesize
2696 nextfirstid = pagenum * self._pagesize + self._pagesize
2697 if start >= nextfirstid:
2698 continue
2699
2700 startv = (
2701 start % self._pagesize
2702 if firstid <= start < nextfirstid
2703 else 0)
2704 endv = (
2705 ((end - 1) % self._pagesize) + 1
2706 if (end is not None and firstid <= end <= nextfirstid)
2707 else None)
2708
2709 try:
2710 page_results = self.getpage(pagenum)
2711 except Exception:
2712 self._pagecount = pagenum - 1
2713 raise
2714 if startv != 0 or endv is not None:
2715 page_results = page_results[startv:endv]
2716 yield from page_results
2717
2718 # A little optimization - if the current page is not "full", i.e. does
2719 # not contain page_size videos, then we can assume that this page
2720 # is the last one - there are no more ids on further pages -
2721 # i.e. no need to query again.
2722 if len(page_results) + startv < self._pagesize:
2723 break
2724
2725 # If we got the whole page, but the next page is not interesting,
2726 # break out early as well
2727 if end == nextfirstid:
2728 break
2729
2730
2731 class InAdvancePagedList(PagedList):
2732 """PagedList with total number of pages known in advance"""
2733
2734 def __init__(self, pagefunc, pagecount, pagesize):
2735 PagedList.__init__(self, pagefunc, pagesize, True)
2736 self._pagecount = pagecount
2737
2738 def _getslice(self, start, end):
2739 start_page = start // self._pagesize
2740 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2741 skip_elems = start - start_page * self._pagesize
2742 only_more = None if end is None else end - start
2743 for pagenum in range(start_page, end_page):
2744 page_results = self.getpage(pagenum)
2745 if skip_elems:
2746 page_results = page_results[skip_elems:]
2747 skip_elems = None
2748 if only_more is not None:
2749 if len(page_results) < only_more:
2750 only_more -= len(page_results)
2751 else:
2752 yield from page_results[:only_more]
2753 break
2754 yield from page_results
2755
2756
2757 def uppercase_escape(s):
2758 unicode_escape = codecs.getdecoder('unicode_escape')
2759 return re.sub(
2760 r'\\U[0-9a-fA-F]{8}',
2761 lambda m: unicode_escape(m.group(0))[0],
2762 s)
2763
2764
2765 def lowercase_escape(s):
2766 unicode_escape = codecs.getdecoder('unicode_escape')
2767 return re.sub(
2768 r'\\u[0-9a-fA-F]{4}',
2769 lambda m: unicode_escape(m.group(0))[0],
2770 s)
2771
2772
2773 def escape_rfc3986(s):
2774 """Escape non-ASCII characters as suggested by RFC 3986"""
2775 return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2776
2777
2778 def escape_url(url):
2779 """Escape URL as suggested by RFC 3986"""
2780 url_parsed = compat_urllib_parse_urlparse(url)
2781 return url_parsed._replace(
2782 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2783 path=escape_rfc3986(url_parsed.path),
2784 params=escape_rfc3986(url_parsed.params),
2785 query=escape_rfc3986(url_parsed.query),
2786 fragment=escape_rfc3986(url_parsed.fragment)
2787 ).geturl()
2788
2789
2790 def parse_qs(url):
2791 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2792
2793
2794 def read_batch_urls(batch_fd):
2795 def fixup(url):
2796 if not isinstance(url, compat_str):
2797 url = url.decode('utf-8', 'replace')
2798 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2799 for bom in BOM_UTF8:
2800 if url.startswith(bom):
2801 url = url[len(bom):]
2802 url = url.lstrip()
2803 if not url or url.startswith(('#', ';', ']')):
2804 return False
2805 # "#" cannot be stripped out since it is part of the URI
2806 # However, it can be safely stripped out when it follows a whitespace
2807 return re.split(r'\s#', url, 1)[0].rstrip()
2808
2809 with contextlib.closing(batch_fd) as fd:
2810 return [url for url in map(fixup, fd) if url]
2811
2812
2813 def urlencode_postdata(*args, **kargs):
2814 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2815
2816
2817 def update_url_query(url, query):
2818 if not query:
2819 return url
2820 parsed_url = compat_urlparse.urlparse(url)
2821 qs = compat_parse_qs(parsed_url.query)
2822 qs.update(query)
2823 return compat_urlparse.urlunparse(parsed_url._replace(
2824 query=compat_urllib_parse_urlencode(qs, True)))
2825
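# Illustrative use of update_url_query (comment-only sketch; the resulting
# parameter order follows dict insertion order):
#
#   update_url_query('http://example.com/path', {'quality': ['HD'], 'format': ['mp4']})
#   # 'http://example.com/path?quality=HD&format=mp4'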
2826
2827 def update_Request(req, url=None, data=None, headers={}, query={}):
2828 req_headers = req.headers.copy()
2829 req_headers.update(headers)
2830 req_data = data or req.data
2831 req_url = update_url_query(url or req.get_full_url(), query)
2832 req_get_method = req.get_method()
2833 if req_get_method == 'HEAD':
2834 req_type = HEADRequest
2835 elif req_get_method == 'PUT':
2836 req_type = PUTRequest
2837 else:
2838 req_type = compat_urllib_request.Request
2839 new_req = req_type(
2840 req_url, data=req_data, headers=req_headers,
2841 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2842 if hasattr(req, 'timeout'):
2843 new_req.timeout = req.timeout
2844 return new_req
2845
2846
2847 def _multipart_encode_impl(data, boundary):
2848 content_type = 'multipart/form-data; boundary=%s' % boundary
2849
2850 out = b''
2851 for k, v in data.items():
2852 out += b'--' + boundary.encode('ascii') + b'\r\n'
2853 if isinstance(k, compat_str):
2854 k = k.encode('utf-8')
2855 if isinstance(v, compat_str):
2856 v = v.encode('utf-8')
2857 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2858 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2859 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2860 if boundary.encode('ascii') in content:
2861 raise ValueError('Boundary overlaps with data')
2862 out += content
2863
2864 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2865
2866 return out, content_type
2867
2868
2869 def multipart_encode(data, boundary=None):
2870 '''
2871 Encode a dict to RFC 7578-compliant form-data
2872
2873 data:
2874 A dict where keys and values can be either Unicode or bytes-like
2875 objects.
2876 boundary:
2877 If specified, it must be a Unicode object and is used as the boundary.
2878 Otherwise a random boundary is generated.
2879
2880 Reference: https://tools.ietf.org/html/rfc7578
2881 '''
2882 has_specified_boundary = boundary is not None
2883
2884 while True:
2885 if boundary is None:
2886 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2887
2888 try:
2889 out, content_type = _multipart_encode_impl(data, boundary)
2890 break
2891 except ValueError:
2892 if has_specified_boundary:
2893 raise
2894 boundary = None
2895
2896 return out, content_type
2897
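# Illustrative use of multipart_encode (comment-only sketch; expected output
# derived from the implementation above):
#
#   multipart_encode({b'field': b'value'}, boundary='AAAAAA')[0]
#   # b'--AAAAAA\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--AAAAAA--\r\n'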
2898
2899 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
2900 for val in map(d.get, variadic(key_or_keys)):
2901 if val is not None and (val or not skip_false_values):
2902 return val
2903 return default
2904
2905
2906 def try_call(*funcs, expected_type=None, args=[], kwargs={}):
2907 for f in funcs:
2908 try:
2909 val = f(*args, **kwargs)
2910 except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError):
2911 pass
2912 else:
2913 if expected_type is None or isinstance(val, expected_type):
2914 return val
2915
2916
2917 def try_get(src, getter, expected_type=None):
2918 return try_call(*variadic(getter), args=(src,), expected_type=expected_type)
2919
2920
2921 def filter_dict(dct, cndn=lambda _, v: v is not None):
2922 return {k: v for k, v in dct.items() if cndn(k, v)}
2923
2924
2925 def merge_dicts(*dicts):
2926 merged = {}
2927 for a_dict in dicts:
2928 for k, v in a_dict.items():
2929 if (v is not None and k not in merged
2930 or isinstance(v, str) and merged[k] == ''):
2931 merged[k] = v
2932 return merged
2933
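# Illustrative use of merge_dicts (comment-only sketch); earlier dicts win,
# but an empty string may be upgraded by a later non-empty one:
#
#   merge_dicts({'a': 1, 'b': ''}, {'a': 2, 'b': 'x', 'c': 3})
#   # {'a': 1, 'b': 'x', 'c': 3}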
2934
2935 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2936 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2937
2938
2939 US_RATINGS = {
2940 'G': 0,
2941 'PG': 10,
2942 'PG-13': 13,
2943 'R': 16,
2944 'NC': 18,
2945 }
2946
2947
2948 TV_PARENTAL_GUIDELINES = {
2949 'TV-Y': 0,
2950 'TV-Y7': 7,
2951 'TV-G': 0,
2952 'TV-PG': 0,
2953 'TV-14': 14,
2954 'TV-MA': 17,
2955 }
2956
2957
2958 def parse_age_limit(s):
2959 if type(s) == int:
2960 return s if 0 <= s <= 21 else None
2961 if not isinstance(s, str):
2962 return None
2963 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
2964 if m:
2965 return int(m.group('age'))
2966 s = s.upper()
2967 if s in US_RATINGS:
2968 return US_RATINGS[s]
2969 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
2970 if m:
2971 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
2972 return None
2973
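# Illustrative use of parse_age_limit (comment-only sketch):
#
#   parse_age_limit('PG-13')  # 13
#   parse_age_limit('TV-MA')  # 17
#   parse_age_limit('18+')    # 18
#   parse_age_limit(99)       # None (outside the 0-21 range)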
2974
2975 def strip_jsonp(code):
2976 return re.sub(
2977 r'''(?sx)^
2978 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
2979 (?:\s*&&\s*(?P=func_name))?
2980 \s*\(\s*(?P<callback_data>.*)\);?
2981 \s*?(?://[^\n]*)*$''',
2982 r'\g<callback_data>', code)
2983
2984
2985 def js_to_json(code, vars={}):
2986 # vars is a dict of var, val pairs to substitute
2987 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
2988 SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*'
2989 INTEGER_TABLE = (
2990 (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16),
2991 (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8),
2992 )
2993
2994 def fix_kv(m):
2995 v = m.group(0)
2996 if v in ('true', 'false', 'null'):
2997 return v
2998 elif v in ('undefined', 'void 0'):
2999 return 'null'
3000 elif v.startswith(('/*', '//', '!')) or v == ',':
3001 return ""
3002
3003 if v[0] in ("'", '"'):
3004 v = re.sub(r'(?s)\\.|"', lambda m: {
3005 '"': '\\"',
3006 "\\'": "'",
3007 '\\\n': '',
3008 '\\x': '\\u00',
3009 }.get(m.group(0), m.group(0)), v[1:-1])
3010 else:
3011 for regex, base in INTEGER_TABLE:
3012 im = re.match(regex, v)
3013 if im:
3014 i = int(im.group(1), base)
3015 return '"%d":' % i if v.endswith(':') else '%d' % i
3016
3017 if v in vars:
3018 return vars[v]
3019
3020 return '"%s"' % v
3021
3022 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3023
3024 return re.sub(r'''(?sx)
3025 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3026 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3027 {comment}|,(?={skip}[\]}}])|
3028 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3029 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3030 [0-9]+(?={skip}:)|
3031 !+
3032 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3033
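# Illustrative use of js_to_json (comment-only sketch); keys are quoted,
# undefined becomes null and the trailing comma is dropped, so json.loads()
# can parse the result:
#
#   js_to_json("{'key': true, value: undefined,}")
#   # '{"key": true, "value": null}'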
3034
3035 def qualities(quality_ids):
3036 """ Get a numeric quality value out of a list of possible values """
3037 def q(qid):
3038 try:
3039 return quality_ids.index(qid)
3040 except ValueError:
3041 return -1
3042 return q
3043
3044
3045 POSTPROCESS_WHEN = {'pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
3046
3047
3048 DEFAULT_OUTTMPL = {
3049 'default': '%(title)s [%(id)s].%(ext)s',
3050 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3051 }
3052 OUTTMPL_TYPES = {
3053 'chapter': None,
3054 'subtitle': None,
3055 'thumbnail': None,
3056 'description': 'description',
3057 'annotation': 'annotations.xml',
3058 'infojson': 'info.json',
3059 'link': None,
3060 'pl_video': None,
3061 'pl_thumbnail': None,
3062 'pl_description': 'description',
3063 'pl_infojson': 'info.json',
3064 }
3065
3066 # As of [1], the format syntax is:
3067 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3068 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3069 STR_FORMAT_RE_TMPL = r'''(?x)
3070 (?<!%)(?P<prefix>(?:%%)*)
3071 %
3072 (?P<has_key>\((?P<key>{0})\))?
3073 (?P<format>
3074 (?P<conversion>[#0\-+ ]+)?
3075 (?P<min_width>\d+)?
3076 (?P<precision>\.\d+)?
3077 (?P<len_mod>[hlL])? # unused in python
3078 {1} # conversion type
3079 )
3080 '''
3081
3082
3083 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3084
3085
3086 def limit_length(s, length):
3087 """ Add ellipses to overly long strings """
3088 if s is None:
3089 return None
3090 ELLIPSES = '...'
3091 if len(s) > length:
3092 return s[:length - len(ELLIPSES)] + ELLIPSES
3093 return s
3094
3095
3096 def version_tuple(v):
3097 return tuple(int(e) for e in re.split(r'[-.]', v))
3098
3099
3100 def is_outdated_version(version, limit, assume_new=True):
3101 if not version:
3102 return not assume_new
3103 try:
3104 return version_tuple(version) < version_tuple(limit)
3105 except ValueError:
3106 return not assume_new
3107
3108
3109 def ytdl_is_updateable():
3110 """ Returns if yt-dlp can be updated with -U """
3111
3112 from .update import is_non_updateable
3113
3114 return not is_non_updateable()
3115
3116
3117 def args_to_str(args):
3118 # Get a short string representation for a subprocess command
3119 return ' '.join(compat_shlex_quote(a) for a in args)
3120
3121
3122 def error_to_compat_str(err):
3123 return str(err)
3124
3125
3126 def error_to_str(err):
3127 return f'{type(err).__name__}: {err}'
3128
3129
3130 def mimetype2ext(mt):
3131 if mt is None:
3132 return None
3133
3134 mt, _, params = mt.partition(';')
3135 mt = mt.strip()
3136
3137 FULL_MAP = {
3138 'audio/mp4': 'm4a',
3139 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3 here
3140 # since it's the most popular one
3141 'audio/mpeg': 'mp3',
3142 'audio/x-wav': 'wav',
3143 'audio/wav': 'wav',
3144 'audio/wave': 'wav',
3145 }
3146
3147 ext = FULL_MAP.get(mt)
3148 if ext is not None:
3149 return ext
3150
3151 SUBTYPE_MAP = {
3152 '3gpp': '3gp',
3153 'smptett+xml': 'tt',
3154 'ttaf+xml': 'dfxp',
3155 'ttml+xml': 'ttml',
3156 'x-flv': 'flv',
3157 'x-mp4-fragmented': 'mp4',
3158 'x-ms-sami': 'sami',
3159 'x-ms-wmv': 'wmv',
3160 'mpegurl': 'm3u8',
3161 'x-mpegurl': 'm3u8',
3162 'vnd.apple.mpegurl': 'm3u8',
3163 'dash+xml': 'mpd',
3164 'f4m+xml': 'f4m',
3165 'hds+xml': 'f4m',
3166 'vnd.ms-sstr+xml': 'ism',
3167 'quicktime': 'mov',
3168 'mp2t': 'ts',
3169 'x-wav': 'wav',
3170 'filmstrip+json': 'fs',
3171 'svg+xml': 'svg',
3172 }
3173
3174 _, _, subtype = mt.rpartition('/')
3175 ext = SUBTYPE_MAP.get(subtype.lower())
3176 if ext is not None:
3177 return ext
3178
3179 SUFFIX_MAP = {
3180 'json': 'json',
3181 'xml': 'xml',
3182 'zip': 'zip',
3183 'gzip': 'gz',
3184 }
3185
3186 _, _, suffix = subtype.partition('+')
3187 ext = SUFFIX_MAP.get(suffix)
3188 if ext is not None:
3189 return ext
3190
3191 return subtype.replace('+', '.')
3192
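# Illustrative use of mimetype2ext (comment-only sketch):
#
#   mimetype2ext('audio/mp4')                # 'm4a'
#   mimetype2ext('application/x-mpegURL')    # 'm3u8'
#   mimetype2ext('text/vtt; charset=UTF-8')  # 'vtt' (parameters are ignored)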
3193
3194 def ext2mimetype(ext_or_url):
3195 if not ext_or_url:
3196 return None
3197 if '.' not in ext_or_url:
3198 ext_or_url = f'file.{ext_or_url}'
3199 return mimetypes.guess_type(ext_or_url)[0]
3200
3201
3202 def parse_codecs(codecs_str):
3203 # http://tools.ietf.org/html/rfc6381
3204 if not codecs_str:
3205 return {}
3206 split_codecs = list(filter(None, map(
3207 str.strip, codecs_str.strip().strip(',').split(','))))
3208 vcodec, acodec, tcodec, hdr = None, None, None, None
3209 for full_codec in split_codecs:
3210 parts = full_codec.split('.')
3211 codec = parts[0].replace('0', '')
3212 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3213 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3214 if not vcodec:
3215 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3216 if codec in ('dvh1', 'dvhe'):
3217 hdr = 'DV'
3218 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3219 hdr = 'HDR10'
3220 elif full_codec.replace('0', '').startswith('vp9.2'):
3221 hdr = 'HDR10'
3222 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3223 if not acodec:
3224 acodec = full_codec
3225 elif codec in ('stpp', 'wvtt',):
3226 if not tcodec:
3227 tcodec = full_codec
3228 else:
3229 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
3230 if vcodec or acodec or tcodec:
3231 return {
3232 'vcodec': vcodec or 'none',
3233 'acodec': acodec or 'none',
3234 'dynamic_range': hdr,
3235 **({'tcodec': tcodec} if tcodec is not None else {}),
3236 }
3237 elif len(split_codecs) == 2:
3238 return {
3239 'vcodec': split_codecs[0],
3240 'acodec': split_codecs[1],
3241 }
3242 return {}
3243
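# Illustrative use of parse_codecs (comment-only sketch):
#
#   parse_codecs('avc1.77.30, mp4a.40.2')
#   # {'vcodec': 'avc1.77.30', 'acodec': 'mp4a.40.2', 'dynamic_range': None}
#   parse_codecs('dvh1.05.01')
#   # {'vcodec': 'dvh1.05.01', 'acodec': 'none', 'dynamic_range': 'DV'}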
3244
3245 def urlhandle_detect_ext(url_handle):
3246 getheader = url_handle.headers.get
3247
3248 cd = getheader('Content-Disposition')
3249 if cd:
3250 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3251 if m:
3252 e = determine_ext(m.group('filename'), default_ext=None)
3253 if e:
3254 return e
3255
3256 return mimetype2ext(getheader('Content-Type'))
3257
3258
3259 def encode_data_uri(data, mime_type):
3260 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3261
3262
3263 def age_restricted(content_limit, age_limit):
3264 """ Returns True iff the content should be blocked """
3265
3266 if age_limit is None: # No limit set
3267 return False
3268 if content_limit is None:
3269 return False # Content available for everyone
3270 return age_limit < content_limit
3271
3272
3273 def is_html(first_bytes):
3274 """ Detect whether a file contains HTML by examining its first bytes. """
3275
3276 BOMS = [
3277 (b'\xef\xbb\xbf', 'utf-8'),
3278 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3279 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3280 (b'\xff\xfe', 'utf-16-le'),
3281 (b'\xfe\xff', 'utf-16-be'),
3282 ]
3283 for bom, enc in BOMS:
3284 if first_bytes.startswith(bom):
3285 s = first_bytes[len(bom):].decode(enc, 'replace')
3286 break
3287 else:
3288 s = first_bytes.decode('utf-8', 'replace')
3289
3290 return re.match(r'^\s*<', s)
3291
3292
3293 def determine_protocol(info_dict):
3294 protocol = info_dict.get('protocol')
3295 if protocol is not None:
3296 return protocol
3297
3298 url = sanitize_url(info_dict['url'])
3299 if url.startswith('rtmp'):
3300 return 'rtmp'
3301 elif url.startswith('mms'):
3302 return 'mms'
3303 elif url.startswith('rtsp'):
3304 return 'rtsp'
3305
3306 ext = determine_ext(url)
3307 if ext == 'm3u8':
3308 return 'm3u8'
3309 elif ext == 'f4m':
3310 return 'f4m'
3311
3312 return compat_urllib_parse_urlparse(url).scheme
3313
3314
3315 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3316 """ Render a list of rows, each as a list of values.
3317 Text after a \t will be right aligned """
3318 def width(string):
3319 return len(remove_terminal_sequences(string).replace('\t', ''))
3320
3321 def get_max_lens(table):
3322 return [max(width(str(v)) for v in col) for col in zip(*table)]
3323
3324 def filter_using_list(row, filterArray):
3325 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3326
3327 max_lens = get_max_lens(data) if hide_empty else []
3328 header_row = filter_using_list(header_row, max_lens)
3329 data = [filter_using_list(row, max_lens) for row in data]
3330
3331 table = [header_row] + data
3332 max_lens = get_max_lens(table)
3333 extra_gap += 1
3334 if delim:
3335 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3336 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3337 for row in table:
3338 for pos, text in enumerate(map(str, row)):
3339 if '\t' in text:
3340 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3341 else:
3342 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3343 ret = '\n'.join(''.join(row).rstrip() for row in table)
3344 return ret
3345
3346
3347 def _match_one(filter_part, dct, incomplete):
3348 # TODO: Generalize code with YoutubeDL._build_format_filter
3349 STRING_OPERATORS = {
3350 '*=': operator.contains,
3351 '^=': lambda attr, value: attr.startswith(value),
3352 '$=': lambda attr, value: attr.endswith(value),
3353 '~=': lambda attr, value: re.search(value, attr),
3354 }
3355 COMPARISON_OPERATORS = {
3356 **STRING_OPERATORS,
3357 '<=': operator.le, # "<=" must be defined above "<"
3358 '<': operator.lt,
3359 '>=': operator.ge,
3360 '>': operator.gt,
3361 '=': operator.eq,
3362 }
3363
3364 if isinstance(incomplete, bool):
3365 is_incomplete = lambda _: incomplete
3366 else:
3367 is_incomplete = lambda k: k in incomplete
3368
3369 operator_rex = re.compile(r'''(?x)\s*
3370 (?P<key>[a-z_]+)
3371 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3372 (?:
3373 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3374 (?P<strval>.+?)
3375 )
3376 \s*$
3377 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3378 m = operator_rex.search(filter_part)
3379 if m:
3380 m = m.groupdict()
3381 unnegated_op = COMPARISON_OPERATORS[m['op']]
3382 if m['negation']:
3383 op = lambda attr, value: not unnegated_op(attr, value)
3384 else:
3385 op = unnegated_op
3386 comparison_value = m['quotedstrval'] or m['strval']
3387 if m['quote']:
3388 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3389 actual_value = dct.get(m['key'])
3390 numeric_comparison = None
3391 if isinstance(actual_value, (int, float)):
3392 # If the original field is a string and the matching comparison
3393 # value is a number, we should respect the origin of the original
3394 # field and process the comparison value as a string (see
3395 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3396 try:
3397 numeric_comparison = int(comparison_value)
3398 except ValueError:
3399 numeric_comparison = parse_filesize(comparison_value)
3400 if numeric_comparison is None:
3401 numeric_comparison = parse_filesize(f'{comparison_value}B')
3402 if numeric_comparison is None:
3403 numeric_comparison = parse_duration(comparison_value)
3404 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3405 raise ValueError('Operator %s only supports string values!' % m['op'])
3406 if actual_value is None:
3407 return is_incomplete(m['key']) or m['none_inclusive']
3408 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3409
3410 UNARY_OPERATORS = {
3411 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3412 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3413 }
3414 operator_rex = re.compile(r'''(?x)\s*
3415 (?P<op>%s)\s*(?P<key>[a-z_]+)
3416 \s*$
3417 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3418 m = operator_rex.search(filter_part)
3419 if m:
3420 op = UNARY_OPERATORS[m.group('op')]
3421 actual_value = dct.get(m.group('key'))
3422 if is_incomplete(m.group('key')) and actual_value is None:
3423 return True
3424 return op(actual_value)
3425
3426 raise ValueError('Invalid filter part %r' % filter_part)
3427
3428
3429 def match_str(filter_str, dct, incomplete=False):
3430 """ Filter a dictionary with a simple string syntax.
3431 @returns Whether the filter passes
3432 @param incomplete Set of keys that are expected to be missing from dct.
3433 Can be True/False to indicate all/none of the keys may be missing.
3434 All conditions on incomplete keys pass if the key is missing.
3435 """
3436 return all(
3437 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3438 for filter_part in re.split(r'(?<!\\)&', filter_str))
3439
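# Illustrative use of match_str (comment-only sketch):
#
#   match_str('like_count > 100 & duration < 600',
#             {'like_count': 190, 'duration': 30})  # True
#   match_str('is_live', {'is_live': False})        # False
#   match_str('!is_live', {})                       # True (missing/None counts as not live)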
3440
3441 def match_filter_func(filters):
3442 if not filters:
3443 return None
3444 filters = variadic(filters)
3445
3446 def _match_func(info_dict, *args, **kwargs):
3447 if any(match_str(f, info_dict, *args, **kwargs) for f in filters):
3448 return None
3449 else:
3450 video_title = info_dict.get('title') or info_dict.get('id') or 'video'
3451 filter_str = ') | ('.join(map(str.strip, filters))
3452 return f'{video_title} does not pass filter ({filter_str}), skipping ...'
3453 return _match_func
3454
3455
3456 def parse_dfxp_time_expr(time_expr):
3457 if not time_expr:
3458 return
3459
3460 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3461 if mobj:
3462 return float(mobj.group('time_offset'))
3463
3464 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3465 if mobj:
3466 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3467
3468
3469 def srt_subtitles_timecode(seconds):
3470 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3471
3472
3473 def ass_subtitles_timecode(seconds):
3474 time = timetuple_from_msec(seconds * 1000)
3475 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3476
3477
3478 def dfxp2srt(dfxp_data):
3479 '''
3480 @param dfxp_data A bytes-like object containing DFXP data
3481 @returns A unicode object containing converted SRT data
3482 '''
3483 LEGACY_NAMESPACES = (
3484 (b'http://www.w3.org/ns/ttml', [
3485 b'http://www.w3.org/2004/11/ttaf1',
3486 b'http://www.w3.org/2006/04/ttaf1',
3487 b'http://www.w3.org/2006/10/ttaf1',
3488 ]),
3489 (b'http://www.w3.org/ns/ttml#styling', [
3490 b'http://www.w3.org/ns/ttml#style',
3491 ]),
3492 )
3493
3494 SUPPORTED_STYLING = [
3495 'color',
3496 'fontFamily',
3497 'fontSize',
3498 'fontStyle',
3499 'fontWeight',
3500 'textDecoration'
3501 ]
3502
3503 _x = functools.partial(xpath_with_ns, ns_map={
3504 'xml': 'http://www.w3.org/XML/1998/namespace',
3505 'ttml': 'http://www.w3.org/ns/ttml',
3506 'tts': 'http://www.w3.org/ns/ttml#styling',
3507 })
3508
3509 styles = {}
3510 default_style = {}
3511
3512 class TTMLPElementParser:
3513 _out = ''
3514 _unclosed_elements = []
3515 _applied_styles = []
3516
3517 def start(self, tag, attrib):
3518 if tag in (_x('ttml:br'), 'br'):
3519 self._out += '\n'
3520 else:
3521 unclosed_elements = []
3522 style = {}
3523 element_style_id = attrib.get('style')
3524 if default_style:
3525 style.update(default_style)
3526 if element_style_id:
3527 style.update(styles.get(element_style_id, {}))
3528 for prop in SUPPORTED_STYLING:
3529 prop_val = attrib.get(_x('tts:' + prop))
3530 if prop_val:
3531 style[prop] = prop_val
3532 if style:
3533 font = ''
3534 for k, v in sorted(style.items()):
3535 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3536 continue
3537 if k == 'color':
3538 font += ' color="%s"' % v
3539 elif k == 'fontSize':
3540 font += ' size="%s"' % v
3541 elif k == 'fontFamily':
3542 font += ' face="%s"' % v
3543 elif k == 'fontWeight' and v == 'bold':
3544 self._out += '<b>'
3545 unclosed_elements.append('b')
3546 elif k == 'fontStyle' and v == 'italic':
3547 self._out += '<i>'
3548 unclosed_elements.append('i')
3549 elif k == 'textDecoration' and v == 'underline':
3550 self._out += '<u>'
3551 unclosed_elements.append('u')
3552 if font:
3553 self._out += '<font' + font + '>'
3554 unclosed_elements.append('font')
3555 applied_style = {}
3556 if self._applied_styles:
3557 applied_style.update(self._applied_styles[-1])
3558 applied_style.update(style)
3559 self._applied_styles.append(applied_style)
3560 self._unclosed_elements.append(unclosed_elements)
3561
3562 def end(self, tag):
3563 if tag not in (_x('ttml:br'), 'br'):
3564 unclosed_elements = self._unclosed_elements.pop()
3565 for element in reversed(unclosed_elements):
3566 self._out += '</%s>' % element
3567 if unclosed_elements and self._applied_styles:
3568 self._applied_styles.pop()
3569
3570 def data(self, data):
3571 self._out += data
3572
3573 def close(self):
3574 return self._out.strip()
3575
3576 def parse_node(node):
3577 target = TTMLPElementParser()
3578 parser = xml.etree.ElementTree.XMLParser(target=target)
3579 parser.feed(xml.etree.ElementTree.tostring(node))
3580 return parser.close()
3581
3582 for k, v in LEGACY_NAMESPACES:
3583 for ns in v:
3584 dfxp_data = dfxp_data.replace(ns, k)
3585
3586 dfxp = compat_etree_fromstring(dfxp_data)
3587 out = []
3588 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3589
3590 if not paras:
3591 raise ValueError('Invalid dfxp/TTML subtitle')
3592
3593 repeat = False
3594 while True:
3595 for style in dfxp.findall(_x('.//ttml:style')):
3596 style_id = style.get('id') or style.get(_x('xml:id'))
3597 if not style_id:
3598 continue
3599 parent_style_id = style.get('style')
3600 if parent_style_id:
3601 if parent_style_id not in styles:
3602 repeat = True
3603 continue
3604 styles[style_id] = styles[parent_style_id].copy()
3605 for prop in SUPPORTED_STYLING:
3606 prop_val = style.get(_x('tts:' + prop))
3607 if prop_val:
3608 styles.setdefault(style_id, {})[prop] = prop_val
3609 if repeat:
3610 repeat = False
3611 else:
3612 break
3613
3614 for p in ('body', 'div'):
3615 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3616 if ele is None:
3617 continue
3618 style = styles.get(ele.get('style'))
3619 if not style:
3620 continue
3621 default_style.update(style)
3622
3623 for para, index in zip(paras, itertools.count(1)):
3624 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3625 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3626 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3627 if begin_time is None:
3628 continue
3629 if not end_time:
3630 if not dur:
3631 continue
3632 end_time = begin_time + dur
3633 out.append('%d\n%s --> %s\n%s\n\n' % (
3634 index,
3635 srt_subtitles_timecode(begin_time),
3636 srt_subtitles_timecode(end_time),
3637 parse_node(para)))
3638
3639 return ''.join(out)
3640
3641
3642 def cli_option(params, command_option, param):
3643 param = params.get(param)
3644 if param is not None:
3645 param = compat_str(param)
3646 return [command_option, param] if param is not None else []
3647
3648
3649 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3650 param = params.get(param)
3651 if param is None:
3652 return []
3653 assert isinstance(param, bool)
3654 if separator:
3655 return [command_option + separator + (true_value if param else false_value)]
3656 return [command_option, true_value if param else false_value]
3657
3658
3659 def cli_valueless_option(params, command_option, param, expected_value=True):
3660 param = params.get(param)
3661 return [command_option] if param == expected_value else []
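# Illustrative usage of the three CLI helpers above (the parameter names are
# hypothetical examples, not fixed option names):
#   >>> cli_option({'proxy': 'http://127.0.0.1:3128'}, '--proxy', 'proxy')
#   ['--proxy', 'http://127.0.0.1:3128']
#   >>> cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#   ['--no-check-certificate', 'true']
#   >>> cli_valueless_option({'quiet': True}, '--quiet', 'quiet')
#   ['--quiet']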
3662
3663
3664 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3665 if isinstance(argdict, (list, tuple)): # for backward compatibility
3666 if use_compat:
3667 return argdict
3668 else:
3669 argdict = None
3670 if argdict is None:
3671 return default
3672 assert isinstance(argdict, dict)
3673
3674 assert isinstance(keys, (list, tuple))
3675 for key_list in keys:
3676 arg_list = list(filter(
3677 lambda x: x is not None,
3678 [argdict.get(key.lower()) for key in variadic(key_list)]))
3679 if arg_list:
3680 return [arg for args in arg_list for arg in args]
3681 return default
3682
3683
3684 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3685 main_key, exe = main_key.lower(), exe.lower()
3686 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3687 keys = [f'{root_key}{k}' for k in (keys or [''])]
3688 if root_key in keys:
3689 if main_key != exe:
3690 keys.append((main_key, exe))
3691 keys.append('default')
3692 else:
3693 use_compat = False
3694 return cli_configuration_args(argdict, keys, default, use_compat)
3695
3696
3697 class ISO639Utils:
3698 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3699 _lang_map = {
3700 'aa': 'aar',
3701 'ab': 'abk',
3702 'ae': 'ave',
3703 'af': 'afr',
3704 'ak': 'aka',
3705 'am': 'amh',
3706 'an': 'arg',
3707 'ar': 'ara',
3708 'as': 'asm',
3709 'av': 'ava',
3710 'ay': 'aym',
3711 'az': 'aze',
3712 'ba': 'bak',
3713 'be': 'bel',
3714 'bg': 'bul',
3715 'bh': 'bih',
3716 'bi': 'bis',
3717 'bm': 'bam',
3718 'bn': 'ben',
3719 'bo': 'bod',
3720 'br': 'bre',
3721 'bs': 'bos',
3722 'ca': 'cat',
3723 'ce': 'che',
3724 'ch': 'cha',
3725 'co': 'cos',
3726 'cr': 'cre',
3727 'cs': 'ces',
3728 'cu': 'chu',
3729 'cv': 'chv',
3730 'cy': 'cym',
3731 'da': 'dan',
3732 'de': 'deu',
3733 'dv': 'div',
3734 'dz': 'dzo',
3735 'ee': 'ewe',
3736 'el': 'ell',
3737 'en': 'eng',
3738 'eo': 'epo',
3739 'es': 'spa',
3740 'et': 'est',
3741 'eu': 'eus',
3742 'fa': 'fas',
3743 'ff': 'ful',
3744 'fi': 'fin',
3745 'fj': 'fij',
3746 'fo': 'fao',
3747 'fr': 'fra',
3748 'fy': 'fry',
3749 'ga': 'gle',
3750 'gd': 'gla',
3751 'gl': 'glg',
3752 'gn': 'grn',
3753 'gu': 'guj',
3754 'gv': 'glv',
3755 'ha': 'hau',
3756 'he': 'heb',
3757 'iw': 'heb', # Replaced by he in 1989 revision
3758 'hi': 'hin',
3759 'ho': 'hmo',
3760 'hr': 'hrv',
3761 'ht': 'hat',
3762 'hu': 'hun',
3763 'hy': 'hye',
3764 'hz': 'her',
3765 'ia': 'ina',
3766 'id': 'ind',
3767 'in': 'ind', # Replaced by id in 1989 revision
3768 'ie': 'ile',
3769 'ig': 'ibo',
3770 'ii': 'iii',
3771 'ik': 'ipk',
3772 'io': 'ido',
3773 'is': 'isl',
3774 'it': 'ita',
3775 'iu': 'iku',
3776 'ja': 'jpn',
3777 'jv': 'jav',
3778 'ka': 'kat',
3779 'kg': 'kon',
3780 'ki': 'kik',
3781 'kj': 'kua',
3782 'kk': 'kaz',
3783 'kl': 'kal',
3784 'km': 'khm',
3785 'kn': 'kan',
3786 'ko': 'kor',
3787 'kr': 'kau',
3788 'ks': 'kas',
3789 'ku': 'kur',
3790 'kv': 'kom',
3791 'kw': 'cor',
3792 'ky': 'kir',
3793 'la': 'lat',
3794 'lb': 'ltz',
3795 'lg': 'lug',
3796 'li': 'lim',
3797 'ln': 'lin',
3798 'lo': 'lao',
3799 'lt': 'lit',
3800 'lu': 'lub',
3801 'lv': 'lav',
3802 'mg': 'mlg',
3803 'mh': 'mah',
3804 'mi': 'mri',
3805 'mk': 'mkd',
3806 'ml': 'mal',
3807 'mn': 'mon',
3808 'mr': 'mar',
3809 'ms': 'msa',
3810 'mt': 'mlt',
3811 'my': 'mya',
3812 'na': 'nau',
3813 'nb': 'nob',
3814 'nd': 'nde',
3815 'ne': 'nep',
3816 'ng': 'ndo',
3817 'nl': 'nld',
3818 'nn': 'nno',
3819 'no': 'nor',
3820 'nr': 'nbl',
3821 'nv': 'nav',
3822 'ny': 'nya',
3823 'oc': 'oci',
3824 'oj': 'oji',
3825 'om': 'orm',
3826 'or': 'ori',
3827 'os': 'oss',
3828 'pa': 'pan',
3829 'pi': 'pli',
3830 'pl': 'pol',
3831 'ps': 'pus',
3832 'pt': 'por',
3833 'qu': 'que',
3834 'rm': 'roh',
3835 'rn': 'run',
3836 'ro': 'ron',
3837 'ru': 'rus',
3838 'rw': 'kin',
3839 'sa': 'san',
3840 'sc': 'srd',
3841 'sd': 'snd',
3842 'se': 'sme',
3843 'sg': 'sag',
3844 'si': 'sin',
3845 'sk': 'slk',
3846 'sl': 'slv',
3847 'sm': 'smo',
3848 'sn': 'sna',
3849 'so': 'som',
3850 'sq': 'sqi',
3851 'sr': 'srp',
3852 'ss': 'ssw',
3853 'st': 'sot',
3854 'su': 'sun',
3855 'sv': 'swe',
3856 'sw': 'swa',
3857 'ta': 'tam',
3858 'te': 'tel',
3859 'tg': 'tgk',
3860 'th': 'tha',
3861 'ti': 'tir',
3862 'tk': 'tuk',
3863 'tl': 'tgl',
3864 'tn': 'tsn',
3865 'to': 'ton',
3866 'tr': 'tur',
3867 'ts': 'tso',
3868 'tt': 'tat',
3869 'tw': 'twi',
3870 'ty': 'tah',
3871 'ug': 'uig',
3872 'uk': 'ukr',
3873 'ur': 'urd',
3874 'uz': 'uzb',
3875 've': 'ven',
3876 'vi': 'vie',
3877 'vo': 'vol',
3878 'wa': 'wln',
3879 'wo': 'wol',
3880 'xh': 'xho',
3881 'yi': 'yid',
3882 'ji': 'yid', # Replaced by yi in 1989 revision
3883 'yo': 'yor',
3884 'za': 'zha',
3885 'zh': 'zho',
3886 'zu': 'zul',
3887 }
3888
3889 @classmethod
3890 def short2long(cls, code):
3891 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3892 return cls._lang_map.get(code[:2])
3893
3894 @classmethod
3895 def long2short(cls, code):
3896 """Convert language code from ISO 639-2/T to ISO 639-1"""
3897 for short_name, long_name in cls._lang_map.items():
3898 if long_name == code:
3899 return short_name
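# For example, using entries from the table above:
#   >>> ISO639Utils.short2long('en')
#   'eng'
#   >>> ISO639Utils.long2short('deu')
#   'de'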
3900
3901
3902 class ISO3166Utils:
3903 # From http://data.okfn.org/data/core/country-list
3904 _country_map = {
3905 'AF': 'Afghanistan',
3906 'AX': 'Åland Islands',
3907 'AL': 'Albania',
3908 'DZ': 'Algeria',
3909 'AS': 'American Samoa',
3910 'AD': 'Andorra',
3911 'AO': 'Angola',
3912 'AI': 'Anguilla',
3913 'AQ': 'Antarctica',
3914 'AG': 'Antigua and Barbuda',
3915 'AR': 'Argentina',
3916 'AM': 'Armenia',
3917 'AW': 'Aruba',
3918 'AU': 'Australia',
3919 'AT': 'Austria',
3920 'AZ': 'Azerbaijan',
3921 'BS': 'Bahamas',
3922 'BH': 'Bahrain',
3923 'BD': 'Bangladesh',
3924 'BB': 'Barbados',
3925 'BY': 'Belarus',
3926 'BE': 'Belgium',
3927 'BZ': 'Belize',
3928 'BJ': 'Benin',
3929 'BM': 'Bermuda',
3930 'BT': 'Bhutan',
3931 'BO': 'Bolivia, Plurinational State of',
3932 'BQ': 'Bonaire, Sint Eustatius and Saba',
3933 'BA': 'Bosnia and Herzegovina',
3934 'BW': 'Botswana',
3935 'BV': 'Bouvet Island',
3936 'BR': 'Brazil',
3937 'IO': 'British Indian Ocean Territory',
3938 'BN': 'Brunei Darussalam',
3939 'BG': 'Bulgaria',
3940 'BF': 'Burkina Faso',
3941 'BI': 'Burundi',
3942 'KH': 'Cambodia',
3943 'CM': 'Cameroon',
3944 'CA': 'Canada',
3945 'CV': 'Cape Verde',
3946 'KY': 'Cayman Islands',
3947 'CF': 'Central African Republic',
3948 'TD': 'Chad',
3949 'CL': 'Chile',
3950 'CN': 'China',
3951 'CX': 'Christmas Island',
3952 'CC': 'Cocos (Keeling) Islands',
3953 'CO': 'Colombia',
3954 'KM': 'Comoros',
3955 'CG': 'Congo',
3956 'CD': 'Congo, the Democratic Republic of the',
3957 'CK': 'Cook Islands',
3958 'CR': 'Costa Rica',
3959 'CI': 'Côte d\'Ivoire',
3960 'HR': 'Croatia',
3961 'CU': 'Cuba',
3962 'CW': 'Curaçao',
3963 'CY': 'Cyprus',
3964 'CZ': 'Czech Republic',
3965 'DK': 'Denmark',
3966 'DJ': 'Djibouti',
3967 'DM': 'Dominica',
3968 'DO': 'Dominican Republic',
3969 'EC': 'Ecuador',
3970 'EG': 'Egypt',
3971 'SV': 'El Salvador',
3972 'GQ': 'Equatorial Guinea',
3973 'ER': 'Eritrea',
3974 'EE': 'Estonia',
3975 'ET': 'Ethiopia',
3976 'FK': 'Falkland Islands (Malvinas)',
3977 'FO': 'Faroe Islands',
3978 'FJ': 'Fiji',
3979 'FI': 'Finland',
3980 'FR': 'France',
3981 'GF': 'French Guiana',
3982 'PF': 'French Polynesia',
3983 'TF': 'French Southern Territories',
3984 'GA': 'Gabon',
3985 'GM': 'Gambia',
3986 'GE': 'Georgia',
3987 'DE': 'Germany',
3988 'GH': 'Ghana',
3989 'GI': 'Gibraltar',
3990 'GR': 'Greece',
3991 'GL': 'Greenland',
3992 'GD': 'Grenada',
3993 'GP': 'Guadeloupe',
3994 'GU': 'Guam',
3995 'GT': 'Guatemala',
3996 'GG': 'Guernsey',
3997 'GN': 'Guinea',
3998 'GW': 'Guinea-Bissau',
3999 'GY': 'Guyana',
4000 'HT': 'Haiti',
4001 'HM': 'Heard Island and McDonald Islands',
4002 'VA': 'Holy See (Vatican City State)',
4003 'HN': 'Honduras',
4004 'HK': 'Hong Kong',
4005 'HU': 'Hungary',
4006 'IS': 'Iceland',
4007 'IN': 'India',
4008 'ID': 'Indonesia',
4009 'IR': 'Iran, Islamic Republic of',
4010 'IQ': 'Iraq',
4011 'IE': 'Ireland',
4012 'IM': 'Isle of Man',
4013 'IL': 'Israel',
4014 'IT': 'Italy',
4015 'JM': 'Jamaica',
4016 'JP': 'Japan',
4017 'JE': 'Jersey',
4018 'JO': 'Jordan',
4019 'KZ': 'Kazakhstan',
4020 'KE': 'Kenya',
4021 'KI': 'Kiribati',
4022 'KP': 'Korea, Democratic People\'s Republic of',
4023 'KR': 'Korea, Republic of',
4024 'KW': 'Kuwait',
4025 'KG': 'Kyrgyzstan',
4026 'LA': 'Lao People\'s Democratic Republic',
4027 'LV': 'Latvia',
4028 'LB': 'Lebanon',
4029 'LS': 'Lesotho',
4030 'LR': 'Liberia',
4031 'LY': 'Libya',
4032 'LI': 'Liechtenstein',
4033 'LT': 'Lithuania',
4034 'LU': 'Luxembourg',
4035 'MO': 'Macao',
4036 'MK': 'Macedonia, the Former Yugoslav Republic of',
4037 'MG': 'Madagascar',
4038 'MW': 'Malawi',
4039 'MY': 'Malaysia',
4040 'MV': 'Maldives',
4041 'ML': 'Mali',
4042 'MT': 'Malta',
4043 'MH': 'Marshall Islands',
4044 'MQ': 'Martinique',
4045 'MR': 'Mauritania',
4046 'MU': 'Mauritius',
4047 'YT': 'Mayotte',
4048 'MX': 'Mexico',
4049 'FM': 'Micronesia, Federated States of',
4050 'MD': 'Moldova, Republic of',
4051 'MC': 'Monaco',
4052 'MN': 'Mongolia',
4053 'ME': 'Montenegro',
4054 'MS': 'Montserrat',
4055 'MA': 'Morocco',
4056 'MZ': 'Mozambique',
4057 'MM': 'Myanmar',
4058 'NA': 'Namibia',
4059 'NR': 'Nauru',
4060 'NP': 'Nepal',
4061 'NL': 'Netherlands',
4062 'NC': 'New Caledonia',
4063 'NZ': 'New Zealand',
4064 'NI': 'Nicaragua',
4065 'NE': 'Niger',
4066 'NG': 'Nigeria',
4067 'NU': 'Niue',
4068 'NF': 'Norfolk Island',
4069 'MP': 'Northern Mariana Islands',
4070 'NO': 'Norway',
4071 'OM': 'Oman',
4072 'PK': 'Pakistan',
4073 'PW': 'Palau',
4074 'PS': 'Palestine, State of',
4075 'PA': 'Panama',
4076 'PG': 'Papua New Guinea',
4077 'PY': 'Paraguay',
4078 'PE': 'Peru',
4079 'PH': 'Philippines',
4080 'PN': 'Pitcairn',
4081 'PL': 'Poland',
4082 'PT': 'Portugal',
4083 'PR': 'Puerto Rico',
4084 'QA': 'Qatar',
4085 'RE': 'Réunion',
4086 'RO': 'Romania',
4087 'RU': 'Russian Federation',
4088 'RW': 'Rwanda',
4089 'BL': 'Saint Barthélemy',
4090 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4091 'KN': 'Saint Kitts and Nevis',
4092 'LC': 'Saint Lucia',
4093 'MF': 'Saint Martin (French part)',
4094 'PM': 'Saint Pierre and Miquelon',
4095 'VC': 'Saint Vincent and the Grenadines',
4096 'WS': 'Samoa',
4097 'SM': 'San Marino',
4098 'ST': 'Sao Tome and Principe',
4099 'SA': 'Saudi Arabia',
4100 'SN': 'Senegal',
4101 'RS': 'Serbia',
4102 'SC': 'Seychelles',
4103 'SL': 'Sierra Leone',
4104 'SG': 'Singapore',
4105 'SX': 'Sint Maarten (Dutch part)',
4106 'SK': 'Slovakia',
4107 'SI': 'Slovenia',
4108 'SB': 'Solomon Islands',
4109 'SO': 'Somalia',
4110 'ZA': 'South Africa',
4111 'GS': 'South Georgia and the South Sandwich Islands',
4112 'SS': 'South Sudan',
4113 'ES': 'Spain',
4114 'LK': 'Sri Lanka',
4115 'SD': 'Sudan',
4116 'SR': 'Suriname',
4117 'SJ': 'Svalbard and Jan Mayen',
4118 'SZ': 'Swaziland',
4119 'SE': 'Sweden',
4120 'CH': 'Switzerland',
4121 'SY': 'Syrian Arab Republic',
4122 'TW': 'Taiwan, Province of China',
4123 'TJ': 'Tajikistan',
4124 'TZ': 'Tanzania, United Republic of',
4125 'TH': 'Thailand',
4126 'TL': 'Timor-Leste',
4127 'TG': 'Togo',
4128 'TK': 'Tokelau',
4129 'TO': 'Tonga',
4130 'TT': 'Trinidad and Tobago',
4131 'TN': 'Tunisia',
4132 'TR': 'Turkey',
4133 'TM': 'Turkmenistan',
4134 'TC': 'Turks and Caicos Islands',
4135 'TV': 'Tuvalu',
4136 'UG': 'Uganda',
4137 'UA': 'Ukraine',
4138 'AE': 'United Arab Emirates',
4139 'GB': 'United Kingdom',
4140 'US': 'United States',
4141 'UM': 'United States Minor Outlying Islands',
4142 'UY': 'Uruguay',
4143 'UZ': 'Uzbekistan',
4144 'VU': 'Vanuatu',
4145 'VE': 'Venezuela, Bolivarian Republic of',
4146 'VN': 'Viet Nam',
4147 'VG': 'Virgin Islands, British',
4148 'VI': 'Virgin Islands, U.S.',
4149 'WF': 'Wallis and Futuna',
4150 'EH': 'Western Sahara',
4151 'YE': 'Yemen',
4152 'ZM': 'Zambia',
4153 'ZW': 'Zimbabwe',
4154 }
4155
4156 @classmethod
4157 def short2full(cls, code):
4158 """Convert an ISO 3166-2 country code to the corresponding full name"""
4159 return cls._country_map.get(code.upper())
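# For example:
#   >>> ISO3166Utils.short2full('NZ')
#   'New Zealand'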
4160
4161
4162 class GeoUtils:
4163 # Major IPv4 address blocks per country
4164 _country_ip_map = {
4165 'AD': '46.172.224.0/19',
4166 'AE': '94.200.0.0/13',
4167 'AF': '149.54.0.0/17',
4168 'AG': '209.59.64.0/18',
4169 'AI': '204.14.248.0/21',
4170 'AL': '46.99.0.0/16',
4171 'AM': '46.70.0.0/15',
4172 'AO': '105.168.0.0/13',
4173 'AP': '182.50.184.0/21',
4174 'AQ': '23.154.160.0/24',
4175 'AR': '181.0.0.0/12',
4176 'AS': '202.70.112.0/20',
4177 'AT': '77.116.0.0/14',
4178 'AU': '1.128.0.0/11',
4179 'AW': '181.41.0.0/18',
4180 'AX': '185.217.4.0/22',
4181 'AZ': '5.197.0.0/16',
4182 'BA': '31.176.128.0/17',
4183 'BB': '65.48.128.0/17',
4184 'BD': '114.130.0.0/16',
4185 'BE': '57.0.0.0/8',
4186 'BF': '102.178.0.0/15',
4187 'BG': '95.42.0.0/15',
4188 'BH': '37.131.0.0/17',
4189 'BI': '154.117.192.0/18',
4190 'BJ': '137.255.0.0/16',
4191 'BL': '185.212.72.0/23',
4192 'BM': '196.12.64.0/18',
4193 'BN': '156.31.0.0/16',
4194 'BO': '161.56.0.0/16',
4195 'BQ': '161.0.80.0/20',
4196 'BR': '191.128.0.0/12',
4197 'BS': '24.51.64.0/18',
4198 'BT': '119.2.96.0/19',
4199 'BW': '168.167.0.0/16',
4200 'BY': '178.120.0.0/13',
4201 'BZ': '179.42.192.0/18',
4202 'CA': '99.224.0.0/11',
4203 'CD': '41.243.0.0/16',
4204 'CF': '197.242.176.0/21',
4205 'CG': '160.113.0.0/16',
4206 'CH': '85.0.0.0/13',
4207 'CI': '102.136.0.0/14',
4208 'CK': '202.65.32.0/19',
4209 'CL': '152.172.0.0/14',
4210 'CM': '102.244.0.0/14',
4211 'CN': '36.128.0.0/10',
4212 'CO': '181.240.0.0/12',
4213 'CR': '201.192.0.0/12',
4214 'CU': '152.206.0.0/15',
4215 'CV': '165.90.96.0/19',
4216 'CW': '190.88.128.0/17',
4217 'CY': '31.153.0.0/16',
4218 'CZ': '88.100.0.0/14',
4219 'DE': '53.0.0.0/8',
4220 'DJ': '197.241.0.0/17',
4221 'DK': '87.48.0.0/12',
4222 'DM': '192.243.48.0/20',
4223 'DO': '152.166.0.0/15',
4224 'DZ': '41.96.0.0/12',
4225 'EC': '186.68.0.0/15',
4226 'EE': '90.190.0.0/15',
4227 'EG': '156.160.0.0/11',
4228 'ER': '196.200.96.0/20',
4229 'ES': '88.0.0.0/11',
4230 'ET': '196.188.0.0/14',
4231 'EU': '2.16.0.0/13',
4232 'FI': '91.152.0.0/13',
4233 'FJ': '144.120.0.0/16',
4234 'FK': '80.73.208.0/21',
4235 'FM': '119.252.112.0/20',
4236 'FO': '88.85.32.0/19',
4237 'FR': '90.0.0.0/9',
4238 'GA': '41.158.0.0/15',
4239 'GB': '25.0.0.0/8',
4240 'GD': '74.122.88.0/21',
4241 'GE': '31.146.0.0/16',
4242 'GF': '161.22.64.0/18',
4243 'GG': '62.68.160.0/19',
4244 'GH': '154.160.0.0/12',
4245 'GI': '95.164.0.0/16',
4246 'GL': '88.83.0.0/19',
4247 'GM': '160.182.0.0/15',
4248 'GN': '197.149.192.0/18',
4249 'GP': '104.250.0.0/19',
4250 'GQ': '105.235.224.0/20',
4251 'GR': '94.64.0.0/13',
4252 'GT': '168.234.0.0/16',
4253 'GU': '168.123.0.0/16',
4254 'GW': '197.214.80.0/20',
4255 'GY': '181.41.64.0/18',
4256 'HK': '113.252.0.0/14',
4257 'HN': '181.210.0.0/16',
4258 'HR': '93.136.0.0/13',
4259 'HT': '148.102.128.0/17',
4260 'HU': '84.0.0.0/14',
4261 'ID': '39.192.0.0/10',
4262 'IE': '87.32.0.0/12',
4263 'IL': '79.176.0.0/13',
4264 'IM': '5.62.80.0/20',
4265 'IN': '117.192.0.0/10',
4266 'IO': '203.83.48.0/21',
4267 'IQ': '37.236.0.0/14',
4268 'IR': '2.176.0.0/12',
4269 'IS': '82.221.0.0/16',
4270 'IT': '79.0.0.0/10',
4271 'JE': '87.244.64.0/18',
4272 'JM': '72.27.0.0/17',
4273 'JO': '176.29.0.0/16',
4274 'JP': '133.0.0.0/8',
4275 'KE': '105.48.0.0/12',
4276 'KG': '158.181.128.0/17',
4277 'KH': '36.37.128.0/17',
4278 'KI': '103.25.140.0/22',
4279 'KM': '197.255.224.0/20',
4280 'KN': '198.167.192.0/19',
4281 'KP': '175.45.176.0/22',
4282 'KR': '175.192.0.0/10',
4283 'KW': '37.36.0.0/14',
4284 'KY': '64.96.0.0/15',
4285 'KZ': '2.72.0.0/13',
4286 'LA': '115.84.64.0/18',
4287 'LB': '178.135.0.0/16',
4288 'LC': '24.92.144.0/20',
4289 'LI': '82.117.0.0/19',
4290 'LK': '112.134.0.0/15',
4291 'LR': '102.183.0.0/16',
4292 'LS': '129.232.0.0/17',
4293 'LT': '78.56.0.0/13',
4294 'LU': '188.42.0.0/16',
4295 'LV': '46.109.0.0/16',
4296 'LY': '41.252.0.0/14',
4297 'MA': '105.128.0.0/11',
4298 'MC': '88.209.64.0/18',
4299 'MD': '37.246.0.0/16',
4300 'ME': '178.175.0.0/17',
4301 'MF': '74.112.232.0/21',
4302 'MG': '154.126.0.0/17',
4303 'MH': '117.103.88.0/21',
4304 'MK': '77.28.0.0/15',
4305 'ML': '154.118.128.0/18',
4306 'MM': '37.111.0.0/17',
4307 'MN': '49.0.128.0/17',
4308 'MO': '60.246.0.0/16',
4309 'MP': '202.88.64.0/20',
4310 'MQ': '109.203.224.0/19',
4311 'MR': '41.188.64.0/18',
4312 'MS': '208.90.112.0/22',
4313 'MT': '46.11.0.0/16',
4314 'MU': '105.16.0.0/12',
4315 'MV': '27.114.128.0/18',
4316 'MW': '102.70.0.0/15',
4317 'MX': '187.192.0.0/11',
4318 'MY': '175.136.0.0/13',
4319 'MZ': '197.218.0.0/15',
4320 'NA': '41.182.0.0/16',
4321 'NC': '101.101.0.0/18',
4322 'NE': '197.214.0.0/18',
4323 'NF': '203.17.240.0/22',
4324 'NG': '105.112.0.0/12',
4325 'NI': '186.76.0.0/15',
4326 'NL': '145.96.0.0/11',
4327 'NO': '84.208.0.0/13',
4328 'NP': '36.252.0.0/15',
4329 'NR': '203.98.224.0/19',
4330 'NU': '49.156.48.0/22',
4331 'NZ': '49.224.0.0/14',
4332 'OM': '5.36.0.0/15',
4333 'PA': '186.72.0.0/15',
4334 'PE': '186.160.0.0/14',
4335 'PF': '123.50.64.0/18',
4336 'PG': '124.240.192.0/19',
4337 'PH': '49.144.0.0/13',
4338 'PK': '39.32.0.0/11',
4339 'PL': '83.0.0.0/11',
4340 'PM': '70.36.0.0/20',
4341 'PR': '66.50.0.0/16',
4342 'PS': '188.161.0.0/16',
4343 'PT': '85.240.0.0/13',
4344 'PW': '202.124.224.0/20',
4345 'PY': '181.120.0.0/14',
4346 'QA': '37.210.0.0/15',
4347 'RE': '102.35.0.0/16',
4348 'RO': '79.112.0.0/13',
4349 'RS': '93.86.0.0/15',
4350 'RU': '5.136.0.0/13',
4351 'RW': '41.186.0.0/16',
4352 'SA': '188.48.0.0/13',
4353 'SB': '202.1.160.0/19',
4354 'SC': '154.192.0.0/11',
4355 'SD': '102.120.0.0/13',
4356 'SE': '78.64.0.0/12',
4357 'SG': '8.128.0.0/10',
4358 'SI': '188.196.0.0/14',
4359 'SK': '78.98.0.0/15',
4360 'SL': '102.143.0.0/17',
4361 'SM': '89.186.32.0/19',
4362 'SN': '41.82.0.0/15',
4363 'SO': '154.115.192.0/18',
4364 'SR': '186.179.128.0/17',
4365 'SS': '105.235.208.0/21',
4366 'ST': '197.159.160.0/19',
4367 'SV': '168.243.0.0/16',
4368 'SX': '190.102.0.0/20',
4369 'SY': '5.0.0.0/16',
4370 'SZ': '41.84.224.0/19',
4371 'TC': '65.255.48.0/20',
4372 'TD': '154.68.128.0/19',
4373 'TG': '196.168.0.0/14',
4374 'TH': '171.96.0.0/13',
4375 'TJ': '85.9.128.0/18',
4376 'TK': '27.96.24.0/21',
4377 'TL': '180.189.160.0/20',
4378 'TM': '95.85.96.0/19',
4379 'TN': '197.0.0.0/11',
4380 'TO': '175.176.144.0/21',
4381 'TR': '78.160.0.0/11',
4382 'TT': '186.44.0.0/15',
4383 'TV': '202.2.96.0/19',
4384 'TW': '120.96.0.0/11',
4385 'TZ': '156.156.0.0/14',
4386 'UA': '37.52.0.0/14',
4387 'UG': '102.80.0.0/13',
4388 'US': '6.0.0.0/8',
4389 'UY': '167.56.0.0/13',
4390 'UZ': '84.54.64.0/18',
4391 'VA': '212.77.0.0/19',
4392 'VC': '207.191.240.0/21',
4393 'VE': '186.88.0.0/13',
4394 'VG': '66.81.192.0/20',
4395 'VI': '146.226.0.0/16',
4396 'VN': '14.160.0.0/11',
4397 'VU': '202.80.32.0/20',
4398 'WF': '117.20.32.0/21',
4399 'WS': '202.4.32.0/19',
4400 'YE': '134.35.0.0/16',
4401 'YT': '41.242.116.0/22',
4402 'ZA': '41.0.0.0/11',
4403 'ZM': '102.144.0.0/13',
4404 'ZW': '102.177.192.0/18',
4405 }
4406
4407 @classmethod
4408 def random_ipv4(cls, code_or_block):
4409 if len(code_or_block) == 2:
4410 block = cls._country_ip_map.get(code_or_block.upper())
4411 if not block:
4412 return None
4413 else:
4414 block = code_or_block
4415 addr, preflen = block.split('/')
4416 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4417 addr_max = addr_min | (0xffffffff >> int(preflen))
4418 return compat_str(socket.inet_ntoa(
4419 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
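# A usage sketch: the argument may be a two-letter code from the table above
# or an explicit CIDR block; the result is a random address within that block:
#   >>> GeoUtils.random_ipv4('NZ')              # e.g. '49.225.3.17' (inside 49.224.0.0/14)
#   >>> GeoUtils.random_ipv4('203.0.113.0/24')  # e.g. '203.0.113.42'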
4420
4421
4422 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4423 def __init__(self, proxies=None):
4424 # Set default handlers
4425 for type in ('http', 'https'):
4426 setattr(self, '%s_open' % type,
4427 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4428 meth(r, proxy, type))
4429 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4430
4431 def proxy_open(self, req, proxy, type):
4432 req_proxy = req.headers.get('Ytdl-request-proxy')
4433 if req_proxy is not None:
4434 proxy = req_proxy
4435 del req.headers['Ytdl-request-proxy']
4436
4437 if proxy == '__noproxy__':
4438 return None # No Proxy
4439 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4440 req.add_header('Ytdl-socks-proxy', proxy)
4441 # yt-dlp's http/https handlers take care of wrapping the socket with SOCKS
4442 return None
4443 return compat_urllib_request.ProxyHandler.proxy_open(
4444 self, req, proxy, type)
4445
4446
4447 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4448 # released into Public Domain
4449 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4450
4451 def long_to_bytes(n, blocksize=0):
4452 """long_to_bytes(n:long, blocksize:int) : string
4453 Convert a long integer to a byte string.
4454
4455 If optional blocksize is given and greater than zero, pad the front of the
4456 byte string with binary zeros so that the length is a multiple of
4457 blocksize.
4458 """
4459 # after much testing, this algorithm was deemed to be the fastest
4460 s = b''
4461 n = int(n)
4462 while n > 0:
4463 s = compat_struct_pack('>I', n & 0xffffffff) + s
4464 n = n >> 32
4465 # strip off leading zeros
4466 for i in range(len(s)):
4467 if s[i] != b'\000'[0]:
4468 break
4469 else:
4470 # only happens when n == 0
4471 s = b'\000'
4472 i = 0
4473 s = s[i:]
4474 # add back some pad bytes. this could be done more efficiently w.r.t. the
4475 # de-padding being done above, but sigh...
4476 if blocksize > 0 and len(s) % blocksize:
4477 s = (blocksize - len(s) % blocksize) * b'\000' + s
4478 return s
4479
4480
4481 def bytes_to_long(s):
4482 """bytes_to_long(string) : long
4483 Convert a byte string to a long integer.
4484
4485 This is (essentially) the inverse of long_to_bytes().
4486 """
4487 acc = 0
4488 length = len(s)
4489 if length % 4:
4490 extra = (4 - length % 4)
4491 s = b'\000' * extra + s
4492 length = length + extra
4493 for i in range(0, length, 4):
4494 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4495 return acc
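# Round-trip sketch for the two helpers above:
#   >>> long_to_bytes(4660)
#   b'\x124'
#   >>> long_to_bytes(4660, blocksize=4)
#   b'\x00\x00\x124'
#   >>> bytes_to_long(b'\x124')
#   4660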
4496
4497
4498 def ohdave_rsa_encrypt(data, exponent, modulus):
4499 '''
4500 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4501
4502 Input:
4503 data: data to encrypt, bytes-like object
4504 exponent, modulus: parameter e and N of RSA algorithm, both integer
4505 Output: hex string of encrypted data
4506
4507 Limitation: supports one block encryption only
4508 '''
4509
4510 payload = int(binascii.hexlify(data[::-1]), 16)
4511 encrypted = pow(payload, exponent, modulus)
4512 return '%x' % encrypted
4513
4514
4515 def pkcs1pad(data, length):
4516 """
4517 Padding input data with PKCS#1 scheme
4518
4519 @param {int[]} data input data
4520 @param {int} length target length
4521 @returns {int[]} padded data
4522 """
4523 if len(data) > length - 11:
4524 raise ValueError('Input data too long for PKCS#1 padding')
4525
4526 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # padding octets must be non-zero (RFC 8017 §7.2.1)
4527 return [0, 2] + pseudo_random + [0] + data
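# For example, padding a one-byte message into a 16-octet block yields
# [0, 2, <12 random non-zero octets>, 0, 42]:
#   >>> len(pkcs1pad([42], 16))
#   16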
4528
4529
4530 def encode_base_n(num, n, table=None):
4531 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4532 if not table:
4533 table = FULL_TABLE[:n]
4534
4535 if n > len(table):
4536 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4537
4538 if num == 0:
4539 return table[0]
4540
4541 ret = ''
4542 while num:
4543 ret = table[num % n] + ret
4544 num = num // n
4545 return ret
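# For example:
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> encode_base_n(52, 36)
#   '1g'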
4546
4547
4548 def decode_packed_codes(code):
4549 mobj = re.search(PACKED_CODES_RE, code)
4550 obfuscated_code, base, count, symbols = mobj.groups()
4551 base = int(base)
4552 count = int(count)
4553 symbols = symbols.split('|')
4554 symbol_table = {}
4555
4556 while count:
4557 count -= 1
4558 base_n_count = encode_base_n(count, base)
4559 symbol_table[base_n_count] = symbols[count] or base_n_count
4560
4561 return re.sub(
4562 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4563 obfuscated_code)
4564
4565
4566 def caesar(s, alphabet, shift):
4567 if shift == 0:
4568 return s
4569 l = len(alphabet)
4570 return ''.join(
4571 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4572 for c in s)
4573
4574
4575 def rot47(s):
4576 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
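# For example:
#   >>> caesar('hello', 'abcdefghijklmnopqrstuvwxyz', 1)
#   'ifmmp'
#   >>> rot47(rot47('secret'))  # applying rot47 twice restores the input
#   'secret'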
4577
4578
4579 def parse_m3u8_attributes(attrib):
4580 info = {}
4581 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4582 if val.startswith('"'):
4583 val = val[1:-1]
4584 info[key] = val
4585 return info
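# For example, parsing an EXT-X-STREAM-INF attribute list:
#   >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.4d401f"')
#   {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.4d401f'}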
4586
4587
4588 def urshift(val, n):
4589 return val >> n if val >= 0 else (val + 0x100000000) >> n
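# urshift emulates JavaScript's zero-fill right shift (>>>) for 32-bit values:
#   >>> urshift(-1, 8)
#   16777215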
4590
4591
4592 # Based on png2str() written by @gdkchan and improved by @yokrysty
4593 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4594 def decode_png(png_data):
4595 # Reference: https://www.w3.org/TR/PNG/
4596 header = png_data[8:]
4597
4598 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4599 raise OSError('Not a valid PNG file.')
4600
4601 int_map = {1: '>B', 2: '>H', 4: '>I'}
4602 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4603
4604 chunks = []
4605
4606 while header:
4607 length = unpack_integer(header[:4])
4608 header = header[4:]
4609
4610 chunk_type = header[:4]
4611 header = header[4:]
4612
4613 chunk_data = header[:length]
4614 header = header[length:]
4615
4616 header = header[4:] # Skip CRC
4617
4618 chunks.append({
4619 'type': chunk_type,
4620 'length': length,
4621 'data': chunk_data
4622 })
4623
4624 ihdr = chunks[0]['data']
4625
4626 width = unpack_integer(ihdr[:4])
4627 height = unpack_integer(ihdr[4:8])
4628
4629 idat = b''
4630
4631 for chunk in chunks:
4632 if chunk['type'] == b'IDAT':
4633 idat += chunk['data']
4634
4635 if not idat:
4636 raise OSError('Unable to read PNG data.')
4637
4638 decompressed_data = bytearray(zlib.decompress(idat))
4639
4640 stride = width * 3
4641 pixels = []
4642
4643 def _get_pixel(idx):
4644 x = idx % stride
4645 y = idx // stride
4646 return pixels[y][x]
4647
4648 for y in range(height):
4649 basePos = y * (1 + stride)
4650 filter_type = decompressed_data[basePos]
4651
4652 current_row = []
4653
4654 pixels.append(current_row)
4655
4656 for x in range(stride):
4657 color = decompressed_data[1 + basePos + x]
4658 basex = y * stride + x
4659 left = 0
4660 up = 0
4661
4662 if x > 2:
4663 left = _get_pixel(basex - 3)
4664 if y > 0:
4665 up = _get_pixel(basex - stride)
4666
4667 if filter_type == 1: # Sub
4668 color = (color + left) & 0xff
4669 elif filter_type == 2: # Up
4670 color = (color + up) & 0xff
4671 elif filter_type == 3: # Average
4672 color = (color + ((left + up) >> 1)) & 0xff
4673 elif filter_type == 4: # Paeth
4674 a = left
4675 b = up
4676 c = 0
4677
4678 if x > 2 and y > 0:
4679 c = _get_pixel(basex - stride - 3)
4680
4681 p = a + b - c
4682
4683 pa = abs(p - a)
4684 pb = abs(p - b)
4685 pc = abs(p - c)
4686
4687 if pa <= pb and pa <= pc:
4688 color = (color + a) & 0xff
4689 elif pb <= pc:
4690 color = (color + b) & 0xff
4691 else:
4692 color = (color + c) & 0xff
4693
4694 current_row.append(color)
4695
4696 return width, height, pixels
4697
4698
4699 def write_xattr(path, key, value):
4700 # This mess below finds the best xattr tool for the job
4701 try:
4702 # try the pyxattr module...
4703 import xattr
4704
4705 if hasattr(xattr, 'set'): # pyxattr
4706 # Unicode arguments are not supported in python-pyxattr until
4707 # version 0.5.0
4708 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4709 pyxattr_required_version = '0.5.0'
4710 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
4711 # TODO: fallback to CLI tools
4712 raise XAttrUnavailableError(
4713 'python-pyxattr is detected but is too old. '
4714 'yt-dlp requires %s or above while your version is %s. '
4715 'Falling back to other xattr implementations' % (
4716 pyxattr_required_version, xattr.__version__))
4717
4718 setxattr = xattr.set
4719 else: # xattr
4720 setxattr = xattr.setxattr
4721
4722 try:
4723 setxattr(path, key, value)
4724 except OSError as e:
4725 raise XAttrMetadataError(e.errno, e.strerror)
4726
4727 except ImportError:
4728 if compat_os_name == 'nt':
4729 # Write xattrs to NTFS Alternate Data Streams:
4730 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4731 assert ':' not in key
4732 assert os.path.exists(path)
4733
4734 ads_fn = path + ':' + key
4735 try:
4736 with open(ads_fn, 'wb') as f:
4737 f.write(value)
4738 except OSError as e:
4739 raise XAttrMetadataError(e.errno, e.strerror)
4740 else:
4741 user_has_setfattr = check_executable('setfattr', ['--version'])
4742 user_has_xattr = check_executable('xattr', ['-h'])
4743
4744 if user_has_setfattr or user_has_xattr:
4745
4746 value = value.decode('utf-8')
4747 if user_has_setfattr:
4748 executable = 'setfattr'
4749 opts = ['-n', key, '-v', value]
4750 elif user_has_xattr:
4751 executable = 'xattr'
4752 opts = ['-w', key, value]
4753
4754 cmd = ([encodeFilename(executable, True)]
4755 + [encodeArgument(o) for o in opts]
4756 + [encodeFilename(path, True)])
4757
4758 try:
4759 p = Popen(
4760 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4761 except OSError as e:
4762 raise XAttrMetadataError(e.errno, e.strerror)
4763 stdout, stderr = p.communicate_or_kill()
4764 stderr = stderr.decode('utf-8', 'replace')
4765 if p.returncode != 0:
4766 raise XAttrMetadataError(p.returncode, stderr)
4767
4768 else:
4769 # On Unix, but couldn't find pyxattr, setfattr, or xattr.
4770 if sys.platform.startswith('linux'):
4771 raise XAttrUnavailableError(
4772 "Couldn't find a tool to set the xattrs. "
4773 "Install either the python 'pyxattr' or 'xattr' "
4774 "modules, or the GNU 'attr' package "
4775 "(which contains the 'setfattr' tool).")
4776 else:
4777 raise XAttrUnavailableError(
4778 "Couldn't find a tool to set the xattrs. "
4779 "Install either the python 'xattr' module, "
4780 "or the 'xattr' binary.")
4781
4782
4783 def random_birthday(year_field, month_field, day_field):
4784 start_date = datetime.date(1950, 1, 1)
4785 end_date = datetime.date(1995, 12, 31)
4786 offset = random.randint(0, (end_date - start_date).days)
4787 random_date = start_date + datetime.timedelta(offset)
4788 return {
4789 year_field: str(random_date.year),
4790 month_field: str(random_date.month),
4791 day_field: str(random_date.day),
4792 }
4793
4794
4795 # Templates for internet shortcut files, which are plain text files.
4796 DOT_URL_LINK_TEMPLATE = '''\
4797 [InternetShortcut]
4798 URL=%(url)s
4799 '''
4800
4801 DOT_WEBLOC_LINK_TEMPLATE = '''\
4802 <?xml version="1.0" encoding="UTF-8"?>
4803 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4804 <plist version="1.0">
4805 <dict>
4806 \t<key>URL</key>
4807 \t<string>%(url)s</string>
4808 </dict>
4809 </plist>
4810 '''
4811
4812 DOT_DESKTOP_LINK_TEMPLATE = '''\
4813 [Desktop Entry]
4814 Encoding=UTF-8
4815 Name=%(filename)s
4816 Type=Link
4817 URL=%(url)s
4818 Icon=text-html
4819 '''
4820
4821 LINK_TEMPLATES = {
4822 'url': DOT_URL_LINK_TEMPLATE,
4823 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4824 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4825 }
4826
4827
4828 def iri_to_uri(iri):
4829 """
4830 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4831
4832 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters using an underlying UTF-8 encoding, except those already escaped, leaving the rest of the URI intact.
4833 """
4834
4835 iri_parts = compat_urllib_parse_urlparse(iri)
4836
4837 if '[' in iri_parts.netloc:
4838 raise ValueError('IPv6 URIs are not yet supported.')
4839 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4840
4841 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4842
4843 net_location = ''
4844 if iri_parts.username:
4845 net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~")
4846 if iri_parts.password is not None:
4847 net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~")
4848 net_location += '@'
4849
4850 net_location += iri_parts.hostname.encode('idna').decode('utf-8') # Punycode for Unicode hostnames.
4851 # The 'idna' encoding produces ASCII text.
4852 if iri_parts.port is not None and iri_parts.port != 80:
4853 net_location += ':' + str(iri_parts.port)
4854
4855 return urllib.parse.urlunparse(
4856 (iri_parts.scheme,
4857 net_location,
4858
4859 urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
4860
4861 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
4862 urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
4863
4864 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
4865 urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
4866
4867 urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4868
4869 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
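# For example (hostnames are punycoded via the 'idna' codec; in this
# hypothetical URL only the path needs escaping):
#   >>> iri_to_uri('https://example.com/día')
#   'https://example.com/d%C3%ADa'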
4870
4871
4872 def to_high_limit_path(path):
4873 if sys.platform in ['win32', 'cygwin']:
4874 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
4875 return '\\\\?\\' + os.path.abspath(path)
4876
4877 return path
4878
4879
4880 def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
4881 val = traverse_obj(obj, *variadic(field))
4882 if val in ignore:
4883 return default
4884 return template % (func(val) if func else val)
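# Illustrative usage (the field names are hypothetical):
#   >>> format_field({'width': 1920}, 'width', '%dpx')
#   '1920px'
#   >>> format_field({}, 'width', '%dpx', default='unknown')
#   'unknown'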
4885
4886
4887 def clean_podcast_url(url):
4888 return re.sub(r'''(?x)
4889 (?:
4890 (?:
4891 chtbl\.com/track|
4892 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
4893 play\.podtrac\.com
4894 )/[^/]+|
4895 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4896 flex\.acast\.com|
4897 pd(?:
4898 cn\.co| # https://podcorn.com/analytics-prefix/
4899 st\.fm # https://podsights.com/docs/
4900 )/e
4901 )/''', '', url)
4902
4903
4904 _HEX_TABLE = '0123456789abcdef'
4905
4906
4907 def random_uuidv4():
4908 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4909
4910
4911 def make_dir(path, to_screen=None):
4912 try:
4913 dn = os.path.dirname(path)
4914 if dn and not os.path.exists(dn):
4915 os.makedirs(dn)
4916 return True
4917 except OSError as err:
4918 if callable(to_screen):
4919 to_screen('unable to create directory ' + error_to_compat_str(err))
4920 return False
4921
4922
4923 def get_executable_path():
4924 from zipimport import zipimporter
4925 if hasattr(sys, 'frozen'): # Running from PyInstaller
4926 path = os.path.dirname(sys.executable)
4927 elif isinstance(__loader__, zipimporter): # Running from ZIP
4928 path = os.path.join(os.path.dirname(__file__), '../..')
4929 else:
4930 path = os.path.join(os.path.dirname(__file__), '..')
4931 return os.path.abspath(path)
4932
4933
4934 def load_plugins(name, suffix, namespace):
4935 classes = {}
4936 try:
4937 plugins_spec = importlib.util.spec_from_file_location(
4938 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
4939 plugins = importlib.util.module_from_spec(plugins_spec)
4940 sys.modules[plugins_spec.name] = plugins
4941 plugins_spec.loader.exec_module(plugins)
4942 for name in dir(plugins):
4943 if name in namespace:
4944 continue
4945 if not name.endswith(suffix):
4946 continue
4947 klass = getattr(plugins, name)
4948 classes[name] = namespace[name] = klass
4949 except FileNotFoundError:
4950 pass
4951 return classes
4952
4953
4954 def traverse_obj(
4955 obj, *path_list, default=None, expected_type=None, get_all=True,
4956 casesense=True, is_user_input=False, traverse_string=False):
4957 ''' Traverse nested list/dict/tuple
4958 @param path_list A list of paths which are checked one by one.
4959 Each path is a list of keys where each key is a string,
4960 a function, a tuple of strings/None or "...".
4961 When a function is given, it takes the key and value as arguments
4962 and returns whether the key matches or not. When a tuple is given,
4963 all the keys given in the tuple are traversed.
4964 "..." traverses all the keys in the object, and
4965 "None" returns the object without traversal.
4966 @param default Default value to return
4967 @param expected_type Only accept final value of this type (Can also be any callable)
4968 @param get_all Return all the values obtained from a path or only the first one
4969 @param casesense Whether to consider dictionary keys as case sensitive
4970 @param is_user_input Whether the keys are generated from user input. If True,
4971 strings are converted to int/slice if necessary
4972 @param traverse_string Whether to traverse inside strings. If True, any
4973 non-compatible object will also be converted into a string
4974 # TODO: Write tests
4975 '''
4976 if not casesense:
4977 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
4978 path_list = (map(_lower, variadic(path)) for path in path_list)
4979
4980 def _traverse_obj(obj, path, _current_depth=0):
4981 nonlocal depth
4982 path = tuple(variadic(path))
4983 for i, key in enumerate(path):
4984 if None in (key, obj):
4985 return obj
4986 if isinstance(key, (list, tuple)):
4987 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
4988 key = ...
4989 if key is ...:
4990 obj = (obj.values() if isinstance(obj, dict)
4991 else obj if isinstance(obj, (list, tuple, LazyList))
4992 else str(obj) if traverse_string else [])
4993 _current_depth += 1
4994 depth = max(depth, _current_depth)
4995 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
4996 elif callable(key):
4997 if isinstance(obj, (list, tuple, LazyList)):
4998 obj = enumerate(obj)
4999 elif isinstance(obj, dict):
5000 obj = obj.items()
5001 else:
5002 if not traverse_string:
5003 return None
5004 obj = str(obj)
5005 _current_depth += 1
5006 depth = max(depth, _current_depth)
5007 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))]
5008 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5009 obj = (obj.get(key) if casesense or (key in obj)
5010 else next((v for k, v in obj.items() if _lower(k) == key), None))
5011 else:
5012 if is_user_input:
5013 key = (int_or_none(key) if ':' not in key
5014 else slice(*map(int_or_none, key.split(':'))))
5015 if key == slice(None):
5016 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5017 if not isinstance(key, (int, slice)):
5018 return None
5019 if not isinstance(obj, (list, tuple, LazyList)):
5020 if not traverse_string:
5021 return None
5022 obj = str(obj)
5023 try:
5024 obj = obj[key]
5025 except IndexError:
5026 return None
5027 return obj
5028
5029 if isinstance(expected_type, type):
5030 type_test = lambda val: val if isinstance(val, expected_type) else None
5031 elif expected_type is not None:
5032 type_test = expected_type
5033 else:
5034 type_test = lambda val: val
5035
5036 for path in path_list:
5037 depth = 0
5038 val = _traverse_obj(obj, path)
5039 if val is not None:
5040 if depth:
5041 for _ in range(depth - 1):
5042 val = itertools.chain.from_iterable(v for v in val if v is not None)
5043 val = [v for v in map(type_test, val) if v is not None]
5044 if val:
5045 return val if get_all else val[0]
5046 else:
5047 val = type_test(val)
5048 if val is not None:
5049 return val
5050 return default
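# A small sketch of the path semantics described in the docstring:
#   >>> d = {'a': [{'b': 1}, {'b': 2}]}
#   >>> traverse_obj(d, ('a', 0, 'b'))
#   1
#   >>> traverse_obj(d, ('a', ..., 'b'))
#   [1, 2]
#   >>> traverse_obj(d, ('a', ..., 'b'), get_all=False)
#   1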
5051
5052
5053 def traverse_dict(dictn, keys, casesense=True):
5054 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5055 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5056 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5057
5058
5059 def get_first(obj, keys, **kwargs):
5060 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5061
5062
5063 def variadic(x, allowed_types=(str, bytes, dict)):
5064 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5065
5066
5067 def decode_base(value, digits):
5068 # Convert the given base-x string to a scalar integer
5069 table = {char: index for index, char in enumerate(digits)}
5070 result = 0
5071 base = len(digits)
5072 for char in value:
5073 result *= base
5074 result += table[char]
5075 return result
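# decode_base is the inverse of encode_base_n for a matching digit table:
#   >>> decode_base('ff', '0123456789abcdef')
#   255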
5076
5077
5078 def time_seconds(**kwargs):
5079 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5080 return t.timestamp()
5081
5082
5083 # Create a JSON Web Signature (JWS) with the HS256 algorithm;
5084 # the resulting format is JWS Compact Serialization,
5085 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5086 # and JWS https://www.rfc-editor.org/rfc/rfc7515.html
5087 def jwt_encode_hs256(payload_data, key, headers={}):
5088 header_data = {
5089 'alg': 'HS256',
5090 'typ': 'JWT',
5091 }
5092 if headers:
5093 header_data.update(headers)
5094 header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
5095 payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
5096 h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
5097 signature_b64 = base64.b64encode(h.digest())
5098 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5099 return token
5100
5101
5102 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5103 def jwt_decode_hs256(jwt):
5104 header_b64, payload_b64, signature_b64 = jwt.split('.')
5105 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5106 return payload_data
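# Round-trip sketch. Note that jwt_encode_hs256 emits standard padded base64
# rather than the unpadded base64url required by RFC 7515, so the decoder above
# only handles its output cleanly when no '+' or '/' occurs in the encoding:
#   >>> token = jwt_encode_hs256({'sub': 'user'}, 'secret')
#   >>> jwt_decode_hs256(token.decode('utf-8'))
#   {'sub': 'user'}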
5107
5108
5109 def supports_terminal_sequences(stream):
5110 if compat_os_name == 'nt':
5111 from .compat import WINDOWS_VT_MODE # Must be imported locally
5112 if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
5113 return False
5114 elif not os.getenv('TERM'):
5115 return False
5116 try:
5117 return stream.isatty()
5118 except BaseException:
5119 return False
5120
5121
5122 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5123
5124
5125 def remove_terminal_sequences(string):
5126 return _terminal_sequences_re.sub('', string)
5127
5128
5129 def number_of_digits(number):
5130 return len('%d' % number)
5131
5132
5133 def join_nonempty(*values, delim='-', from_dict=None):
5134 if from_dict is not None:
5135 values = map(from_dict.get, values)
5136 return delim.join(map(str, filter(None, values)))
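# For example:
#   >>> join_nonempty('720p', None, 'dash', '')
#   '720p-dash'
#   >>> join_nonempty('id', 'title', delim=' ', from_dict={'id': '42', 'title': 'x'})
#   '42 x'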
5137
5138
5139 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5140 """
5141 Find the largest format dimensions in terms of video width and, for each thumbnail:
5142 * Modify the URL: Match the width with the provided regex and replace it with the largest format width
5143 * Update dimensions
5144
5145 This function is useful with video services that scale the provided thumbnails on demand
5146 """
5147 _keys = ('width', 'height')
5148 max_dimensions = max(
5149 (tuple(format.get(k) or 0 for k in _keys) for format in formats),
5150 default=(0, 0))
5151 if not max_dimensions[0]:
5152 return thumbnails
5153 return [
5154 merge_dicts(
5155 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5156 dict(zip(_keys, max_dimensions)), thumbnail)
5157 for thumbnail in thumbnails
5158 ]
5159
5160
5161 def parse_http_range(range):
5162 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5163 if not range:
5164 return None, None, None
5165 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5166 if not crg:
5167 return None, None, None
5168 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
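# For example:
#   >>> parse_http_range('bytes 0-499/1234')
#   (0, 499, 1234)
#   >>> parse_http_range('bytes=500-')
#   (500, None, None)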
5169
5170
5171 class Config:
5172 own_args = None
5173 filename = None
5174 __initialized = False
5175
5176 def __init__(self, parser, label=None):
5177 self._parser, self.label = parser, label
5178 self._loaded_paths, self.configs = set(), []
5179
5180 def init(self, args=None, filename=None):
5181 assert not self.__initialized
5182 directory = ''
5183 if filename:
5184 location = os.path.realpath(filename)
5185 directory = os.path.dirname(location)
5186 if location in self._loaded_paths:
5187 return False
5188 self._loaded_paths.add(location)
5189
5190 self.__initialized = True
5191 self.own_args, self.filename = args, filename
5192 for location in self._parser.parse_args(args)[0].config_locations or []:
5193 location = os.path.join(directory, expand_path(location))
5194 if os.path.isdir(location):
5195 location = os.path.join(location, 'yt-dlp.conf')
5196 if not os.path.exists(location):
5197 self._parser.error(f'config location {location} does not exist')
5198 self.append_config(self.read_file(location), location)
5199 return True
5200
5201 def __str__(self):
5202 label = join_nonempty(
5203 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5204 delim=' ')
5205 return join_nonempty(
5206 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5207 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5208 delim='\n')
5209
5210 @staticmethod
5211 def read_file(filename, default=[]):
5212 try:
5213 optionf = open(filename)
5214 except OSError:
5215 return default # silently skip if file is not present
5216 try:
5217 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5218 contents = optionf.read()
5219 res = shlex.split(contents, comments=True)
5220 finally:
5221 optionf.close()
5222 return res
5223
5224 @staticmethod
5225 def hide_login_info(opts):
5226 PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'}
5227 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5228
5229 def _scrub_eq(o):
5230 m = eqre.match(o)
5231 if m:
5232 return m.group('key') + '=PRIVATE'
5233 else:
5234 return o
5235
5236 opts = list(map(_scrub_eq, opts))
5237 for idx, opt in enumerate(opts):
5238 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5239 opts[idx + 1] = 'PRIVATE'
5240 return opts
5241
5242 def append_config(self, *args, label=None):
5243 config = type(self)(self._parser, label)
5244 config._loaded_paths = self._loaded_paths
5245 if config.init(*args):
5246 self.configs.append(config)
5247
5248 @property
5249 def all_args(self):
5250 for config in reversed(self.configs):
5251 yield from config.all_args
5252 yield from self.own_args or []
5253
5254 def parse_args(self):
5255 return self._parser.parse_args(list(self.all_args))
5256
5257
5258 class WebSocketsWrapper:
5259 """Wraps the websockets module for use in non-async scopes"""
5260
5261 def __init__(self, url, headers=None, connect=True):
5262 self.loop = asyncio.events.new_event_loop()
5263 self.conn = compat_websockets.connect(
5264 url, extra_headers=headers, ping_interval=None,
5265 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5266 if connect:
5267 self.__enter__()
5268 atexit.register(self.__exit__, None, None, None)
5269
5270 def __enter__(self):
5271 if not getattr(self, 'pool', None):  # pool is only set once the connection has been entered
5272 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5273 return self
5274
5275 def send(self, *args):
5276 self.run_with_loop(self.pool.send(*args), self.loop)
5277
5278 def recv(self, *args):
5279 return self.run_with_loop(self.pool.recv(*args), self.loop)
5280
5281 def __exit__(self, type, value, traceback):
5282 try:
5283 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5284 finally:
5285 self._cancel_all_tasks(self.loop)
5286 self.loop.close()
5287
5288 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5289 # for contributors: If a new library that uses asyncio needs to be run in non-async scopes, move these functions out of this class
5290 @staticmethod
5291 def run_with_loop(main, loop):
5292 if not asyncio.coroutines.iscoroutine(main):
5293 raise ValueError(f'a coroutine was expected, got {main!r}')
5294
5295 try:
5296 return loop.run_until_complete(main)
5297 finally:
5298 loop.run_until_complete(loop.shutdown_asyncgens())
5299 if hasattr(loop, 'shutdown_default_executor'):
5300 loop.run_until_complete(loop.shutdown_default_executor())
5301
5302 @staticmethod
5303 def _cancel_all_tasks(loop):
5304 to_cancel = asyncio.tasks.all_tasks(loop)
5305
5306 if not to_cancel:
5307 return
5308
5309 for task in to_cancel:
5310 task.cancel()
5311
5312 loop.run_until_complete(
5313 asyncio.tasks.gather(*to_cancel, loop=loop, return_exceptions=True))
5314
5315 for task in to_cancel:
5316 if task.cancelled():
5317 continue
5318 if task.exception() is not None:
5319 loop.call_exception_handler({
5320 'message': 'unhandled exception during asyncio.run() shutdown',
5321 'exception': task.exception(),
5322 'task': task,
5323 })
5324
5325
5326 has_websockets = bool(compat_websockets)
5327
5328
5329 def merge_headers(*dicts):
5330 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5331 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
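# For example (later dicts win, and key casing is normalized via str.title):
#   >>> merge_headers({'user-agent': 'UA1', 'Accept': '*/*'}, {'User-Agent': 'UA2'})
#   {'User-Agent': 'UA2', 'Accept': '*/*'}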
5332
5333
5334 class classproperty:
5335 def __init__(self, f):
5336 self.f = f
5337
5338 def __get__(self, _, cls):
5339 return self.f(cls)