#!/usr/bin/env python3
# coding: utf-8

from __future__ import unicode_literals

import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.utils
import email.header
import errno
import functools
import gzip
import hashlib
import hmac
import importlib.util
import io
import itertools
import json
import locale
import math
import operator
import os
import platform
import random
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import traceback
import xml.etree.ElementTree
import zlib
import mimetypes

from .compat import (
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_HTTPError,
    compat_basestring,
    compat_chr,
    compat_cookiejar,
    compat_ctypes_WINFUNCTYPE,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_integer_types,
    compat_numeric_types,
    compat_kwargs,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_split,
    compat_shlex_quote,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_urlunparse,
    compat_urllib_parse_quote,
    compat_urllib_parse_quote_plus,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'

def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        try:
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        except OSError:
            pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
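
# Illustrative usage (a sketch added for this annotated listing, not part of
# the module; the namespace URL is made up). xpath_with_ns() expands each
# `prefix:tag` step using the supplied namespace map:
#
#   >>> xpath_with_ns('media:song/media:url', {'media': 'http://example.com/ns'})
#   '{http://example.com/ns}song/{http://example.com/ns}url'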


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_html_by_id(id, html):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = r'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
        ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
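
# Illustrative usage (a sketch added for this annotated listing, not part of
# the module; the markup is made up). The generator yields a (text, html)
# pair for every element carrying the attribute:
#
#   >>> list(get_elements_text_and_html_by_attribute(
#   ...     'data-id', 'x', '<span data-id="x">hello</span>'))
#   [('hello', '<span data-id="x">hello</span>')]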


class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
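
# Illustrative usage (a sketch added for this annotated listing, not part of
# the module; the markup is made up). Nested same-name tags are tracked on a
# stack, so only the matching close ends the element:
#
#   >>> get_element_text_and_html_by_tag('a', '<div><a href="#">link</a></div>')
#   ('link', '<a href="#">link</a>')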


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
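
# Illustrative usage (a sketch added for this annotated listing, not part of
# the module). <br> and </p><p> boundaries become newlines, remaining tags are
# stripped, and entities are decoded:
#
#   >>> clean_html('<p>one<br>two</p><p>&amp; three</p>')
#   'one\ntwo\n& three'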


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return ' '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    if s == '':
        return ''
    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
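
# Illustrative usage (a sketch added for this annotated listing, not part of
# the module). Timestamps keep their digits (':' becomes '_'), and restricted
# mode maps unsafe characters to ASCII-only replacements:
#
#   >>> sanitize_filename('10:30 news')
#   '10_30 news'
#   >>> sanitize_filename('Foo: bar/baz?', restricted=True)
#   'Foo_-_bar_baz'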


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
        if sys.version_info < (2, 7) and not drive_or_unc:
            drive_or_unc, _ = os.path.splitunc(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
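
# Illustrative usage (a sketch added for this annotated listing, not part of
# the module; the URLs are made up):
#
#   >>> sanitize_url('//example.com/watch')
#   'http://example.com/watch'
#   >>> sanitize_url('httpss://example.com/watch')
#   'https://example.com/watch'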


def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
    return url, 'Basic ' + auth_payload.decode('utf-8')
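
# Illustrative usage (a sketch added for this annotated listing, not part of
# the module; the credentials are made up). URL-embedded credentials are moved
# into an HTTP Basic Authorization header value:
#
#   >>> extract_basic_auth('http://user:pass@example.com/x')
#   ('http://example.com/x', 'Basic dXNlcjpwYXNz')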


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
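
# Illustrative usage (a sketch added for this annotated listing, not part of
# the module). Duplicates are dropped while first-seen order is preserved:
#
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]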


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )
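
# Illustrative usage (a sketch added for this annotated listing, not part of
# the module). '&' is replaced first so the '&' introduced by the other
# entities is not escaped a second time:
#
#   >>> escapeHTML('"Jack & Jill"')
#   '&quot;Jack &amp; Jill&quot;'
#   >>> unescapeHTML('&quot;Jack &amp; Jill&quot;')
#   '"Jack & Jill"'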


def process_communicate_or_kill(p, *args, **kwargs):
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        return process_communicate_or_kill(self, *args, **kwargs)


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
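
# Illustrative usage (a sketch added for this annotated listing, not part of
# the module). Successive divmod calls split a millisecond count into
# h/m/s/ms fields, which formatSeconds then renders:
#
#   >>> timetuple_from_msec(123456)
#   Time(hours=0, minutes=2, seconds=3, milliseconds=456)
#   >>> formatSeconds(3661)
#   '1:01:01'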


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        try:
            ssl_context.load_verify_locations(cadata=cert)
        except ssl.SSLError:
            pass


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                # Create a new context to discard any certificates that were already loaded
                context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
                context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    if ytdl_is_updateable():
        update_cmd = 'type yt-dlp -U to update'
    else:
        update_cmd = 'see https://github.com/yt-dlp/yt-dlp on how to update'
    msg = 'please report this issue on https://github.com/yt-dlp/yt-dlp .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call yt-dlp with the --verbose flag and include its complete output.'

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super(ExtractorError, self).__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            self.msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super(GeoRestrictedError, self).__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)

class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)

class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure:
                    secure = 'TRUE'
                else:
                    secure = 'FALSE'
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                else:
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                else:
                    expires = ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # On python 2 urlh.geturl() may sometimes return redirect URL
        # as byte string instead of unicode. This workaround allows
        # to force it always return unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True)


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                              # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                            # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|                   # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d))     # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                             # optional space
                (?P<sign>\+|-)                                   # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})       # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
1731
1732
1733 def parse_iso8601(date_str, delimiter='T', timezone=None):
1734 """ Return a UNIX timestamp from the given date """
1735
1736 if date_str is None:
1737 return None
1738
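# Strip fractional seconds - the strptime format used below has no field for them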
1739 date_str = re.sub(r'\.[0-9]+', '', date_str)
1740
1741 if timezone is None:
1742 timezone, date_str = extract_timezone(date_str)
1743
1744 try:
1745 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1746 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1747 return calendar.timegm(dt.timetuple())
1748 except ValueError:
1749 pass
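# Illustrative usage (a sketch; timestamps verified by hand, not taken from this file):
#   parse_iso8601('2014-03-23T23:04:26+0100') == 1395612266  # i.e. 22:04:26 UTC
#   parse_iso8601('2014-03-23 23:04:26Z', delimiter=' ') == 1395615866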
1750
1751
1752 def date_formats(day_first=True):
1753 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1754
1755
1756 def unified_strdate(date_str, day_first=True):
1757 """Return a string with the date in the format YYYYMMDD"""
1758
1759 if date_str is None:
1760 return None
1761 upload_date = None
1762 # Replace commas
1763 date_str = date_str.replace(',', ' ')
1764 # Remove AM/PM + timezone
1765 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1766 _, date_str = extract_timezone(date_str)
1767
1768 for expression in date_formats(day_first):
1769 try:
1770 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1771 except ValueError:
1772 pass
1773 if upload_date is None:
1774 timetuple = email.utils.parsedate_tz(date_str)
1775 if timetuple:
1776 try:
1777 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1778 except ValueError:
1779 pass
1780 if upload_date is not None:
1781 return compat_str(upload_date)
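# Illustrative usage (assuming '%B %d %Y' is among the DATE_FORMATS defined
# earlier in this file):
#   unified_strdate('December 21, 2010') == '20101221'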
1782
1783
1784 def unified_timestamp(date_str, day_first=True):
1785 if date_str is None:
1786 return None
1787
1788 date_str = re.sub(r'[,|]', '', date_str)
1789
1790 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1791 timezone, date_str = extract_timezone(date_str)
1792
1793 # Remove AM/PM + timezone
1794 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1795
1796 # Remove unrecognized timezones from ISO 8601 alike timestamps
1797 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1798 if m:
1799 date_str = date_str[:-len(m.group('tz'))]
1800
1801 # Python only supports microseconds, so remove nanoseconds
1802 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1803 if m:
1804 date_str = m.group(1)
1805
1806 for expression in date_formats(day_first):
1807 try:
1808 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1809 return calendar.timegm(dt.timetuple())
1810 except ValueError:
1811 pass
1812 timetuple = email.utils.parsedate_tz(date_str)
1813 if timetuple:
1814 return calendar.timegm(timetuple) + pm_delta * 3600
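# Illustrative usage (same DATE_FORMATS assumption as unified_strdate above):
#   unified_timestamp('December 21, 2010') == 1292889600  # 2010-12-21 00:00 UTC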
1815
1816
1817 def determine_ext(url, default_ext='unknown_video'):
1818 if url is None or '.' not in url:
1819 return default_ext
1820 guess = url.partition('?')[0].rpartition('.')[2]
1821 if re.match(r'^[A-Za-z0-9]+$', guess):
1822 return guess
1823 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1824 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1825 return guess.rstrip('/')
1826 else:
1827 return default_ext
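# Illustrative usage:
#   determine_ext('http://example.com/foo/bar.mp4?download=true') == 'mp4'
#   determine_ext('http://example.com/stream') == 'unknown_video'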
1828
1829
1830 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1831 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1832
1833
1834 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1835 """
1836 Return a datetime object from a string in the format YYYYMMDD or
1837 (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1838
1839 format: string date format used to parse date_str into a datetime object
1840 precision: round the time portion of a datetime object.
1841 auto|microsecond|second|minute|hour|day.
1842 auto: round to the unit provided in date_str (if applicable).
1843 """
1844 auto_precision = False
1845 if precision == 'auto':
1846 auto_precision = True
1847 precision = 'microsecond'
1848 today = datetime_round(datetime.datetime.now(), precision)
1849 if date_str in ('now', 'today'):
1850 return today
1851 if date_str == 'yesterday':
1852 return today - datetime.timedelta(days=1)
1853 match = re.match(
1854 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
1855 date_str)
1856 if match is not None:
1857 start_time = datetime_from_str(match.group('start'), precision, format)
1858 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1859 unit = match.group('unit')
1860 if unit == 'month' or unit == 'year':
1861 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1862 unit = 'day'
1863 else:
1864 if unit == 'week':
1865 unit = 'day'
1866 time *= 7
1867 delta = datetime.timedelta(**{unit + 's': time})
1868 new_date = start_time + delta
1869 if auto_precision:
1870 return datetime_round(new_date, unit)
1871 return new_date
1872
1873 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1874
1875
1876 def date_from_str(date_str, format='%Y%m%d'):
1877 """
1878 Return a datetime object from a string in the format YYYYMMDD or
1879 (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1880
1881 format: string date format used to parse date_str into a datetime object
1882 """
1883 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1884
1885
1886 def datetime_add_months(dt, months):
1887 """Increment/Decrement a datetime object by months."""
1888 month = dt.month + months - 1
1889 year = dt.year + month // 12
1890 month = month % 12 + 1
1891 day = min(dt.day, calendar.monthrange(year, month)[1])
1892 return dt.replace(year, month, day)
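# Illustrative usage (the day is clamped to the last day of the target month):
#   datetime_add_months(datetime.date(2020, 1, 31), 1) == datetime.date(2020, 2, 29)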
1893
1894
1895 def datetime_round(dt, precision='day'):
1896 """
1897 Round a datetime object's time to a specific precision
1898 """
1899 if precision == 'microsecond':
1900 return dt
1901
1902 unit_seconds = {
1903 'day': 86400,
1904 'hour': 3600,
1905 'minute': 60,
1906 'second': 1,
1907 }
1908 roundto = lambda x, n: ((x + n / 2) // n) * n
1909 timestamp = calendar.timegm(dt.timetuple())
1910 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1911
1912
1913 def hyphenate_date(date_str):
1914 """
1915 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1916 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1917 if match is not None:
1918 return '-'.join(match.groups())
1919 else:
1920 return date_str
1921
1922
1923 class DateRange(object):
1924 """Represents a time interval between two dates"""
1925
1926 def __init__(self, start=None, end=None):
1927 """start and end must be strings in the format accepted by date"""
1928 if start is not None:
1929 self.start = date_from_str(start)
1930 else:
1931 self.start = datetime.datetime.min.date()
1932 if end is not None:
1933 self.end = date_from_str(end)
1934 else:
1935 self.end = datetime.datetime.max.date()
1936 if self.start > self.end:
1937 raise ValueError('Date range: "%s", the start date must be before the end date' % self)
1938
1939 @classmethod
1940 def day(cls, day):
1941 """Returns a range that only contains the given day"""
1942 return cls(day, day)
1943
1944 def __contains__(self, date):
1945 """Check if the date is in the range"""
1946 if not isinstance(date, datetime.date):
1947 date = date_from_str(date)
1948 return self.start <= date <= self.end
1949
1950 def __str__(self):
1951 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
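# Illustrative usage:
#   '20200315' in DateRange('20200101', '20200630')  # True
#   DateRange.day('20200101')                        # the single day 2020-01-01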
1952
1953
1954 def platform_name():
1955 """ Returns the platform name as a compat_str """
1956 res = platform.platform()
1957 if isinstance(res, bytes):
1958 res = res.decode(preferredencoding())
1959
1960 assert isinstance(res, compat_str)
1961 return res
1962
1963
1964 def get_windows_version():
1965 ''' Get Windows version. None if it's not running on Windows '''
1966 if compat_os_name == 'nt':
1967 return version_tuple(platform.win32_ver()[1])
1968 else:
1969 return None
1970
1971
1972 def _windows_write_string(s, out):
1973 """ Returns True if the string was written using special methods,
1974 False if it has yet to be written out."""
1975 # Adapted from http://stackoverflow.com/a/3259271/35070
1976
1977 import ctypes.wintypes
1978
1979 WIN_OUTPUT_IDS = {
1980 1: -11,
1981 2: -12,
1982 }
1983
1984 try:
1985 fileno = out.fileno()
1986 except AttributeError:
1987 # If the output stream doesn't have a fileno, it's virtual
1988 return False
1989 except io.UnsupportedOperation:
1990 # Some strange Windows pseudo files?
1991 return False
1992 if fileno not in WIN_OUTPUT_IDS:
1993 return False
1994
1995 GetStdHandle = compat_ctypes_WINFUNCTYPE(
1996 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1997 ('GetStdHandle', ctypes.windll.kernel32))
1998 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1999
2000 WriteConsoleW = compat_ctypes_WINFUNCTYPE(
2001 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
2002 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
2003 ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
2004 written = ctypes.wintypes.DWORD(0)
2005
2006 GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
2007 FILE_TYPE_CHAR = 0x0002
2008 FILE_TYPE_REMOTE = 0x8000
2009 GetConsoleMode = compat_ctypes_WINFUNCTYPE(
2010 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
2011 ctypes.POINTER(ctypes.wintypes.DWORD))(
2012 ('GetConsoleMode', ctypes.windll.kernel32))
2013 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
2014
2015 def not_a_console(handle):
2016 if handle == INVALID_HANDLE_VALUE or handle is None:
2017 return True
2018 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
2019 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
2020
2021 if not_a_console(h):
2022 return False
2023
2024 def next_nonbmp_pos(s):
2025 try:
2026 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
2027 except StopIteration:
2028 return len(s)
2029
2030 while s:
2031 count = min(next_nonbmp_pos(s), 1024)
2032
2033 ret = WriteConsoleW(
2034 h, s, count if count else 2, ctypes.byref(written), None)
2035 if ret == 0:
2036 raise OSError('Failed to write string')
2037 if not count: # We just wrote a non-BMP character
2038 assert written.value == 2
2039 s = s[1:]
2040 else:
2041 assert written.value > 0
2042 s = s[written.value:]
2043 return True
2044
2045
2046 def write_string(s, out=None, encoding=None):
2047 if out is None:
2048 out = sys.stderr
2049 assert type(s) == compat_str
2050
2051 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
2052 if _windows_write_string(s, out):
2053 return
2054
2055 if ('b' in getattr(out, 'mode', '')
2056 or sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
2057 byt = s.encode(encoding or preferredencoding(), 'ignore')
2058 out.write(byt)
2059 elif hasattr(out, 'buffer'):
2060 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2061 byt = s.encode(enc, 'ignore')
2062 out.buffer.write(byt)
2063 else:
2064 out.write(s)
2065 out.flush()
2066
2067
2068 def bytes_to_intlist(bs):
2069 if not bs:
2070 return []
2071 if isinstance(bs[0], int): # Python 3
2072 return list(bs)
2073 else:
2074 return [ord(c) for c in bs]
2075
2076
2077 def intlist_to_bytes(xs):
2078 if not xs:
2079 return b''
2080 return compat_struct_pack('%dB' % len(xs), *xs)
2081
2082
2083 # Cross-platform file locking
2084 if sys.platform == 'win32':
2085 import ctypes.wintypes
2086 import msvcrt
2087
2088 class OVERLAPPED(ctypes.Structure):
2089 _fields_ = [
2090 ('Internal', ctypes.wintypes.LPVOID),
2091 ('InternalHigh', ctypes.wintypes.LPVOID),
2092 ('Offset', ctypes.wintypes.DWORD),
2093 ('OffsetHigh', ctypes.wintypes.DWORD),
2094 ('hEvent', ctypes.wintypes.HANDLE),
2095 ]
2096
2097 kernel32 = ctypes.windll.kernel32
2098 LockFileEx = kernel32.LockFileEx
2099 LockFileEx.argtypes = [
2100 ctypes.wintypes.HANDLE, # hFile
2101 ctypes.wintypes.DWORD, # dwFlags
2102 ctypes.wintypes.DWORD, # dwReserved
2103 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2104 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2105 ctypes.POINTER(OVERLAPPED) # Overlapped
2106 ]
2107 LockFileEx.restype = ctypes.wintypes.BOOL
2108 UnlockFileEx = kernel32.UnlockFileEx
2109 UnlockFileEx.argtypes = [
2110 ctypes.wintypes.HANDLE, # hFile
2111 ctypes.wintypes.DWORD, # dwReserved
2112 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2113 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2114 ctypes.POINTER(OVERLAPPED) # Overlapped
2115 ]
2116 UnlockFileEx.restype = ctypes.wintypes.BOOL
2117 whole_low = 0xffffffff
2118 whole_high = 0x7fffffff
2119
2120 def _lock_file(f, exclusive):
2121 overlapped = OVERLAPPED()
2122 overlapped.Offset = 0
2123 overlapped.OffsetHigh = 0
2124 overlapped.hEvent = 0
2125 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2126 handle = msvcrt.get_osfhandle(f.fileno())
2127 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
2128 whole_low, whole_high, f._lock_file_overlapped_p):
2129 raise OSError('Locking file failed: %r' % ctypes.FormatError())
2130
2131 def _unlock_file(f):
2132 assert f._lock_file_overlapped_p
2133 handle = msvcrt.get_osfhandle(f.fileno())
2134 if not UnlockFileEx(handle, 0,
2135 whole_low, whole_high, f._lock_file_overlapped_p):
2136 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2137
2138 else:
2139 # Some platforms, such as Jython, are missing fcntl
2140 try:
2141 import fcntl
2142
2143 def _lock_file(f, exclusive):
2144 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
2145
2146 def _unlock_file(f):
2147 fcntl.flock(f, fcntl.LOCK_UN)
2148 except ImportError:
2149 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
2150
2151 def _lock_file(f, exclusive):
2152 raise IOError(UNSUPPORTED_MSG)
2153
2154 def _unlock_file(f):
2155 raise IOError(UNSUPPORTED_MSG)
2156
2157
2158 class locked_file(object):
2159 def __init__(self, filename, mode, encoding=None):
2160 assert mode in ['r', 'a', 'w']
2161 self.f = io.open(filename, mode, encoding=encoding)
2162 self.mode = mode
2163
2164 def __enter__(self):
2165 exclusive = self.mode != 'r'
2166 try:
2167 _lock_file(self.f, exclusive)
2168 except IOError:
2169 self.f.close()
2170 raise
2171 return self
2172
2173 def __exit__(self, etype, value, traceback):
2174 try:
2175 _unlock_file(self.f)
2176 finally:
2177 self.f.close()
2178
2179 def __iter__(self):
2180 return iter(self.f)
2181
2182 def write(self, *args):
2183 return self.f.write(*args)
2184
2185 def read(self, *args):
2186 return self.f.read(*args)
2187
2188
2189 def get_filesystem_encoding():
2190 encoding = sys.getfilesystemencoding()
2191 return encoding if encoding is not None else 'utf-8'
2192
2193
2194 def shell_quote(args):
2195 quoted_args = []
2196 encoding = get_filesystem_encoding()
2197 for a in args:
2198 if isinstance(a, bytes):
2199 # We may get a filename encoded with 'encodeFilename'
2200 a = a.decode(encoding)
2201 quoted_args.append(compat_shlex_quote(a))
2202 return ' '.join(quoted_args)
2203
2204
2205 def smuggle_url(url, data):
2206 """ Pass additional data in a URL for internal use. """
2207
2208 url, idata = unsmuggle_url(url, {})
2209 data.update(idata)
2210 sdata = compat_urllib_parse_urlencode(
2211 {'__youtubedl_smuggle': json.dumps(data)})
2212 return url + '#' + sdata
2213
2214
2215 def unsmuggle_url(smug_url, default=None):
2216 if '#__youtubedl_smuggle' not in smug_url:
2217 return smug_url, default
2218 url, _, sdata = smug_url.rpartition('#')
2219 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2220 data = json.loads(jsond)
2221 return url, data
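# Illustrative round-trip (the URL fragment carries the smuggled data):
#   url = smuggle_url('https://example.com/video', {'referer': 'https://example.com/'})
#   unsmuggle_url(url) == ('https://example.com/video', {'referer': 'https://example.com/'})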
2222
2223
2224 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2225 """ Formats numbers with decimal sufixes like K, M, etc """
2226 num, factor = float_or_none(num), float(factor)
2227 if num is None:
2228 return None
2229 exponent = 0 if num == 0 else int(math.log(abs(num), factor))  # abs() so negative numbers don't crash math.log
2230 suffix = ['', *'kMGTPEZY'][exponent]
2231 if factor == 1024:
2232 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2233 converted = num / (factor ** exponent)
2234 return fmt % (converted, suffix)
2235
2236
2237 def format_bytes(bytes):
2238 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
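# Illustrative usage:
#   format_decimal_suffix(1500, '%.1f%s') == '1.5k'
#   format_bytes(2048) == '2.00KiB'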
2239
2240
2241 def lookup_unit_table(unit_table, s):
2242 units_re = '|'.join(re.escape(u) for u in unit_table)
2243 m = re.match(
2244 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2245 if not m:
2246 return None
2247 num_str = m.group('num').replace(',', '.')
2248 mult = unit_table[m.group('unit')]
2249 return int(float(num_str) * mult)
2250
2251
2252 def parse_filesize(s):
2253 if s is None:
2254 return None
2255
2256 # The lower-case forms are of course incorrect and unofficial,
2257 # but we support those too
2258 _UNIT_TABLE = {
2259 'B': 1,
2260 'b': 1,
2261 'bytes': 1,
2262 'KiB': 1024,
2263 'KB': 1000,
2264 'kB': 1024,
2265 'Kb': 1000,
2266 'kb': 1000,
2267 'kilobytes': 1000,
2268 'kibibytes': 1024,
2269 'MiB': 1024 ** 2,
2270 'MB': 1000 ** 2,
2271 'mB': 1024 ** 2,
2272 'Mb': 1000 ** 2,
2273 'mb': 1000 ** 2,
2274 'megabytes': 1000 ** 2,
2275 'mebibytes': 1024 ** 2,
2276 'GiB': 1024 ** 3,
2277 'GB': 1000 ** 3,
2278 'gB': 1024 ** 3,
2279 'Gb': 1000 ** 3,
2280 'gb': 1000 ** 3,
2281 'gigabytes': 1000 ** 3,
2282 'gibibytes': 1024 ** 3,
2283 'TiB': 1024 ** 4,
2284 'TB': 1000 ** 4,
2285 'tB': 1024 ** 4,
2286 'Tb': 1000 ** 4,
2287 'tb': 1000 ** 4,
2288 'terabytes': 1000 ** 4,
2289 'tebibytes': 1024 ** 4,
2290 'PiB': 1024 ** 5,
2291 'PB': 1000 ** 5,
2292 'pB': 1024 ** 5,
2293 'Pb': 1000 ** 5,
2294 'pb': 1000 ** 5,
2295 'petabytes': 1000 ** 5,
2296 'pebibytes': 1024 ** 5,
2297 'EiB': 1024 ** 6,
2298 'EB': 1000 ** 6,
2299 'eB': 1024 ** 6,
2300 'Eb': 1000 ** 6,
2301 'eb': 1000 ** 6,
2302 'exabytes': 1000 ** 6,
2303 'exbibytes': 1024 ** 6,
2304 'ZiB': 1024 ** 7,
2305 'ZB': 1000 ** 7,
2306 'zB': 1024 ** 7,
2307 'Zb': 1000 ** 7,
2308 'zb': 1000 ** 7,
2309 'zettabytes': 1000 ** 7,
2310 'zebibytes': 1024 ** 7,
2311 'YiB': 1024 ** 8,
2312 'YB': 1000 ** 8,
2313 'yB': 1024 ** 8,
2314 'Yb': 1000 ** 8,
2315 'yb': 1000 ** 8,
2316 'yottabytes': 1000 ** 8,
2317 'yobibytes': 1024 ** 8,
2318 }
2319
2320 return lookup_unit_table(_UNIT_TABLE, s)
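# Illustrative usage (decimal vs. binary prefixes):
#   parse_filesize('1.5GB') == 1500000000
#   parse_filesize('10 KiB') == 10240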
2321
2322
2323 def parse_count(s):
2324 if s is None:
2325 return None
2326
2327 s = re.sub(r'^[^\d]+\s', '', s).strip()
2328
2329 if re.match(r'^[\d,.]+$', s):
2330 return str_to_int(s)
2331
2332 _UNIT_TABLE = {
2333 'k': 1000,
2334 'K': 1000,
2335 'm': 1000 ** 2,
2336 'M': 1000 ** 2,
2337 'kk': 1000 ** 2,
2338 'KK': 1000 ** 2,
2339 'b': 1000 ** 3,
2340 'B': 1000 ** 3,
2341 }
2342
2343 ret = lookup_unit_table(_UNIT_TABLE, s)
2344 if ret is not None:
2345 return ret
2346
2347 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2348 if mobj:
2349 return str_to_int(mobj.group(1))
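# Illustrative usage:
#   parse_count('1.5M views') == 1500000
#   parse_count('1,234') == 1234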
2350
2351
2352 def parse_resolution(s):
2353 if s is None:
2354 return {}
2355
2356 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2357 if mobj:
2358 return {
2359 'width': int(mobj.group('w')),
2360 'height': int(mobj.group('h')),
2361 }
2362
2363 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2364 if mobj:
2365 return {'height': int(mobj.group(1))}
2366
2367 mobj = re.search(r'\b([48])[kK]\b', s)
2368 if mobj:
2369 return {'height': int(mobj.group(1)) * 540}
2370
2371 return {}
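# Illustrative usage:
#   parse_resolution('1920x1080') == {'width': 1920, 'height': 1080}
#   parse_resolution('720p') == {'height': 720}
#   parse_resolution('4K') == {'height': 2160}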
2372
2373
2374 def parse_bitrate(s):
2375 if not isinstance(s, compat_str):
2376 return
2377 mobj = re.search(r'\b(\d+)\s*kbps', s)
2378 if mobj:
2379 return int(mobj.group(1))
2380
2381
2382 def month_by_name(name, lang='en'):
2383 """ Return the number of a month by (locale-independently) English name """
2384
2385 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2386
2387 try:
2388 return month_names.index(name) + 1
2389 except ValueError:
2390 return None
2391
2392
2393 def month_by_abbreviation(abbrev):
2394 """ Return the number of a month by (locale-independently) English
2395 abbreviations """
2396
2397 try:
2398 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2399 except ValueError:
2400 return None
2401
2402
2403 def fix_xml_ampersands(xml_str):
2404 """Replace all the '&' by '&amp;' in XML"""
2405 return re.sub(
2406 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2407 '&amp;',
2408 xml_str)
2409
2410
2411 def setproctitle(title):
2412 assert isinstance(title, compat_str)
2413
2414 # ctypes in Jython is not complete
2415 # http://bugs.jython.org/issue2148
2416 if sys.platform.startswith('java'):
2417 return
2418
2419 try:
2420 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2421 except OSError:
2422 return
2423 except TypeError:
2424 # LoadLibrary in Windows Python 2.7.13 only expects
2425 # a bytestring, but since unicode_literals turns
2426 # every string into a unicode string, it fails.
2427 return
2428 title_bytes = title.encode('utf-8')
2429 buf = ctypes.create_string_buffer(len(title_bytes))
2430 buf.value = title_bytes
2431 try:
2432 libc.prctl(15, buf, 0, 0, 0)
2433 except AttributeError:
2434 return # Strange libc, just skip this
2435
2436
2437 def remove_start(s, start):
2438 return s[len(start):] if s is not None and s.startswith(start) else s
2439
2440
2441 def remove_end(s, end):
2442 return s[:-len(end)] if s is not None and s.endswith(end) else s
2443
2444
2445 def remove_quotes(s):
2446 if s is None or len(s) < 2:
2447 return s
2448 for quote in ('"', "'", ):
2449 if s[0] == quote and s[-1] == quote:
2450 return s[1:-1]
2451 return s
2452
2453
2454 def get_domain(url):
2455 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2456 return domain.group('domain') if domain else None
2457
2458
2459 def url_basename(url):
2460 path = compat_urlparse.urlparse(url).path
2461 return path.strip('/').split('/')[-1]
2462
2463
2464 def base_url(url):
2465 return re.match(r'https?://[^?#&]+/', url).group()
2466
2467
2468 def urljoin(base, path):
2469 if isinstance(path, bytes):
2470 path = path.decode('utf-8')
2471 if not isinstance(path, compat_str) or not path:
2472 return None
2473 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2474 return path
2475 if isinstance(base, bytes):
2476 base = base.decode('utf-8')
2477 if not isinstance(base, compat_str) or not re.match(
2478 r'^(?:https?:)?//', base):
2479 return None
2480 return compat_urlparse.urljoin(base, path)
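# Illustrative usage (returns None rather than raising on bad input):
#   urljoin('https://example.com/a/', 'b.mp4') == 'https://example.com/a/b.mp4'
#   urljoin('https://example.com/a/', '//cdn.example.com/b.mp4') == '//cdn.example.com/b.mp4'
#   urljoin(None, 'b.mp4') is None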
2481
2482
2483 class HEADRequest(compat_urllib_request.Request):
2484 def get_method(self):
2485 return 'HEAD'
2486
2487
2488 class PUTRequest(compat_urllib_request.Request):
2489 def get_method(self):
2490 return 'PUT'
2491
2492
2493 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2494 if get_attr and v is not None:
2495 v = getattr(v, get_attr, None)
2496 try:
2497 return int(v) * invscale // scale
2498 except (ValueError, TypeError, OverflowError):
2499 return default
2500
2501
2502 def str_or_none(v, default=None):
2503 return default if v is None else compat_str(v)
2504
2505
2506 def str_to_int(int_str):
2507 """ A more relaxed version of int_or_none """
2508 if isinstance(int_str, compat_integer_types):
2509 return int_str
2510 elif isinstance(int_str, compat_str):
2511 int_str = re.sub(r'[,\.\+]', '', int_str)
2512 return int_or_none(int_str)
2513
2514
2515 def float_or_none(v, scale=1, invscale=1, default=None):
2516 if v is None:
2517 return default
2518 try:
2519 return float(v) * invscale / scale
2520 except (ValueError, TypeError):
2521 return default
2522
2523
2524 def bool_or_none(v, default=None):
2525 return v if isinstance(v, bool) else default
2526
2527
2528 def strip_or_none(v, default=None):
2529 return v.strip() if isinstance(v, compat_str) else default
2530
2531
2532 def url_or_none(url):
2533 if not url or not isinstance(url, compat_str):
2534 return None
2535 url = url.strip()
2536 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
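# Illustrative usage of the *_or_none coercion helpers:
#   int_or_none('42') == 42 and int_or_none(None, default=0) == 0
#   float_or_none('1.5', invscale=1000) == 1500.0
#   url_or_none('//example.com/x') == '//example.com/x'
#   url_or_none('example.com') is None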
2537
2538
2539 def strftime_or_none(timestamp, date_format, default=None):
2540 datetime_object = None
2541 try:
2542 if isinstance(timestamp, compat_numeric_types): # unix timestamp
2543 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2544 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2545 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2546 return datetime_object.strftime(date_format)
2547 except (ValueError, TypeError, AttributeError):
2548 return default
2549
2550
2551 def parse_duration(s):
2552 if not isinstance(s, compat_basestring):
2553 return None
2554 s = s.strip()
2555 if not s:
2556 return None
2557
2558 days, hours, mins, secs, ms = [None] * 5
2559 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
2560 if m:
2561 days, hours, mins, secs, ms = m.groups()
2562 else:
2563 m = re.match(
2564 r'''(?ix)(?:P?
2565 (?:
2566 [0-9]+\s*y(?:ears?)?\s*
2567 )?
2568 (?:
2569 [0-9]+\s*m(?:onths?)?\s*
2570 )?
2571 (?:
2572 [0-9]+\s*w(?:eeks?)?\s*
2573 )?
2574 (?:
2575 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
2576 )?
2577 T)?
2578 (?:
2579 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
2580 )?
2581 (?:
2582 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
2583 )?
2584 (?:
2585 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2586 )?Z?$''', s)
2587 if m:
2588 days, hours, mins, secs, ms = m.groups()
2589 else:
2590 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2591 if m:
2592 hours, mins = m.groups()
2593 else:
2594 return None
2595
2596 duration = 0
2597 if secs:
2598 duration += float(secs)
2599 if mins:
2600 duration += float(mins) * 60
2601 if hours:
2602 duration += float(hours) * 60 * 60
2603 if days:
2604 duration += float(days) * 24 * 60 * 60
2605 if ms:
2606 duration += float(ms)
2607 return duration
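# Illustrative usage (plain, ISO 8601 and free-form durations):
#   parse_duration('1:30') == 90.0
#   parse_duration('PT1H30M') == 5400.0
#   parse_duration('2.5 mins') == 150.0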
2608
2609
2610 def prepend_extension(filename, ext, expected_real_ext=None):
2611 name, real_ext = os.path.splitext(filename)
2612 return (
2613 '{0}.{1}{2}'.format(name, ext, real_ext)
2614 if not expected_real_ext or real_ext[1:] == expected_real_ext
2615 else '{0}.{1}'.format(filename, ext))
2616
2617
2618 def replace_extension(filename, ext, expected_real_ext=None):
2619 name, real_ext = os.path.splitext(filename)
2620 return '{0}.{1}'.format(
2621 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2622 ext)
2623
2624
2625 def check_executable(exe, args=[]):
2626 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2627 args can be a list of arguments for a short output (like -version) """
2628 try:
2629 Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
2630 except OSError:
2631 return False
2632 return exe
2633
2634
2635 def _get_exe_version_output(exe, args):
2636 try:
2637 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2638 # SIGTTOU if yt-dlp is run in the background.
2639 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2640 out, _ = Popen(
2641 [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2642 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
2643 except OSError:
2644 return False
2645 if isinstance(out, bytes): # Python 2.x
2646 out = out.decode('ascii', 'ignore')
2647 return out
2648
2649
2650 def detect_exe_version(output, version_re=None, unrecognized='present'):
2651 assert isinstance(output, compat_str)
2652 if version_re is None:
2653 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2654 m = re.search(version_re, output)
2655 if m:
2656 return m.group(1)
2657 else:
2658 return unrecognized
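# Illustrative usage (with the default version_re; the input line is the shape
# typically printed by `ffmpeg -version`):
#   detect_exe_version('ffmpeg version 4.4.1 Copyright (c) 2000-2021 ...') == '4.4.1'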
2659
2660
2661 def get_exe_version(exe, args=['--version'],
2662 version_re=None, unrecognized='present'):
2663 """ Returns the version of the specified executable,
2664 or False if the executable is not present """
2665 out = _get_exe_version_output(exe, args)
2666 return detect_exe_version(out, version_re, unrecognized) if out else False
2667
2668
2669 class LazyList(collections.abc.Sequence):
2670 ''' Lazy immutable list from an iterable
2671 Note that slices of a LazyList are lists and not LazyLists'''
2672
2673 class IndexError(IndexError):
2674 pass
2675
2676 def __init__(self, iterable, *, reverse=False, _cache=None):
2677 self.__iterable = iter(iterable)
2678 self.__cache = [] if _cache is None else _cache
2679 self.__reversed = reverse
2680
2681 def __iter__(self):
2682 if self.__reversed:
2683 # We need to consume the entire iterable to iterate in reverse
2684 yield from self.exhaust()
2685 return
2686 yield from self.__cache
2687 for item in self.__iterable:
2688 self.__cache.append(item)
2689 yield item
2690
2691 def __exhaust(self):
2692 self.__cache.extend(self.__iterable)
2693 # Discard the emptied iterable to make it pickle-able
2694 self.__iterable = []
2695 return self.__cache
2696
2697 def exhaust(self):
2698 ''' Evaluate the entire iterable '''
2699 return self.__exhaust()[::-1 if self.__reversed else 1]
2700
2701 @staticmethod
2702 def __reverse_index(x):
2703 return None if x is None else -(x + 1)
2704
2705 def __getitem__(self, idx):
2706 if isinstance(idx, slice):
2707 if self.__reversed:
2708 idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
2709 start, stop, step = idx.start, idx.stop, idx.step or 1
2710 elif isinstance(idx, int):
2711 if self.__reversed:
2712 idx = self.__reverse_index(idx)
2713 start, stop, step = idx, idx, 0
2714 else:
2715 raise TypeError('indices must be integers or slices')
2716 if ((start or 0) < 0 or (stop or 0) < 0
2717 or (start is None and step < 0)
2718 or (stop is None and step > 0)):
2719 # We need to consume the entire iterable to be able to slice from the end
2720 # Obviously, never use this with infinite iterables
2721 self.__exhaust()
2722 try:
2723 return self.__cache[idx]
2724 except IndexError as e:
2725 raise self.IndexError(e) from e
2726 n = max(start or 0, stop or 0) - len(self.__cache) + 1
2727 if n > 0:
2728 self.__cache.extend(itertools.islice(self.__iterable, n))
2729 try:
2730 return self.__cache[idx]
2731 except IndexError as e:
2732 raise self.IndexError(e) from e
2733
2734 def __bool__(self):
2735 try:
2736 self[-1] if self.__reversed else self[0]
2737 except self.IndexError:
2738 return False
2739 return True
2740
2741 def __len__(self):
2742 self.__exhaust()
2743 return len(self.__cache)
2744
2745 def __reversed__(self):
2746 return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)
2747
2748 def __copy__(self):
2749 return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)
2750
2751 def __repr__(self):
2752 # repr and str should mimic a list. So we exhaust the iterable
2753 return repr(self.exhaust())
2754
2755 def __str__(self):
2756 return repr(self.exhaust())
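# Illustrative usage (items are pulled from the underlying iterable only as needed):
#   lst = LazyList(itertools.count())
#   lst[10] == 10               # consumes and caches items 0..10
#   lst[:5] == [0, 1, 2, 3, 4]  # slices are plain lists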
2757
2758
2759 class PagedList:
2760
2761 class IndexError(IndexError):
2762 pass
2763
2764 def __len__(self):
2765 # This is only useful for tests
2766 return len(self.getslice())
2767
2768 def __init__(self, pagefunc, pagesize, use_cache=True):
2769 self._pagefunc = pagefunc
2770 self._pagesize = pagesize
2771 self._use_cache = use_cache
2772 self._cache = {}
2773
2774 def getpage(self, pagenum):
2775 page_results = self._cache.get(pagenum)
2776 if page_results is None:
2777 page_results = list(self._pagefunc(pagenum))
2778 if self._use_cache:
2779 self._cache[pagenum] = page_results
2780 return page_results
2781
2782 def getslice(self, start=0, end=None):
2783 return list(self._getslice(start, end))
2784
2785 def _getslice(self, start, end):
2786 raise NotImplementedError('This method must be implemented by subclasses')
2787
2788 def __getitem__(self, idx):
2789 # NOTE: cache must be enabled if this is used
2790 if not isinstance(idx, int) or idx < 0:
2791 raise TypeError('indices must be non-negative integers')
2792 entries = self.getslice(idx, idx + 1)
2793 if not entries:
2794 raise self.IndexError()
2795 return entries[0]
2796
2797
2798 class OnDemandPagedList(PagedList):
2799 def _getslice(self, start, end):
2800 for pagenum in itertools.count(start // self._pagesize):
2801 firstid = pagenum * self._pagesize
2802 nextfirstid = pagenum * self._pagesize + self._pagesize
2803 if start >= nextfirstid:
2804 continue
2805
2806 startv = (
2807 start % self._pagesize
2808 if firstid <= start < nextfirstid
2809 else 0)
2810 endv = (
2811 ((end - 1) % self._pagesize) + 1
2812 if (end is not None and firstid <= end <= nextfirstid)
2813 else None)
2814
2815 page_results = self.getpage(pagenum)
2816 if startv != 0 or endv is not None:
2817 page_results = page_results[startv:endv]
2818 yield from page_results
2819
2820 # A little optimization - if the current page is not "full", i.e. does
2821 # not contain page_size videos, then we can assume that this page
2822 # is the last one - there are no more ids on further pages,
2823 # so there is no need to query again.
2824 if len(page_results) + startv < self._pagesize:
2825 break
2826
2827 # If we got the whole page, but the next page is not interesting,
2828 # break out early as well
2829 if end == nextfirstid:
2830 break
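# Illustrative usage (pages are fetched on demand and cached):
#   pl = OnDemandPagedList(lambda n: range(n * 10, (n + 1) * 10), 10)
#   pl.getslice(5, 8) == [5, 6, 7]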
2831
2832
2833 class InAdvancePagedList(PagedList):
2834 def __init__(self, pagefunc, pagecount, pagesize):
2835 self._pagecount = pagecount
2836 PagedList.__init__(self, pagefunc, pagesize, True)
2837
2838 def _getslice(self, start, end):
2839 start_page = start // self._pagesize
2840 end_page = (
2841 self._pagecount if end is None else (end // self._pagesize + 1))
2842 skip_elems = start - start_page * self._pagesize
2843 only_more = None if end is None else end - start
2844 for pagenum in range(start_page, end_page):
2845 page_results = self.getpage(pagenum)
2846 if skip_elems:
2847 page_results = page_results[skip_elems:]
2848 skip_elems = None
2849 if only_more is not None:
2850 if len(page_results) < only_more:
2851 only_more -= len(page_results)
2852 else:
2853 yield from page_results[:only_more]
2854 break
2855 yield from page_results
2856
2857
2858 def uppercase_escape(s):
2859 unicode_escape = codecs.getdecoder('unicode_escape')
2860 return re.sub(
2861 r'\\U[0-9a-fA-F]{8}',
2862 lambda m: unicode_escape(m.group(0))[0],
2863 s)
2864
2865
2866 def lowercase_escape(s):
2867 unicode_escape = codecs.getdecoder('unicode_escape')
2868 return re.sub(
2869 r'\\u[0-9a-fA-F]{4}',
2870 lambda m: unicode_escape(m.group(0))[0],
2871 s)
2872
2873
2874 def escape_rfc3986(s):
2875 """Escape non-ASCII characters as suggested by RFC 3986"""
2876 if sys.version_info < (3, 0) and isinstance(s, compat_str):
2877 s = s.encode('utf-8')
2878 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2879
2880
2881 def escape_url(url):
2882 """Escape URL as suggested by RFC 3986"""
2883 url_parsed = compat_urllib_parse_urlparse(url)
2884 return url_parsed._replace(
2885 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2886 path=escape_rfc3986(url_parsed.path),
2887 params=escape_rfc3986(url_parsed.params),
2888 query=escape_rfc3986(url_parsed.query),
2889 fragment=escape_rfc3986(url_parsed.fragment)
2890 ).geturl()
2891
2892
2893 def parse_qs(url):
2894 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2895
2896
2897 def read_batch_urls(batch_fd):
2898 def fixup(url):
2899 if not isinstance(url, compat_str):
2900 url = url.decode('utf-8', 'replace')
2901 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2902 for bom in BOM_UTF8:
2903 if url.startswith(bom):
2904 url = url[len(bom):]
2905 url = url.lstrip()
2906 if not url or url.startswith(('#', ';', ']')):
2907 return False
2908 # "#" cannot be stripped out since it is part of the URI
2909 # However, it can be safely stripped out if following a whitespace
2910 return re.split(r'\s#', url, 1)[0].rstrip()
2911
2912 with contextlib.closing(batch_fd) as fd:
2913 return [url for url in map(fixup, fd) if url]
2914
2915
2916 def urlencode_postdata(*args, **kargs):
2917 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2918
2919
2920 def update_url_query(url, query):
2921 if not query:
2922 return url
2923 parsed_url = compat_urlparse.urlparse(url)
2924 qs = compat_parse_qs(parsed_url.query)
2925 qs.update(query)
2926 return compat_urlparse.urlunparse(parsed_url._replace(
2927 query=compat_urllib_parse_urlencode(qs, True)))
2928
2929
2930 def update_Request(req, url=None, data=None, headers={}, query={}):
2931 req_headers = req.headers.copy()
2932 req_headers.update(headers)
2933 req_data = data or req.data
2934 req_url = update_url_query(url or req.get_full_url(), query)
2935 req_get_method = req.get_method()
2936 if req_get_method == 'HEAD':
2937 req_type = HEADRequest
2938 elif req_get_method == 'PUT':
2939 req_type = PUTRequest
2940 else:
2941 req_type = compat_urllib_request.Request
2942 new_req = req_type(
2943 req_url, data=req_data, headers=req_headers,
2944 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2945 if hasattr(req, 'timeout'):
2946 new_req.timeout = req.timeout
2947 return new_req
2948
2949
2950 def _multipart_encode_impl(data, boundary):
2951 content_type = 'multipart/form-data; boundary=%s' % boundary
2952
2953 out = b''
2954 for k, v in data.items():
2955 out += b'--' + boundary.encode('ascii') + b'\r\n'
2956 if isinstance(k, compat_str):
2957 k = k.encode('utf-8')
2958 if isinstance(v, compat_str):
2959 v = v.encode('utf-8')
2960 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2961 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2962 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2963 if boundary.encode('ascii') in content:
2964 raise ValueError('Boundary overlaps with data')
2965 out += content
2966
2967 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2968
2969 return out, content_type
2970
2971
2972 def multipart_encode(data, boundary=None):
2973 '''
2974 Encode a dict to RFC 7578-compliant form-data
2975
2976 data:
2977 A dict where keys and values can be either Unicode or bytes-like
2978 objects.
2979 boundary:
2980 If specified, it must be a Unicode object and is used as the boundary.
2981 Otherwise a random boundary is generated.
2982
2983 Reference: https://tools.ietf.org/html/rfc7578
2984 '''
2985 has_specified_boundary = boundary is not None
2986
2987 while True:
2988 if boundary is None:
2989 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2990
2991 try:
2992 out, content_type = _multipart_encode_impl(data, boundary)
2993 break
2994 except ValueError:
2995 if has_specified_boundary:
2996 raise
2997 boundary = None
2998
2999 return out, content_type
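# Illustrative usage (a fixed boundary makes the output deterministic):
#   multipart_encode({b'field': b'value'}, boundary='AAAAAA')[0] == (
#       b'--AAAAAA\r\nContent-Disposition: form-data; name="field"\r\n\r\n'
#       b'value\r\n--AAAAAA--\r\n')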
3000
3001
3002 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3003 if isinstance(key_or_keys, (list, tuple)):
3004 for key in key_or_keys:
3005 if key not in d or d[key] is None or skip_false_values and not d[key]:
3006 continue
3007 return d[key]
3008 return default
3009 return d.get(key_or_keys, default)
3010
3011
3012 def try_get(src, getter, expected_type=None):
3013 for get in variadic(getter):
3014 try:
3015 v = get(src)
3016 except (AttributeError, KeyError, TypeError, IndexError):
3017 pass
3018 else:
3019 if expected_type is None or isinstance(v, expected_type):
3020 return v
3021
3022
3023 def merge_dicts(*dicts):
3024 merged = {}
3025 for a_dict in dicts:
3026 for k, v in a_dict.items():
3027 if v is None:
3028 continue
3029 if (k not in merged
3030 or (isinstance(v, compat_str) and v
3031 and isinstance(merged[k], compat_str)
3032 and not merged[k])):
3033 merged[k] = v
3034 return merged
3035
3036
3037 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3038 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3039
3040
3041 US_RATINGS = {
3042 'G': 0,
3043 'PG': 10,
3044 'PG-13': 13,
3045 'R': 16,
3046 'NC': 18,
3047 }
3048
3049
3050 TV_PARENTAL_GUIDELINES = {
3051 'TV-Y': 0,
3052 'TV-Y7': 7,
3053 'TV-G': 0,
3054 'TV-PG': 0,
3055 'TV-14': 14,
3056 'TV-MA': 17,
3057 }
3058
3059
3060 def parse_age_limit(s):
3061 if type(s) == int:
3062 return s if 0 <= s <= 21 else None
3063 if not isinstance(s, compat_basestring):
3064 return None
3065 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3066 if m:
3067 return int(m.group('age'))
3068 s = s.upper()
3069 if s in US_RATINGS:
3070 return US_RATINGS[s]
3071 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3072 if m:
3073 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3074 return None
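# Illustrative usage:
#   parse_age_limit('18+') == 18
#   parse_age_limit('PG-13') == 13
#   parse_age_limit('TV-MA') == 17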
3075
3076
3077 def strip_jsonp(code):
3078 return re.sub(
3079 r'''(?sx)^
3080 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3081 (?:\s*&&\s*(?P=func_name))?
3082 \s*\(\s*(?P<callback_data>.*)\);?
3083 \s*?(?://[^\n]*)*$''',
3084 r'\g<callback_data>', code)
3085
3086
3087 def js_to_json(code, vars={}):
3088 # vars is a dict of var, val pairs to substitute
3089 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3090 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
3091 INTEGER_TABLE = (
3092 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
3093 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
3094 )
3095
3096 def fix_kv(m):
3097 v = m.group(0)
3098 if v in ('true', 'false', 'null'):
3099 return v
3100 elif v in ('undefined', 'void 0'):
3101 return 'null'
3102 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3103 return ""
3104
3105 if v[0] in ("'", '"'):
3106 v = re.sub(r'(?s)\\.|"', lambda m: {
3107 '"': '\\"',
3108 "\\'": "'",
3109 '\\\n': '',
3110 '\\x': '\\u00',
3111 }.get(m.group(0), m.group(0)), v[1:-1])
3112 else:
3113 for regex, base in INTEGER_TABLE:
3114 im = re.match(regex, v)
3115 if im:
3116 i = int(im.group(1), base)
3117 return '"%d":' % i if v.endswith(':') else '%d' % i
3118
3119 if v in vars:
3120 return vars[v]
3121
3122 return '"%s"' % v
3123
3124 return re.sub(r'''(?sx)
3125 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3126 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3127 {comment}|,(?={skip}[\]}}])|
3128 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3129 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3130 [0-9]+(?={skip}:)|
3131 !+
3132 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
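# Illustrative usage (the result parses with json.loads):
#   js_to_json("{a: 1, 'b': \"c\", d: 0x1F,}") == '{"a": 1, "b": "c", "d": 31}'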
3133
3134
3135 def qualities(quality_ids):
3136 """ Get a numeric quality value out of a list of possible values """
3137 def q(qid):
3138 try:
3139 return quality_ids.index(qid)
3140 except ValueError:
3141 return -1
3142 return q
3143
3144
3145 POSTPROCESS_WHEN = {'pre_process', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
3146
3147
3148 DEFAULT_OUTTMPL = {
3149 'default': '%(title)s [%(id)s].%(ext)s',
3150 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3151 }
3152 OUTTMPL_TYPES = {
3153 'chapter': None,
3154 'subtitle': None,
3155 'thumbnail': None,
3156 'description': 'description',
3157 'annotation': 'annotations.xml',
3158 'infojson': 'info.json',
3159 'link': None,
3160 'pl_thumbnail': None,
3161 'pl_description': 'description',
3162 'pl_infojson': 'info.json',
3163 }
3164
3165 # As of [1], the format syntax is:
3166 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3167 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3168 STR_FORMAT_RE_TMPL = r'''(?x)
3169 (?<!%)(?P<prefix>(?:%%)*)
3170 %
3171 (?P<has_key>\((?P<key>{0})\))?
3172 (?P<format>
3173 (?P<conversion>[#0\-+ ]+)?
3174 (?P<min_width>\d+)?
3175 (?P<precision>\.\d+)?
3176 (?P<len_mod>[hlL])? # unused in python
3177 {1} # conversion type
3178 )
3179 '''
3180
3181
3182 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3183
3184
3185 def limit_length(s, length):
3186 """ Add ellipses to overly long strings """
3187 if s is None:
3188 return None
3189 ELLIPSES = '...'
3190 if len(s) > length:
3191 return s[:length - len(ELLIPSES)] + ELLIPSES
3192 return s
3193
3194
3195 def version_tuple(v):
3196 return tuple(int(e) for e in re.split(r'[-.]', v))
3197
3198
3199 def is_outdated_version(version, limit, assume_new=True):
3200 if not version:
3201 return not assume_new
3202 try:
3203 return version_tuple(version) < version_tuple(limit)
3204 except ValueError:
3205 return not assume_new
3206
3207
3208 def ytdl_is_updateable():
3209 """ Returns if yt-dlp can be updated with -U """
3210
3211 from .update import is_non_updateable
3212
3213 return not is_non_updateable()
3214
3215
3216 def args_to_str(args):
3217 # Get a short string representation for a subprocess command
3218 return ' '.join(compat_shlex_quote(a) for a in args)
3219
3220
3221 def error_to_compat_str(err):
3222 err_str = str(err)
3223 # On python 2, an error byte string must be decoded with the proper
3224 # encoding rather than ascii
3225 if sys.version_info[0] < 3:
3226 err_str = err_str.decode(preferredencoding())
3227 return err_str
3228
3229
3230 def mimetype2ext(mt):
3231 if mt is None:
3232 return None
3233
3234 mt, _, params = mt.partition(';')
3235 mt = mt.strip()
3236
3237 FULL_MAP = {
3238 'audio/mp4': 'm4a',
3239 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
3240 # it's the most popular one
3241 'audio/mpeg': 'mp3',
3242 'audio/x-wav': 'wav',
3243 'audio/wav': 'wav',
3244 'audio/wave': 'wav',
3245 }
3246
3247 ext = FULL_MAP.get(mt)
3248 if ext is not None:
3249 return ext
3250
3251 SUBTYPE_MAP = {
3252 '3gpp': '3gp',
3253 'smptett+xml': 'tt',
3254 'ttaf+xml': 'dfxp',
3255 'ttml+xml': 'ttml',
3256 'x-flv': 'flv',
3257 'x-mp4-fragmented': 'mp4',
3258 'x-ms-sami': 'sami',
3259 'x-ms-wmv': 'wmv',
3260 'mpegurl': 'm3u8',
3261 'x-mpegurl': 'm3u8',
3262 'vnd.apple.mpegurl': 'm3u8',
3263 'dash+xml': 'mpd',
3264 'f4m+xml': 'f4m',
3265 'hds+xml': 'f4m',
3266 'vnd.ms-sstr+xml': 'ism',
3267 'quicktime': 'mov',
3268 'mp2t': 'ts',
3269 'x-wav': 'wav',
3270 'filmstrip+json': 'fs',
3271 'svg+xml': 'svg',
3272 }
3273
3274 _, _, subtype = mt.rpartition('/')
3275 ext = SUBTYPE_MAP.get(subtype.lower())
3276 if ext is not None:
3277 return ext
3278
3279 SUFFIX_MAP = {
3280 'json': 'json',
3281 'xml': 'xml',
3282 'zip': 'zip',
3283 'gzip': 'gz',
3284 }
3285
3286 _, _, suffix = subtype.partition('+')
3287 ext = SUFFIX_MAP.get(suffix)
3288 if ext is not None:
3289 return ext
3290
3291 return subtype.replace('+', '.')
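# Illustrative usage:
#   mimetype2ext('audio/mpeg') == 'mp3'
#   mimetype2ext('application/dash+xml') == 'mpd'
#   mimetype2ext('video/mp4; codecs="avc1.64001f"') == 'mp4'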
3292
3293
3294 def ext2mimetype(ext_or_url):
3295 if not ext_or_url:
3296 return None
3297 if '.' not in ext_or_url:
3298 ext_or_url = f'file.{ext_or_url}'
3299 return mimetypes.guess_type(ext_or_url)[0]
3300
3301
3302 def parse_codecs(codecs_str):
3303 # http://tools.ietf.org/html/rfc6381
3304 if not codecs_str:
3305 return {}
3306 split_codecs = list(filter(None, map(
3307 str.strip, codecs_str.strip().strip(',').split(','))))
3308 vcodec, acodec, tcodec, hdr = None, None, None, None
3309 for full_codec in split_codecs:
3310 parts = full_codec.split('.')
3311 codec = parts[0].replace('0', '')
3312 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3313 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3314 if not vcodec:
3315 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3316 if codec in ('dvh1', 'dvhe'):
3317 hdr = 'DV'
3318 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3319 hdr = 'HDR10'
3320 elif full_codec.replace('0', '').startswith('vp9.2'):
3321 hdr = 'HDR10'
3322 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3323 if not acodec:
3324 acodec = full_codec
3325 elif codec in ('stpp', 'wvtt',):
3326 if not tcodec:
3327 tcodec = full_codec
3328 else:
3329 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
3330 if vcodec or acodec or tcodec:
3331 return {
3332 'vcodec': vcodec or 'none',
3333 'acodec': acodec or 'none',
3334 'dynamic_range': hdr,
3335 **({'tcodec': tcodec} if tcodec is not None else {}),
3336 }
3337 elif len(split_codecs) == 2:
3338 return {
3339 'vcodec': split_codecs[0],
3340 'acodec': split_codecs[1],
3341 }
3342 return {}
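# Illustrative usage:
#   parse_codecs('avc1.64001f, mp4a.40.2') == {
#       'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}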
3343
3344
3345 def urlhandle_detect_ext(url_handle):
3346 getheader = url_handle.headers.get
3347
3348 cd = getheader('Content-Disposition')
3349 if cd:
3350 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3351 if m:
3352 e = determine_ext(m.group('filename'), default_ext=None)
3353 if e:
3354 return e
3355
3356 return mimetype2ext(getheader('Content-Type'))
3357
3358
3359 def encode_data_uri(data, mime_type):
3360 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3361
3362
3363 def age_restricted(content_limit, age_limit):
3364 """ Returns True iff the content should be blocked """
3365
3366 if age_limit is None: # No limit set
3367 return False
3368 if content_limit is None:
3369 return False # Content available for everyone
3370 return age_limit < content_limit
3371
3372
3373 def is_html(first_bytes):
3374 """ Detect whether a file contains HTML by examining its first bytes. """
3375
3376 BOMS = [
3377 (b'\xef\xbb\xbf', 'utf-8'),
3378 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3379 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3380 (b'\xff\xfe', 'utf-16-le'),
3381 (b'\xfe\xff', 'utf-16-be'),
3382 ]
3383 for bom, enc in BOMS:
3384 if first_bytes.startswith(bom):
3385 s = first_bytes[len(bom):].decode(enc, 'replace')
3386 break
3387 else:
3388 s = first_bytes.decode('utf-8', 'replace')
3389
3390 return re.match(r'^\s*<', s)
3391
3392
3393 def determine_protocol(info_dict):
3394 protocol = info_dict.get('protocol')
3395 if protocol is not None:
3396 return protocol
3397
3398 url = sanitize_url(info_dict['url'])
3399 if url.startswith('rtmp'):
3400 return 'rtmp'
3401 elif url.startswith('mms'):
3402 return 'mms'
3403 elif url.startswith('rtsp'):
3404 return 'rtsp'
3405
3406 ext = determine_ext(url)
3407 if ext == 'm3u8':
3408 return 'm3u8'
3409 elif ext == 'f4m':
3410 return 'f4m'
3411
3412 return compat_urllib_parse_urlparse(url).scheme
3413
3414
3415 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3416 """ Render a list of rows, each as a list of values.
3417 Text after a \t will be right aligned """
3418 def width(string):
3419 return len(remove_terminal_sequences(string).replace('\t', ''))
3420
3421 def get_max_lens(table):
3422 return [max(width(str(v)) for v in col) for col in zip(*table)]
3423
3424 def filter_using_list(row, filterArray):
3425 return [col for (take, col) in zip(filterArray, row) if take]
3426
3427 if hide_empty:
3428 max_lens = get_max_lens(data)
3429 header_row = filter_using_list(header_row, max_lens)
3430 data = [filter_using_list(row, max_lens) for row in data]
3431
3432 table = [header_row] + data
3433 max_lens = get_max_lens(table)
3434 extra_gap += 1
3435 if delim:
3436 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3437 table[1][-1] = table[1][-1][:-extra_gap] # Remove extra_gap from end of delimiter
3438 for row in table:
3439 for pos, text in enumerate(map(str, row)):
3440 if '\t' in text:
3441 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3442 else:
3443 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3444 ret = '\n'.join(''.join(row).rstrip() for row in table)
3445 return ret
3446
3447
3448 def _match_one(filter_part, dct, incomplete):
3449 # TODO: Generalize code with YoutubeDL._build_format_filter
3450 STRING_OPERATORS = {
3451 '*=': operator.contains,
3452 '^=': lambda attr, value: attr.startswith(value),
3453 '$=': lambda attr, value: attr.endswith(value),
3454 '~=': lambda attr, value: re.search(value, attr),
3455 }
3456 COMPARISON_OPERATORS = {
3457 **STRING_OPERATORS,
3458 '<=': operator.le, # "<=" must be defined above "<"
3459 '<': operator.lt,
3460 '>=': operator.ge,
3461 '>': operator.gt,
3462 '=': operator.eq,
3463 }
3464
3465 operator_rex = re.compile(r'''(?x)\s*
3466 (?P<key>[a-z_]+)
3467 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3468 (?:
3469 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3470 (?P<strval>.+?)
3471 )
3472 \s*$
3473 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3474 m = operator_rex.search(filter_part)
3475 if m:
3476 m = m.groupdict()
3477 unnegated_op = COMPARISON_OPERATORS[m['op']]
3478 if m['negation']:
3479 op = lambda attr, value: not unnegated_op(attr, value)
3480 else:
3481 op = unnegated_op
3482 comparison_value = m['quotedstrval'] or m['strval']  # the regex has no 'intval' group; one of these always matches
3483 if m['quote']:
3484 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3485 actual_value = dct.get(m['key'])
3486 numeric_comparison = None
3487 if isinstance(actual_value, compat_numeric_types):
3488 # If the original field is a string and the matching comparison value is
3489 # a number, we should respect the origin of the original field
3490 # and process the comparison value as a string (see
3491 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3492 try:
3493 numeric_comparison = int(comparison_value)
3494 except ValueError:
3495 numeric_comparison = parse_filesize(comparison_value)
3496 if numeric_comparison is None:
3497 numeric_comparison = parse_filesize(f'{comparison_value}B')
3498 if numeric_comparison is None:
3499 numeric_comparison = parse_duration(comparison_value)
3500 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3501 raise ValueError('Operator %s only supports string values!' % m['op'])
3502 if actual_value is None:
3503 return incomplete or m['none_inclusive']
3504 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3505
3506 UNARY_OPERATORS = {
3507 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3508 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3509 }
3510 operator_rex = re.compile(r'''(?x)\s*
3511 (?P<op>%s)\s*(?P<key>[a-z_]+)
3512 \s*$
3513 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3514 m = operator_rex.search(filter_part)
3515 if m:
3516 op = UNARY_OPERATORS[m.group('op')]
3517 actual_value = dct.get(m.group('key'))
3518 if incomplete and actual_value is None:
3519 return True
3520 return op(actual_value)
3521
3522 raise ValueError('Invalid filter part %r' % filter_part)
3523
3524
3525 def match_str(filter_str, dct, incomplete=False):
3526 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
3527 When incomplete, all conditions passes on missing fields
3528 """
3529 return all(
3530 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3531 for filter_part in re.split(r'(?<!\\)&', filter_str))
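# Illustrative usage (the syntax mirrors the --match-filter option):
#   match_str('like_count > 100 & dislike_count <? 50',
#             {'like_count': 190, 'dislike_count': 10})  # True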
3532
3533
3534 def match_filter_func(filter_str):
3535 def _match_func(info_dict, *args, **kwargs):
3536 if match_str(filter_str, info_dict, *args, **kwargs):
3537 return None
3538 else:
3539 video_title = info_dict.get('title', info_dict.get('id', 'video'))
3540 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
3541 return _match_func
3542
3543
3544 def parse_dfxp_time_expr(time_expr):
3545 if not time_expr:
3546 return
3547
3548 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3549 if mobj:
3550 return float(mobj.group('time_offset'))
3551
3552 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3553 if mobj:
3554 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3555
3556
3557 def srt_subtitles_timecode(seconds):
3558 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3559
3560
3561 def ass_subtitles_timecode(seconds):
3562 time = timetuple_from_msec(seconds * 1000)
3563 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
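# Illustrative usage (relies on timetuple_from_msec defined elsewhere in this file):
#   srt_subtitles_timecode(3661.5) == '01:01:01,500'
#   ass_subtitles_timecode(3661.5) == '1:01:01.50'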
3564
3565
3566 def dfxp2srt(dfxp_data):
3567 '''
3568 @param dfxp_data A bytes-like object containing DFXP data
3569 @returns A unicode object containing converted SRT data
3570 '''
3571 LEGACY_NAMESPACES = (
3572 (b'http://www.w3.org/ns/ttml', [
3573 b'http://www.w3.org/2004/11/ttaf1',
3574 b'http://www.w3.org/2006/04/ttaf1',
3575 b'http://www.w3.org/2006/10/ttaf1',
3576 ]),
3577 (b'http://www.w3.org/ns/ttml#styling', [
3578 b'http://www.w3.org/ns/ttml#style',
3579 ]),
3580 )
3581
3582 SUPPORTED_STYLING = [
3583 'color',
3584 'fontFamily',
3585 'fontSize',
3586 'fontStyle',
3587 'fontWeight',
3588 'textDecoration'
3589 ]
3590
3591 _x = functools.partial(xpath_with_ns, ns_map={
3592 'xml': 'http://www.w3.org/XML/1998/namespace',
3593 'ttml': 'http://www.w3.org/ns/ttml',
3594 'tts': 'http://www.w3.org/ns/ttml#styling',
3595 })
3596
3597 styles = {}
3598 default_style = {}
3599
3600 class TTMLPElementParser(object):
3601 _out = ''
3602 _unclosed_elements = []
3603 _applied_styles = []
3604
3605 def start(self, tag, attrib):
3606 if tag in (_x('ttml:br'), 'br'):
3607 self._out += '\n'
3608 else:
3609 unclosed_elements = []
3610 style = {}
3611 element_style_id = attrib.get('style')
3612 if default_style:
3613 style.update(default_style)
3614 if element_style_id:
3615 style.update(styles.get(element_style_id, {}))
3616 for prop in SUPPORTED_STYLING:
3617 prop_val = attrib.get(_x('tts:' + prop))
3618 if prop_val:
3619 style[prop] = prop_val
3620 if style:
3621 font = ''
3622 for k, v in sorted(style.items()):
3623 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3624 continue
3625 if k == 'color':
3626 font += ' color="%s"' % v
3627 elif k == 'fontSize':
3628 font += ' size="%s"' % v
3629 elif k == 'fontFamily':
3630 font += ' face="%s"' % v
3631 elif k == 'fontWeight' and v == 'bold':
3632 self._out += '<b>'
3633 unclosed_elements.append('b')
3634 elif k == 'fontStyle' and v == 'italic':
3635 self._out += '<i>'
3636 unclosed_elements.append('i')
3637 elif k == 'textDecoration' and v == 'underline':
3638 self._out += '<u>'
3639 unclosed_elements.append('u')
3640 if font:
3641 self._out += '<font' + font + '>'
3642 unclosed_elements.append('font')
3643 applied_style = {}
3644 if self._applied_styles:
3645 applied_style.update(self._applied_styles[-1])
3646 applied_style.update(style)
3647 self._applied_styles.append(applied_style)
3648 self._unclosed_elements.append(unclosed_elements)
3649
3650 def end(self, tag):
3651 if tag not in (_x('ttml:br'), 'br'):
3652 unclosed_elements = self._unclosed_elements.pop()
3653 for element in reversed(unclosed_elements):
3654 self._out += '</%s>' % element
3655 if unclosed_elements and self._applied_styles:
3656 self._applied_styles.pop()
3657
3658 def data(self, data):
3659 self._out += data
3660
3661 def close(self):
3662 return self._out.strip()
3663
3664 def parse_node(node):
3665 target = TTMLPElementParser()
3666 parser = xml.etree.ElementTree.XMLParser(target=target)
3667 parser.feed(xml.etree.ElementTree.tostring(node))
3668 return parser.close()
3669
3670 for k, v in LEGACY_NAMESPACES:
3671 for ns in v:
3672 dfxp_data = dfxp_data.replace(ns, k)
3673
3674 dfxp = compat_etree_fromstring(dfxp_data)
3675 out = []
3676 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3677
3678 if not paras:
3679 raise ValueError('Invalid dfxp/TTML subtitle')
3680
3681 repeat = False
3682 while True:
3683 for style in dfxp.findall(_x('.//ttml:style')):
3684 style_id = style.get('id') or style.get(_x('xml:id'))
3685 if not style_id:
3686 continue
3687 parent_style_id = style.get('style')
3688 if parent_style_id:
3689 if parent_style_id not in styles:
3690 repeat = True
3691 continue
3692 styles[style_id] = styles[parent_style_id].copy()
3693 for prop in SUPPORTED_STYLING:
3694 prop_val = style.get(_x('tts:' + prop))
3695 if prop_val:
3696 styles.setdefault(style_id, {})[prop] = prop_val
3697 if repeat:
3698 repeat = False
3699 else:
3700 break
3701
3702 for p in ('body', 'div'):
3703 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3704 if ele is None:
3705 continue
3706 style = styles.get(ele.get('style'))
3707 if not style:
3708 continue
3709 default_style.update(style)
3710
3711 for para, index in zip(paras, itertools.count(1)):
3712 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3713 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3714 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3715 if begin_time is None:
3716 continue
3717 if not end_time:
3718 if not dur:
3719 continue
3720 end_time = begin_time + dur
3721 out.append('%d\n%s --> %s\n%s\n\n' % (
3722 index,
3723 srt_subtitles_timecode(begin_time),
3724 srt_subtitles_timecode(end_time),
3725 parse_node(para)))
3726
3727 return ''.join(out)
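# Minimal illustrative conversion (assumed input document):
#   dfxp2srt(b'<tt xmlns="http://www.w3.org/ns/ttml"><body><div>'
#            b'<p begin="0s" end="1s">Hi</p></div></body></tt>')
#   # '1\n00:00:00,000 --> 00:00:01,000\nHi\n\n'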
3728
3729
3730 def cli_option(params, command_option, param):
3731 param = params.get(param)
3732 if param:
3733 param = compat_str(param)
3734 return [command_option, param] if param is not None else []
3735
3736
3737 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3738 param = params.get(param)
3739 if param is None:
3740 return []
3741 assert isinstance(param, bool)
3742 if separator:
3743 return [command_option + separator + (true_value if param else false_value)]
3744 return [command_option, true_value if param else false_value]
3745
3746
3747 def cli_valueless_option(params, command_option, param, expected_value=True):
3748 param = params.get(param)
3749 return [command_option] if param == expected_value else []
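# Illustrative expansions (hypothetical params dicts):
#   cli_option({'proxy': 'socks5://127.0.0.1'}, '--proxy', 'proxy')
#   # ['--proxy', 'socks5://127.0.0.1']
#   cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#   # ['--no-check-certificate', 'true']
#   cli_valueless_option({'nopart': True}, '--no-part', 'nopart')
#   # ['--no-part']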
3750
3751
3752 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3753 if isinstance(argdict, (list, tuple)): # for backward compatibility
3754 if use_compat:
3755 return argdict
3756 else:
3757 argdict = None
3758 if argdict is None:
3759 return default
3760 assert isinstance(argdict, dict)
3761
3762 assert isinstance(keys, (list, tuple))
3763 for key_list in keys:
3764 arg_list = list(filter(
3765 lambda x: x is not None,
3766 [argdict.get(key.lower()) for key in variadic(key_list)]))
3767 if arg_list:
3768 return [arg for args in arg_list for arg in args]
3769 return default
3770
3771
3772 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3773 main_key, exe = main_key.lower(), exe.lower()
3774 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3775 keys = [f'{root_key}{k}' for k in (keys or [''])]
3776 if root_key in keys:
3777 if main_key != exe:
3778 keys.append((main_key, exe))
3779 keys.append('default')
3780 else:
3781 use_compat = False
3782 return cli_configuration_args(argdict, keys, default, use_compat)
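# Illustrative resolution order (hypothetical argdict): for main_key='MyPP'
# and exe='ffmpeg', the key 'mypp+ffmpeg' is tried first, then 'mypp' and
# 'ffmpeg' combined, then 'default':
#   _configuration_args('MyPP', {'mypp+ffmpeg': ['-v']}, 'ffmpeg')  # ['-v']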
3783
3784
3785 class ISO639Utils(object):
3786 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3787 _lang_map = {
3788 'aa': 'aar',
3789 'ab': 'abk',
3790 'ae': 'ave',
3791 'af': 'afr',
3792 'ak': 'aka',
3793 'am': 'amh',
3794 'an': 'arg',
3795 'ar': 'ara',
3796 'as': 'asm',
3797 'av': 'ava',
3798 'ay': 'aym',
3799 'az': 'aze',
3800 'ba': 'bak',
3801 'be': 'bel',
3802 'bg': 'bul',
3803 'bh': 'bih',
3804 'bi': 'bis',
3805 'bm': 'bam',
3806 'bn': 'ben',
3807 'bo': 'bod',
3808 'br': 'bre',
3809 'bs': 'bos',
3810 'ca': 'cat',
3811 'ce': 'che',
3812 'ch': 'cha',
3813 'co': 'cos',
3814 'cr': 'cre',
3815 'cs': 'ces',
3816 'cu': 'chu',
3817 'cv': 'chv',
3818 'cy': 'cym',
3819 'da': 'dan',
3820 'de': 'deu',
3821 'dv': 'div',
3822 'dz': 'dzo',
3823 'ee': 'ewe',
3824 'el': 'ell',
3825 'en': 'eng',
3826 'eo': 'epo',
3827 'es': 'spa',
3828 'et': 'est',
3829 'eu': 'eus',
3830 'fa': 'fas',
3831 'ff': 'ful',
3832 'fi': 'fin',
3833 'fj': 'fij',
3834 'fo': 'fao',
3835 'fr': 'fra',
3836 'fy': 'fry',
3837 'ga': 'gle',
3838 'gd': 'gla',
3839 'gl': 'glg',
3840 'gn': 'grn',
3841 'gu': 'guj',
3842 'gv': 'glv',
3843 'ha': 'hau',
3844 'he': 'heb',
3845 'iw': 'heb', # Replaced by he in 1989 revision
3846 'hi': 'hin',
3847 'ho': 'hmo',
3848 'hr': 'hrv',
3849 'ht': 'hat',
3850 'hu': 'hun',
3851 'hy': 'hye',
3852 'hz': 'her',
3853 'ia': 'ina',
3854 'id': 'ind',
3855 'in': 'ind', # Replaced by id in 1989 revision
3856 'ie': 'ile',
3857 'ig': 'ibo',
3858 'ii': 'iii',
3859 'ik': 'ipk',
3860 'io': 'ido',
3861 'is': 'isl',
3862 'it': 'ita',
3863 'iu': 'iku',
3864 'ja': 'jpn',
3865 'jv': 'jav',
3866 'ka': 'kat',
3867 'kg': 'kon',
3868 'ki': 'kik',
3869 'kj': 'kua',
3870 'kk': 'kaz',
3871 'kl': 'kal',
3872 'km': 'khm',
3873 'kn': 'kan',
3874 'ko': 'kor',
3875 'kr': 'kau',
3876 'ks': 'kas',
3877 'ku': 'kur',
3878 'kv': 'kom',
3879 'kw': 'cor',
3880 'ky': 'kir',
3881 'la': 'lat',
3882 'lb': 'ltz',
3883 'lg': 'lug',
3884 'li': 'lim',
3885 'ln': 'lin',
3886 'lo': 'lao',
3887 'lt': 'lit',
3888 'lu': 'lub',
3889 'lv': 'lav',
3890 'mg': 'mlg',
3891 'mh': 'mah',
3892 'mi': 'mri',
3893 'mk': 'mkd',
3894 'ml': 'mal',
3895 'mn': 'mon',
3896 'mr': 'mar',
3897 'ms': 'msa',
3898 'mt': 'mlt',
3899 'my': 'mya',
3900 'na': 'nau',
3901 'nb': 'nob',
3902 'nd': 'nde',
3903 'ne': 'nep',
3904 'ng': 'ndo',
3905 'nl': 'nld',
3906 'nn': 'nno',
3907 'no': 'nor',
3908 'nr': 'nbl',
3909 'nv': 'nav',
3910 'ny': 'nya',
3911 'oc': 'oci',
3912 'oj': 'oji',
3913 'om': 'orm',
3914 'or': 'ori',
3915 'os': 'oss',
3916 'pa': 'pan',
3917 'pi': 'pli',
3918 'pl': 'pol',
3919 'ps': 'pus',
3920 'pt': 'por',
3921 'qu': 'que',
3922 'rm': 'roh',
3923 'rn': 'run',
3924 'ro': 'ron',
3925 'ru': 'rus',
3926 'rw': 'kin',
3927 'sa': 'san',
3928 'sc': 'srd',
3929 'sd': 'snd',
3930 'se': 'sme',
3931 'sg': 'sag',
3932 'si': 'sin',
3933 'sk': 'slk',
3934 'sl': 'slv',
3935 'sm': 'smo',
3936 'sn': 'sna',
3937 'so': 'som',
3938 'sq': 'sqi',
3939 'sr': 'srp',
3940 'ss': 'ssw',
3941 'st': 'sot',
3942 'su': 'sun',
3943 'sv': 'swe',
3944 'sw': 'swa',
3945 'ta': 'tam',
3946 'te': 'tel',
3947 'tg': 'tgk',
3948 'th': 'tha',
3949 'ti': 'tir',
3950 'tk': 'tuk',
3951 'tl': 'tgl',
3952 'tn': 'tsn',
3953 'to': 'ton',
3954 'tr': 'tur',
3955 'ts': 'tso',
3956 'tt': 'tat',
3957 'tw': 'twi',
3958 'ty': 'tah',
3959 'ug': 'uig',
3960 'uk': 'ukr',
3961 'ur': 'urd',
3962 'uz': 'uzb',
3963 've': 'ven',
3964 'vi': 'vie',
3965 'vo': 'vol',
3966 'wa': 'wln',
3967 'wo': 'wol',
3968 'xh': 'xho',
3969 'yi': 'yid',
3970 'ji': 'yid', # Replaced by yi in 1989 revision
3971 'yo': 'yor',
3972 'za': 'zha',
3973 'zh': 'zho',
3974 'zu': 'zul',
3975 }
3976
3977 @classmethod
3978 def short2long(cls, code):
3979 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3980 return cls._lang_map.get(code[:2])
3981
3982 @classmethod
3983 def long2short(cls, code):
3984 """Convert language code from ISO 639-2/T to ISO 639-1"""
3985 for short_name, long_name in cls._lang_map.items():
3986 if long_name == code:
3987 return short_name
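# Illustrative conversions (note that short2long() only looks at the first
# two characters, so e.g. 'en-US' also maps to 'eng'):
#   ISO639Utils.short2long('en')   # 'eng'
#   ISO639Utils.long2short('fra')  # 'fr'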
3988
3989
3990 class ISO3166Utils(object):
3991 # From http://data.okfn.org/data/core/country-list
3992 _country_map = {
3993 'AF': 'Afghanistan',
3994 'AX': 'Åland Islands',
3995 'AL': 'Albania',
3996 'DZ': 'Algeria',
3997 'AS': 'American Samoa',
3998 'AD': 'Andorra',
3999 'AO': 'Angola',
4000 'AI': 'Anguilla',
4001 'AQ': 'Antarctica',
4002 'AG': 'Antigua and Barbuda',
4003 'AR': 'Argentina',
4004 'AM': 'Armenia',
4005 'AW': 'Aruba',
4006 'AU': 'Australia',
4007 'AT': 'Austria',
4008 'AZ': 'Azerbaijan',
4009 'BS': 'Bahamas',
4010 'BH': 'Bahrain',
4011 'BD': 'Bangladesh',
4012 'BB': 'Barbados',
4013 'BY': 'Belarus',
4014 'BE': 'Belgium',
4015 'BZ': 'Belize',
4016 'BJ': 'Benin',
4017 'BM': 'Bermuda',
4018 'BT': 'Bhutan',
4019 'BO': 'Bolivia, Plurinational State of',
4020 'BQ': 'Bonaire, Sint Eustatius and Saba',
4021 'BA': 'Bosnia and Herzegovina',
4022 'BW': 'Botswana',
4023 'BV': 'Bouvet Island',
4024 'BR': 'Brazil',
4025 'IO': 'British Indian Ocean Territory',
4026 'BN': 'Brunei Darussalam',
4027 'BG': 'Bulgaria',
4028 'BF': 'Burkina Faso',
4029 'BI': 'Burundi',
4030 'KH': 'Cambodia',
4031 'CM': 'Cameroon',
4032 'CA': 'Canada',
4033 'CV': 'Cape Verde',
4034 'KY': 'Cayman Islands',
4035 'CF': 'Central African Republic',
4036 'TD': 'Chad',
4037 'CL': 'Chile',
4038 'CN': 'China',
4039 'CX': 'Christmas Island',
4040 'CC': 'Cocos (Keeling) Islands',
4041 'CO': 'Colombia',
4042 'KM': 'Comoros',
4043 'CG': 'Congo',
4044 'CD': 'Congo, the Democratic Republic of the',
4045 'CK': 'Cook Islands',
4046 'CR': 'Costa Rica',
4047 'CI': 'Côte d\'Ivoire',
4048 'HR': 'Croatia',
4049 'CU': 'Cuba',
4050 'CW': 'Curaçao',
4051 'CY': 'Cyprus',
4052 'CZ': 'Czech Republic',
4053 'DK': 'Denmark',
4054 'DJ': 'Djibouti',
4055 'DM': 'Dominica',
4056 'DO': 'Dominican Republic',
4057 'EC': 'Ecuador',
4058 'EG': 'Egypt',
4059 'SV': 'El Salvador',
4060 'GQ': 'Equatorial Guinea',
4061 'ER': 'Eritrea',
4062 'EE': 'Estonia',
4063 'ET': 'Ethiopia',
4064 'FK': 'Falkland Islands (Malvinas)',
4065 'FO': 'Faroe Islands',
4066 'FJ': 'Fiji',
4067 'FI': 'Finland',
4068 'FR': 'France',
4069 'GF': 'French Guiana',
4070 'PF': 'French Polynesia',
4071 'TF': 'French Southern Territories',
4072 'GA': 'Gabon',
4073 'GM': 'Gambia',
4074 'GE': 'Georgia',
4075 'DE': 'Germany',
4076 'GH': 'Ghana',
4077 'GI': 'Gibraltar',
4078 'GR': 'Greece',
4079 'GL': 'Greenland',
4080 'GD': 'Grenada',
4081 'GP': 'Guadeloupe',
4082 'GU': 'Guam',
4083 'GT': 'Guatemala',
4084 'GG': 'Guernsey',
4085 'GN': 'Guinea',
4086 'GW': 'Guinea-Bissau',
4087 'GY': 'Guyana',
4088 'HT': 'Haiti',
4089 'HM': 'Heard Island and McDonald Islands',
4090 'VA': 'Holy See (Vatican City State)',
4091 'HN': 'Honduras',
4092 'HK': 'Hong Kong',
4093 'HU': 'Hungary',
4094 'IS': 'Iceland',
4095 'IN': 'India',
4096 'ID': 'Indonesia',
4097 'IR': 'Iran, Islamic Republic of',
4098 'IQ': 'Iraq',
4099 'IE': 'Ireland',
4100 'IM': 'Isle of Man',
4101 'IL': 'Israel',
4102 'IT': 'Italy',
4103 'JM': 'Jamaica',
4104 'JP': 'Japan',
4105 'JE': 'Jersey',
4106 'JO': 'Jordan',
4107 'KZ': 'Kazakhstan',
4108 'KE': 'Kenya',
4109 'KI': 'Kiribati',
4110 'KP': 'Korea, Democratic People\'s Republic of',
4111 'KR': 'Korea, Republic of',
4112 'KW': 'Kuwait',
4113 'KG': 'Kyrgyzstan',
4114 'LA': 'Lao People\'s Democratic Republic',
4115 'LV': 'Latvia',
4116 'LB': 'Lebanon',
4117 'LS': 'Lesotho',
4118 'LR': 'Liberia',
4119 'LY': 'Libya',
4120 'LI': 'Liechtenstein',
4121 'LT': 'Lithuania',
4122 'LU': 'Luxembourg',
4123 'MO': 'Macao',
4124 'MK': 'Macedonia, the Former Yugoslav Republic of',
4125 'MG': 'Madagascar',
4126 'MW': 'Malawi',
4127 'MY': 'Malaysia',
4128 'MV': 'Maldives',
4129 'ML': 'Mali',
4130 'MT': 'Malta',
4131 'MH': 'Marshall Islands',
4132 'MQ': 'Martinique',
4133 'MR': 'Mauritania',
4134 'MU': 'Mauritius',
4135 'YT': 'Mayotte',
4136 'MX': 'Mexico',
4137 'FM': 'Micronesia, Federated States of',
4138 'MD': 'Moldova, Republic of',
4139 'MC': 'Monaco',
4140 'MN': 'Mongolia',
4141 'ME': 'Montenegro',
4142 'MS': 'Montserrat',
4143 'MA': 'Morocco',
4144 'MZ': 'Mozambique',
4145 'MM': 'Myanmar',
4146 'NA': 'Namibia',
4147 'NR': 'Nauru',
4148 'NP': 'Nepal',
4149 'NL': 'Netherlands',
4150 'NC': 'New Caledonia',
4151 'NZ': 'New Zealand',
4152 'NI': 'Nicaragua',
4153 'NE': 'Niger',
4154 'NG': 'Nigeria',
4155 'NU': 'Niue',
4156 'NF': 'Norfolk Island',
4157 'MP': 'Northern Mariana Islands',
4158 'NO': 'Norway',
4159 'OM': 'Oman',
4160 'PK': 'Pakistan',
4161 'PW': 'Palau',
4162 'PS': 'Palestine, State of',
4163 'PA': 'Panama',
4164 'PG': 'Papua New Guinea',
4165 'PY': 'Paraguay',
4166 'PE': 'Peru',
4167 'PH': 'Philippines',
4168 'PN': 'Pitcairn',
4169 'PL': 'Poland',
4170 'PT': 'Portugal',
4171 'PR': 'Puerto Rico',
4172 'QA': 'Qatar',
4173 'RE': 'Réunion',
4174 'RO': 'Romania',
4175 'RU': 'Russian Federation',
4176 'RW': 'Rwanda',
4177 'BL': 'Saint Barthélemy',
4178 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4179 'KN': 'Saint Kitts and Nevis',
4180 'LC': 'Saint Lucia',
4181 'MF': 'Saint Martin (French part)',
4182 'PM': 'Saint Pierre and Miquelon',
4183 'VC': 'Saint Vincent and the Grenadines',
4184 'WS': 'Samoa',
4185 'SM': 'San Marino',
4186 'ST': 'Sao Tome and Principe',
4187 'SA': 'Saudi Arabia',
4188 'SN': 'Senegal',
4189 'RS': 'Serbia',
4190 'SC': 'Seychelles',
4191 'SL': 'Sierra Leone',
4192 'SG': 'Singapore',
4193 'SX': 'Sint Maarten (Dutch part)',
4194 'SK': 'Slovakia',
4195 'SI': 'Slovenia',
4196 'SB': 'Solomon Islands',
4197 'SO': 'Somalia',
4198 'ZA': 'South Africa',
4199 'GS': 'South Georgia and the South Sandwich Islands',
4200 'SS': 'South Sudan',
4201 'ES': 'Spain',
4202 'LK': 'Sri Lanka',
4203 'SD': 'Sudan',
4204 'SR': 'Suriname',
4205 'SJ': 'Svalbard and Jan Mayen',
4206 'SZ': 'Swaziland',
4207 'SE': 'Sweden',
4208 'CH': 'Switzerland',
4209 'SY': 'Syrian Arab Republic',
4210 'TW': 'Taiwan, Province of China',
4211 'TJ': 'Tajikistan',
4212 'TZ': 'Tanzania, United Republic of',
4213 'TH': 'Thailand',
4214 'TL': 'Timor-Leste',
4215 'TG': 'Togo',
4216 'TK': 'Tokelau',
4217 'TO': 'Tonga',
4218 'TT': 'Trinidad and Tobago',
4219 'TN': 'Tunisia',
4220 'TR': 'Turkey',
4221 'TM': 'Turkmenistan',
4222 'TC': 'Turks and Caicos Islands',
4223 'TV': 'Tuvalu',
4224 'UG': 'Uganda',
4225 'UA': 'Ukraine',
4226 'AE': 'United Arab Emirates',
4227 'GB': 'United Kingdom',
4228 'US': 'United States',
4229 'UM': 'United States Minor Outlying Islands',
4230 'UY': 'Uruguay',
4231 'UZ': 'Uzbekistan',
4232 'VU': 'Vanuatu',
4233 'VE': 'Venezuela, Bolivarian Republic of',
4234 'VN': 'Viet Nam',
4235 'VG': 'Virgin Islands, British',
4236 'VI': 'Virgin Islands, U.S.',
4237 'WF': 'Wallis and Futuna',
4238 'EH': 'Western Sahara',
4239 'YE': 'Yemen',
4240 'ZM': 'Zambia',
4241 'ZW': 'Zimbabwe',
4242 }
4243
4244 @classmethod
4245 def short2full(cls, code):
4246 """Convert an ISO 3166-2 country code to the corresponding full name"""
4247 return cls._country_map.get(code.upper())
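# Illustrative lookup (input is case-insensitive):
#   ISO3166Utils.short2full('de')  # 'Germany'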
4248
4249
4250 class GeoUtils(object):
4251 # Major IPv4 address blocks per country
4252 _country_ip_map = {
4253 'AD': '46.172.224.0/19',
4254 'AE': '94.200.0.0/13',
4255 'AF': '149.54.0.0/17',
4256 'AG': '209.59.64.0/18',
4257 'AI': '204.14.248.0/21',
4258 'AL': '46.99.0.0/16',
4259 'AM': '46.70.0.0/15',
4260 'AO': '105.168.0.0/13',
4261 'AP': '182.50.184.0/21',
4262 'AQ': '23.154.160.0/24',
4263 'AR': '181.0.0.0/12',
4264 'AS': '202.70.112.0/20',
4265 'AT': '77.116.0.0/14',
4266 'AU': '1.128.0.0/11',
4267 'AW': '181.41.0.0/18',
4268 'AX': '185.217.4.0/22',
4269 'AZ': '5.197.0.0/16',
4270 'BA': '31.176.128.0/17',
4271 'BB': '65.48.128.0/17',
4272 'BD': '114.130.0.0/16',
4273 'BE': '57.0.0.0/8',
4274 'BF': '102.178.0.0/15',
4275 'BG': '95.42.0.0/15',
4276 'BH': '37.131.0.0/17',
4277 'BI': '154.117.192.0/18',
4278 'BJ': '137.255.0.0/16',
4279 'BL': '185.212.72.0/23',
4280 'BM': '196.12.64.0/18',
4281 'BN': '156.31.0.0/16',
4282 'BO': '161.56.0.0/16',
4283 'BQ': '161.0.80.0/20',
4284 'BR': '191.128.0.0/12',
4285 'BS': '24.51.64.0/18',
4286 'BT': '119.2.96.0/19',
4287 'BW': '168.167.0.0/16',
4288 'BY': '178.120.0.0/13',
4289 'BZ': '179.42.192.0/18',
4290 'CA': '99.224.0.0/11',
4291 'CD': '41.243.0.0/16',
4292 'CF': '197.242.176.0/21',
4293 'CG': '160.113.0.0/16',
4294 'CH': '85.0.0.0/13',
4295 'CI': '102.136.0.0/14',
4296 'CK': '202.65.32.0/19',
4297 'CL': '152.172.0.0/14',
4298 'CM': '102.244.0.0/14',
4299 'CN': '36.128.0.0/10',
4300 'CO': '181.240.0.0/12',
4301 'CR': '201.192.0.0/12',
4302 'CU': '152.206.0.0/15',
4303 'CV': '165.90.96.0/19',
4304 'CW': '190.88.128.0/17',
4305 'CY': '31.153.0.0/16',
4306 'CZ': '88.100.0.0/14',
4307 'DE': '53.0.0.0/8',
4308 'DJ': '197.241.0.0/17',
4309 'DK': '87.48.0.0/12',
4310 'DM': '192.243.48.0/20',
4311 'DO': '152.166.0.0/15',
4312 'DZ': '41.96.0.0/12',
4313 'EC': '186.68.0.0/15',
4314 'EE': '90.190.0.0/15',
4315 'EG': '156.160.0.0/11',
4316 'ER': '196.200.96.0/20',
4317 'ES': '88.0.0.0/11',
4318 'ET': '196.188.0.0/14',
4319 'EU': '2.16.0.0/13',
4320 'FI': '91.152.0.0/13',
4321 'FJ': '144.120.0.0/16',
4322 'FK': '80.73.208.0/21',
4323 'FM': '119.252.112.0/20',
4324 'FO': '88.85.32.0/19',
4325 'FR': '90.0.0.0/9',
4326 'GA': '41.158.0.0/15',
4327 'GB': '25.0.0.0/8',
4328 'GD': '74.122.88.0/21',
4329 'GE': '31.146.0.0/16',
4330 'GF': '161.22.64.0/18',
4331 'GG': '62.68.160.0/19',
4332 'GH': '154.160.0.0/12',
4333 'GI': '95.164.0.0/16',
4334 'GL': '88.83.0.0/19',
4335 'GM': '160.182.0.0/15',
4336 'GN': '197.149.192.0/18',
4337 'GP': '104.250.0.0/19',
4338 'GQ': '105.235.224.0/20',
4339 'GR': '94.64.0.0/13',
4340 'GT': '168.234.0.0/16',
4341 'GU': '168.123.0.0/16',
4342 'GW': '197.214.80.0/20',
4343 'GY': '181.41.64.0/18',
4344 'HK': '113.252.0.0/14',
4345 'HN': '181.210.0.0/16',
4346 'HR': '93.136.0.0/13',
4347 'HT': '148.102.128.0/17',
4348 'HU': '84.0.0.0/14',
4349 'ID': '39.192.0.0/10',
4350 'IE': '87.32.0.0/12',
4351 'IL': '79.176.0.0/13',
4352 'IM': '5.62.80.0/20',
4353 'IN': '117.192.0.0/10',
4354 'IO': '203.83.48.0/21',
4355 'IQ': '37.236.0.0/14',
4356 'IR': '2.176.0.0/12',
4357 'IS': '82.221.0.0/16',
4358 'IT': '79.0.0.0/10',
4359 'JE': '87.244.64.0/18',
4360 'JM': '72.27.0.0/17',
4361 'JO': '176.29.0.0/16',
4362 'JP': '133.0.0.0/8',
4363 'KE': '105.48.0.0/12',
4364 'KG': '158.181.128.0/17',
4365 'KH': '36.37.128.0/17',
4366 'KI': '103.25.140.0/22',
4367 'KM': '197.255.224.0/20',
4368 'KN': '198.167.192.0/19',
4369 'KP': '175.45.176.0/22',
4370 'KR': '175.192.0.0/10',
4371 'KW': '37.36.0.0/14',
4372 'KY': '64.96.0.0/15',
4373 'KZ': '2.72.0.0/13',
4374 'LA': '115.84.64.0/18',
4375 'LB': '178.135.0.0/16',
4376 'LC': '24.92.144.0/20',
4377 'LI': '82.117.0.0/19',
4378 'LK': '112.134.0.0/15',
4379 'LR': '102.183.0.0/16',
4380 'LS': '129.232.0.0/17',
4381 'LT': '78.56.0.0/13',
4382 'LU': '188.42.0.0/16',
4383 'LV': '46.109.0.0/16',
4384 'LY': '41.252.0.0/14',
4385 'MA': '105.128.0.0/11',
4386 'MC': '88.209.64.0/18',
4387 'MD': '37.246.0.0/16',
4388 'ME': '178.175.0.0/17',
4389 'MF': '74.112.232.0/21',
4390 'MG': '154.126.0.0/17',
4391 'MH': '117.103.88.0/21',
4392 'MK': '77.28.0.0/15',
4393 'ML': '154.118.128.0/18',
4394 'MM': '37.111.0.0/17',
4395 'MN': '49.0.128.0/17',
4396 'MO': '60.246.0.0/16',
4397 'MP': '202.88.64.0/20',
4398 'MQ': '109.203.224.0/19',
4399 'MR': '41.188.64.0/18',
4400 'MS': '208.90.112.0/22',
4401 'MT': '46.11.0.0/16',
4402 'MU': '105.16.0.0/12',
4403 'MV': '27.114.128.0/18',
4404 'MW': '102.70.0.0/15',
4405 'MX': '187.192.0.0/11',
4406 'MY': '175.136.0.0/13',
4407 'MZ': '197.218.0.0/15',
4408 'NA': '41.182.0.0/16',
4409 'NC': '101.101.0.0/18',
4410 'NE': '197.214.0.0/18',
4411 'NF': '203.17.240.0/22',
4412 'NG': '105.112.0.0/12',
4413 'NI': '186.76.0.0/15',
4414 'NL': '145.96.0.0/11',
4415 'NO': '84.208.0.0/13',
4416 'NP': '36.252.0.0/15',
4417 'NR': '203.98.224.0/19',
4418 'NU': '49.156.48.0/22',
4419 'NZ': '49.224.0.0/14',
4420 'OM': '5.36.0.0/15',
4421 'PA': '186.72.0.0/15',
4422 'PE': '186.160.0.0/14',
4423 'PF': '123.50.64.0/18',
4424 'PG': '124.240.192.0/19',
4425 'PH': '49.144.0.0/13',
4426 'PK': '39.32.0.0/11',
4427 'PL': '83.0.0.0/11',
4428 'PM': '70.36.0.0/20',
4429 'PR': '66.50.0.0/16',
4430 'PS': '188.161.0.0/16',
4431 'PT': '85.240.0.0/13',
4432 'PW': '202.124.224.0/20',
4433 'PY': '181.120.0.0/14',
4434 'QA': '37.210.0.0/15',
4435 'RE': '102.35.0.0/16',
4436 'RO': '79.112.0.0/13',
4437 'RS': '93.86.0.0/15',
4438 'RU': '5.136.0.0/13',
4439 'RW': '41.186.0.0/16',
4440 'SA': '188.48.0.0/13',
4441 'SB': '202.1.160.0/19',
4442 'SC': '154.192.0.0/11',
4443 'SD': '102.120.0.0/13',
4444 'SE': '78.64.0.0/12',
4445 'SG': '8.128.0.0/10',
4446 'SI': '188.196.0.0/14',
4447 'SK': '78.98.0.0/15',
4448 'SL': '102.143.0.0/17',
4449 'SM': '89.186.32.0/19',
4450 'SN': '41.82.0.0/15',
4451 'SO': '154.115.192.0/18',
4452 'SR': '186.179.128.0/17',
4453 'SS': '105.235.208.0/21',
4454 'ST': '197.159.160.0/19',
4455 'SV': '168.243.0.0/16',
4456 'SX': '190.102.0.0/20',
4457 'SY': '5.0.0.0/16',
4458 'SZ': '41.84.224.0/19',
4459 'TC': '65.255.48.0/20',
4460 'TD': '154.68.128.0/19',
4461 'TG': '196.168.0.0/14',
4462 'TH': '171.96.0.0/13',
4463 'TJ': '85.9.128.0/18',
4464 'TK': '27.96.24.0/21',
4465 'TL': '180.189.160.0/20',
4466 'TM': '95.85.96.0/19',
4467 'TN': '197.0.0.0/11',
4468 'TO': '175.176.144.0/21',
4469 'TR': '78.160.0.0/11',
4470 'TT': '186.44.0.0/15',
4471 'TV': '202.2.96.0/19',
4472 'TW': '120.96.0.0/11',
4473 'TZ': '156.156.0.0/14',
4474 'UA': '37.52.0.0/14',
4475 'UG': '102.80.0.0/13',
4476 'US': '6.0.0.0/8',
4477 'UY': '167.56.0.0/13',
4478 'UZ': '84.54.64.0/18',
4479 'VA': '212.77.0.0/19',
4480 'VC': '207.191.240.0/21',
4481 'VE': '186.88.0.0/13',
4482 'VG': '66.81.192.0/20',
4483 'VI': '146.226.0.0/16',
4484 'VN': '14.160.0.0/11',
4485 'VU': '202.80.32.0/20',
4486 'WF': '117.20.32.0/21',
4487 'WS': '202.4.32.0/19',
4488 'YE': '134.35.0.0/16',
4489 'YT': '41.242.116.0/22',
4490 'ZA': '41.0.0.0/11',
4491 'ZM': '102.144.0.0/13',
4492 'ZW': '102.177.192.0/18',
4493 }
4494
4495 @classmethod
4496 def random_ipv4(cls, code_or_block):
4497 if len(code_or_block) == 2:
4498 block = cls._country_ip_map.get(code_or_block.upper())
4499 if not block:
4500 return None
4501 else:
4502 block = code_or_block
4503 addr, preflen = block.split('/')
4504 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4505 addr_max = addr_min | (0xffffffff >> int(preflen))
4506 return compat_str(socket.inet_ntoa(
4507 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
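# Illustrative use (results are random): a 2-letter code selects the country
# block above, anything else is treated as a CIDR block directly:
#   GeoUtils.random_ipv4('DE')            # e.g. '53.12.34.56' (from 53.0.0.0/8)
#   GeoUtils.random_ipv4('192.0.2.0/24')  # e.g. '192.0.2.17'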
4508
4509
4510 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4511 def __init__(self, proxies=None):
4512 # Set default handlers
4513 for type in ('http', 'https'):
4514 setattr(self, '%s_open' % type,
4515 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4516 meth(r, proxy, type))
4517 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4518
4519 def proxy_open(self, req, proxy, type):
4520 req_proxy = req.headers.get('Ytdl-request-proxy')
4521 if req_proxy is not None:
4522 proxy = req_proxy
4523 del req.headers['Ytdl-request-proxy']
4524
4525 if proxy == '__noproxy__':
4526 return None # No Proxy
4527 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4528 req.add_header('Ytdl-socks-proxy', proxy)
4529 # yt-dlp's http/https handlers do the wrapping of the socket with SOCKS themselves
4530 return None
4531 return compat_urllib_request.ProxyHandler.proxy_open(
4532 self, req, proxy, type)
4533
4534
4535 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4536 # released into Public Domain
4537 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4538
4539 def long_to_bytes(n, blocksize=0):
4540 """long_to_bytes(n:long, blocksize:int) : string
4541 Convert a long integer to a byte string.
4542
4543 If optional blocksize is given and greater than zero, pad the front of the
4544 byte string with binary zeros so that the length is a multiple of
4545 blocksize.
4546 """
4547 # after much testing, this algorithm was deemed to be the fastest
4548 s = b''
4549 n = int(n)
4550 while n > 0:
4551 s = compat_struct_pack('>I', n & 0xffffffff) + s
4552 n = n >> 32
4553 # strip off leading zeros
4554 for i in range(len(s)):
4555 if s[i] != b'\000'[0]:
4556 break
4557 else:
4558 # only happens when n == 0
4559 s = b'\000'
4560 i = 0
4561 s = s[i:]
4562 # add back some pad bytes. this could be done more efficiently w.r.t. the
4563 # de-padding being done above, but sigh...
4564 if blocksize > 0 and len(s) % blocksize:
4565 s = (blocksize - len(s) % blocksize) * b'\000' + s
4566 return s
4567
4568
4569 def bytes_to_long(s):
4570 """bytes_to_long(string) : long
4571 Convert a byte string to a long integer.
4572
4573 This is (essentially) the inverse of long_to_bytes().
4574 """
4575 acc = 0
4576 length = len(s)
4577 if length % 4:
4578 extra = (4 - length % 4)
4579 s = b'\000' * extra + s
4580 length = length + extra
4581 for i in range(0, length, 4):
4582 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4583 return acc
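# Illustrative round trip (big-endian, as in PyCrypto):
#   bytes_to_long(b'\x01\x00')       # 256
#   long_to_bytes(256)               # b'\x01\x00'
#   long_to_bytes(256, blocksize=4)  # b'\x00\x00\x01\x00'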
4584
4585
4586 def ohdave_rsa_encrypt(data, exponent, modulus):
4587 '''
4588 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4589
4590 Input:
4591 data: data to encrypt, bytes-like object
4592 exponent, modulus: parameter e and N of RSA algorithm, both integer
4593 Output: hex string of encrypted data
4594
4595 Limitation: supports one block encryption only
4596 '''
4597
4598 payload = int(binascii.hexlify(data[::-1]), 16)
4599 encrypted = pow(payload, exponent, modulus)
4600 return '%x' % encrypted
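# Illustrative call with toy parameters (real sites supply a large e and N);
# note the input bytes are reversed before being interpreted as an integer:
#   ohdave_rsa_encrypt(b'\x02', 1, 255)  # '2', since pow(2, 1, 255) == 2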
4601
4602
4603 def pkcs1pad(data, length):
4604 """
4605 Padding input data with PKCS#1 scheme
4606
4607 @param {int[]} data input data
4608 @param {int} length target length
4609 @returns {int[]} padded data
4610 """
4611 if len(data) > length - 11:
4612 raise ValueError('Input data too long for PKCS#1 padding')
4613
4614 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4615 return [0, 2] + pseudo_random + [0] + data
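# Illustrative padding (the random filler makes the middle non-deterministic):
#   padded = pkcs1pad([1, 2, 3], 16)
#   padded[:2], padded[-4:], len(padded)  # ([0, 2], [0, 1, 2, 3], 16)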
4616
4617
4618 def encode_base_n(num, n, table=None):
4619 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4620 if not table:
4621 table = FULL_TABLE[:n]
4622
4623 if n > len(table):
4624 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4625
4626 if num == 0:
4627 return table[0]
4628
4629 ret = ''
4630 while num:
4631 ret = table[num % n] + ret
4632 num = num // n
4633 return ret
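# Illustrative encodings (default table, assumed values):
#   encode_base_n(255, 16)  # 'ff'
#   encode_base_n(255, 2)   # '11111111'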
4634
4635
4636 def decode_packed_codes(code):
4637 mobj = re.search(PACKED_CODES_RE, code)
4638 obfuscated_code, base, count, symbols = mobj.groups()
4639 base = int(base)
4640 count = int(count)
4641 symbols = symbols.split('|')
4642 symbol_table = {}
4643
4644 while count:
4645 count -= 1
4646 base_n_count = encode_base_n(count, base)
4647 symbol_table[base_n_count] = symbols[count] or base_n_count
4648
4649 return re.sub(
4650 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4651 obfuscated_code)
4652
4653
4654 def caesar(s, alphabet, shift):
4655 if shift == 0:
4656 return s
4657 l = len(alphabet)
4658 return ''.join(
4659 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4660 for c in s)
4661
4662
4663 def rot47(s):
4664 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
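# Illustrative use (assumed inputs); since the rot47 alphabet has 94
# characters and 47 * 2 == 94, rot47 is its own inverse:
#   caesar('ab', 'abc', 1)    # 'bc'
#   rot47(rot47('any text'))  # 'any text'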
4665
4666
4667 def parse_m3u8_attributes(attrib):
4668 info = {}
4669 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4670 if val.startswith('"'):
4671 val = val[1:-1]
4672 info[key] = val
4673 return info
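# Illustrative parse (quoted values keep embedded commas, quotes are stripped):
#   parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.4d401e"')
#   # {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.4d401e'}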
4674
4675
4676 def urshift(val, n):
4677 return val >> n if val >= 0 else (val + 0x100000000) >> n
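# Illustrative use: an unsigned 32-bit right shift, like JavaScript's '>>>':
#   urshift(-1, 1)  # 0x7fffffff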
4678
4679
4680 # Based on png2str() written by @gdkchan and improved by @yokrysty
4681 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4682 def decode_png(png_data):
4683 # Reference: https://www.w3.org/TR/PNG/
4684 header = png_data[8:]
4685
4686 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4687 raise IOError('Not a valid PNG file.')
4688
4689 int_map = {1: '>B', 2: '>H', 4: '>I'}
4690 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4691
4692 chunks = []
4693
4694 while header:
4695 length = unpack_integer(header[:4])
4696 header = header[4:]
4697
4698 chunk_type = header[:4]
4699 header = header[4:]
4700
4701 chunk_data = header[:length]
4702 header = header[length:]
4703
4704 header = header[4:] # Skip CRC
4705
4706 chunks.append({
4707 'type': chunk_type,
4708 'length': length,
4709 'data': chunk_data
4710 })
4711
4712 ihdr = chunks[0]['data']
4713
4714 width = unpack_integer(ihdr[:4])
4715 height = unpack_integer(ihdr[4:8])
4716
4717 idat = b''
4718
4719 for chunk in chunks:
4720 if chunk['type'] == b'IDAT':
4721 idat += chunk['data']
4722
4723 if not idat:
4724 raise IOError('Unable to read PNG data.')
4725
4726 decompressed_data = bytearray(zlib.decompress(idat))
4727
4728 stride = width * 3
4729 pixels = []
4730
4731 def _get_pixel(idx):
4732 x = idx % stride
4733 y = idx // stride
4734 return pixels[y][x]
4735
4736 for y in range(height):
4737 basePos = y * (1 + stride)
4738 filter_type = decompressed_data[basePos]
4739
4740 current_row = []
4741
4742 pixels.append(current_row)
4743
4744 for x in range(stride):
4745 color = decompressed_data[1 + basePos + x]
4746 basex = y * stride + x
4747 left = 0
4748 up = 0
4749
4750 if x > 2:
4751 left = _get_pixel(basex - 3)
4752 if y > 0:
4753 up = _get_pixel(basex - stride)
4754
4755 if filter_type == 1: # Sub
4756 color = (color + left) & 0xff
4757 elif filter_type == 2: # Up
4758 color = (color + up) & 0xff
4759 elif filter_type == 3: # Average
4760 color = (color + ((left + up) >> 1)) & 0xff
4761 elif filter_type == 4: # Paeth
4762 a = left
4763 b = up
4764 c = 0
4765
4766 if x > 2 and y > 0:
4767 c = _get_pixel(basex - stride - 3)
4768
4769 p = a + b - c
4770
4771 pa = abs(p - a)
4772 pb = abs(p - b)
4773 pc = abs(p - c)
4774
4775 if pa <= pb and pa <= pc:
4776 color = (color + a) & 0xff
4777 elif pb <= pc:
4778 color = (color + b) & 0xff
4779 else:
4780 color = (color + c) & 0xff
4781
4782 current_row.append(color)
4783
4784 return width, height, pixels
4785
4786
4787 def write_xattr(path, key, value):
4788 # This mess below finds the best xattr tool for the job
4789 try:
4790 # try the pyxattr module...
4791 import xattr
4792
4793 if hasattr(xattr, 'set'): # pyxattr
4794 # Unicode arguments are not supported in python-pyxattr until
4795 # version 0.5.0
4796 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4797 pyxattr_required_version = '0.5.0'
4798 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
4799 # TODO: fallback to CLI tools
4800 raise XAttrUnavailableError(
4801 'python-pyxattr is detected but is too old. '
4802 'yt-dlp requires %s or above while your version is %s. '
4803 'Falling back to other xattr implementations' % (
4804 pyxattr_required_version, xattr.__version__))
4805
4806 setxattr = xattr.set
4807 else: # xattr
4808 setxattr = xattr.setxattr
4809
4810 try:
4811 setxattr(path, key, value)
4812 except EnvironmentError as e:
4813 raise XAttrMetadataError(e.errno, e.strerror)
4814
4815 except ImportError:
4816 if compat_os_name == 'nt':
4817 # Write xattrs to NTFS Alternate Data Streams:
4818 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4819 assert ':' not in key
4820 assert os.path.exists(path)
4821
4822 ads_fn = path + ':' + key
4823 try:
4824 with open(ads_fn, 'wb') as f:
4825 f.write(value)
4826 except EnvironmentError as e:
4827 raise XAttrMetadataError(e.errno, e.strerror)
4828 else:
4829 user_has_setfattr = check_executable('setfattr', ['--version'])
4830 user_has_xattr = check_executable('xattr', ['-h'])
4831
4832 if user_has_setfattr or user_has_xattr:
4833
4834 value = value.decode('utf-8')
4835 if user_has_setfattr:
4836 executable = 'setfattr'
4837 opts = ['-n', key, '-v', value]
4838 elif user_has_xattr:
4839 executable = 'xattr'
4840 opts = ['-w', key, value]
4841
4842 cmd = ([encodeFilename(executable, True)]
4843 + [encodeArgument(o) for o in opts]
4844 + [encodeFilename(path, True)])
4845
4846 try:
4847 p = Popen(
4848 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4849 except EnvironmentError as e:
4850 raise XAttrMetadataError(e.errno, e.strerror)
4851 stdout, stderr = p.communicate_or_kill()
4852 stderr = stderr.decode('utf-8', 'replace')
4853 if p.returncode != 0:
4854 raise XAttrMetadataError(p.returncode, stderr)
4855
4856 else:
4857 # On Unix, but couldn't find pyxattr, setfattr, or xattr.
4858 if sys.platform.startswith('linux'):
4859 raise XAttrUnavailableError(
4860 "Couldn't find a tool to set the xattrs. "
4861 "Install either the python 'pyxattr' or 'xattr' "
4862 "modules, or the GNU 'attr' package "
4863 "(which contains the 'setfattr' tool).")
4864 else:
4865 raise XAttrUnavailableError(
4866 "Couldn't find a tool to set the xattrs. "
4867 "Install either the python 'xattr' module, "
4868 "or the 'xattr' binary.")
4869
4870
4871 def random_birthday(year_field, month_field, day_field):
4872 start_date = datetime.date(1950, 1, 1)
4873 end_date = datetime.date(1995, 12, 31)
4874 offset = random.randint(0, (end_date - start_date).days)
4875 random_date = start_date + datetime.timedelta(offset)
4876 return {
4877 year_field: str(random_date.year),
4878 month_field: str(random_date.month),
4879 day_field: str(random_date.day),
4880 }
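# Illustrative result (random date; all values are strings):
#   random_birthday('birth_year', 'birth_month', 'birth_day')
#   # e.g. {'birth_year': '1984', 'birth_month': '7', 'birth_day': '19'}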
4881
4882
4883 # Templates for internet shortcut files, which are plain text files.
4884 DOT_URL_LINK_TEMPLATE = '''
4885 [InternetShortcut]
4886 URL=%(url)s
4887 '''.lstrip()
4888
4889 DOT_WEBLOC_LINK_TEMPLATE = '''
4890 <?xml version="1.0" encoding="UTF-8"?>
4891 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4892 <plist version="1.0">
4893 <dict>
4894 \t<key>URL</key>
4895 \t<string>%(url)s</string>
4896 </dict>
4897 </plist>
4898 '''.lstrip()
4899
4900 DOT_DESKTOP_LINK_TEMPLATE = '''
4901 [Desktop Entry]
4902 Encoding=UTF-8
4903 Name=%(filename)s
4904 Type=Link
4905 URL=%(url)s
4906 Icon=text-html
4907 '''.lstrip()
4908
4909 LINK_TEMPLATES = {
4910 'url': DOT_URL_LINK_TEMPLATE,
4911 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4912 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4913 }
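# Illustrative expansion (assumed URL):
#   DOT_URL_LINK_TEMPLATE % {'url': 'https://example.com'}
#   # '[InternetShortcut]\nURL=https://example.com\n'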
4914
4915
4916 def iri_to_uri(iri):
4917 """
4918 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4919
4920 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding, except those already escaped, leaving the URI intact.
4921 """
4922
4923 iri_parts = compat_urllib_parse_urlparse(iri)
4924
4925 if '[' in iri_parts.netloc:
4926 raise ValueError('IPv6 URIs are not yet supported.')
4927 # Querying `.netloc` also raises a ValueError when there is only one bracket.
4928
4929 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4930
4931 net_location = ''
4932 if iri_parts.username:
4933 net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
4934 if iri_parts.password is not None:
4935 net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
4936 net_location += '@'
4937
4938 net_location += iri_parts.hostname.encode('idna').decode('utf-8') # Punycode for Unicode hostnames.
4939 # The 'idna' encoding produces ASCII text.
4940 if iri_parts.port is not None and iri_parts.port != 80:
4941 net_location += ':' + str(iri_parts.port)
4942
4943 return compat_urllib_parse_urlunparse(
4944 (iri_parts.scheme,
4945 net_location,
4946
4947 compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
4948
4949 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
4950 compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
4951
4952 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
4953 compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
4954
4955 compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4956
4957 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
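# Illustrative conversion (assumed IRI): non-ASCII path/query characters are
# percent-escaped as UTF-8 while the already-ASCII parts are left alone:
#   iri_to_uri('http://example.com/π?q=ä')  # 'http://example.com/%CF%80?q=%C3%A4'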
4958
4959
4960 def to_high_limit_path(path):
4961 if sys.platform in ['win32', 'cygwin']:
4962 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
4963 return r'\\?\ '.rstrip() + os.path.abspath(path)
4964
4965 return path
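# Illustrative behaviour (only effective on win32/cygwin; assumed path):
#   to_high_limit_path(r'C:\videos\clip.mp4')  # r'\\?\C:\videos\clip.mp4'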
4966
4967
4968 def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
4969 if field is None:
4970 val = obj if obj is not None else default
4971 else:
4972 val = obj.get(field, default)
4973 if func and val not in ignore:
4974 val = func(val)
4975 return template % val if val not in ignore else default
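# Illustrative formatting (assumed info dicts):
#   format_field({'height': 1080}, 'height', '%sp')  # '1080p'
#   format_field({'height': None}, 'height', '%sp')  # '' (None is ignored)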
4976
4977
4978 def clean_podcast_url(url):
4979 return re.sub(r'''(?x)
4980 (?:
4981 (?:
4982 chtbl\.com/track|
4983 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
4984 play\.podtrac\.com
4985 )/[^/]+|
4986 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4987 flex\.acast\.com|
4988 pd(?:
4989 cn\.co| # https://podcorn.com/analytics-prefix/
4990 st\.fm # https://podsights.com/docs/
4991 )/e
4992 )/''', '', url)
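# Illustrative cleanup (assumed URL): tracking-prefix hops are stripped,
# leaving the direct media URL:
#   clean_podcast_url('https://chtbl.com/track/XXXX/traffic.example.com/e.mp3')
#   # 'https://traffic.example.com/e.mp3'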
4993
4994
4995 _HEX_TABLE = '0123456789abcdef'
4996
4997
4998 def random_uuidv4():
4999 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
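# Illustrative output (random): only the version nibble is fixed to 4; the
# 'y' position is fully random, unlike a strict RFC 4122 variant nibble:
#   random_uuidv4()  # e.g. 'd9e330ba-3f62-4a37-9b55-2f6fb1e0b2cf'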
5000
5001
5002 def make_dir(path, to_screen=None):
5003 try:
5004 dn = os.path.dirname(path)
5005 if dn and not os.path.exists(dn):
5006 os.makedirs(dn)
5007 return True
5008 except (OSError, IOError) as err:
5009 if callable(to_screen):
5010 to_screen('unable to create directory ' + error_to_compat_str(err))
5011 return False
5012
5013
5014 def get_executable_path():
5015 from zipimport import zipimporter
5016 if hasattr(sys, 'frozen'): # Running from PyInstaller
5017 path = os.path.dirname(sys.executable)
5018 elif isinstance(globals().get('__loader__'), zipimporter): # Running from ZIP
5019 path = os.path.join(os.path.dirname(__file__), '../..')
5020 else:
5021 path = os.path.join(os.path.dirname(__file__), '..')
5022 return os.path.abspath(path)
5023
5024
5025 def load_plugins(name, suffix, namespace):
5026 classes = {}
5027 try:
5028 plugins_spec = importlib.util.spec_from_file_location(
5029 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5030 plugins = importlib.util.module_from_spec(plugins_spec)
5031 sys.modules[plugins_spec.name] = plugins
5032 plugins_spec.loader.exec_module(plugins)
5033 for name in dir(plugins):
5034 if name in namespace:
5035 continue
5036 if not name.endswith(suffix):
5037 continue
5038 klass = getattr(plugins, name)
5039 classes[name] = namespace[name] = klass
5040 except FileNotFoundError:
5041 pass
5042 return classes
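# Illustrative call (this mirrors how plugin extractors/postprocessors are
# picked up from a 'ytdlp_plugins' package next to the executable):
#   load_plugins('extractor', 'IE', globals())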
5043
5044
5045 def traverse_obj(
5046 obj, *path_list, default=None, expected_type=None, get_all=True,
5047 casesense=True, is_user_input=False, traverse_string=False):
5048 ''' Traverse nested list/dict/tuple
5049 @param path_list A list of paths which are checked one by one.
5050 Each path is a list of keys where each key is a string,
5051 a function, a tuple of strings/None or "...".
5052 When a function is given, it takes the key as argument and
5053 returns whether the key matches or not. When a tuple is given,
5054 all the keys given in the tuple are traversed, and
5055 "..." traverses all the keys in the object
5056 "None" returns the object without traversal
5057 @param default Default value to return
5058 @param expected_type Only accept final value of this type (Can also be any callable)
5059 @param get_all Return all the values obtained from a path or only the first one
5060 @param casesense Whether to consider dictionary keys as case sensitive
5061 @param is_user_input Whether the keys are generated from user input. If True,
5062 strings are converted to int/slice if necessary
5063 @param traverse_string Whether to traverse inside strings. If True, any
5064 non-compatible object will also be converted into a string
5065 # TODO: Write tests
5066 '''
5067 if not casesense:
5068 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5069 path_list = (map(_lower, variadic(path)) for path in path_list)
5070
5071 def _traverse_obj(obj, path, _current_depth=0):
5072 nonlocal depth
5073 path = tuple(variadic(path))
5074 for i, key in enumerate(path):
5075 if None in (key, obj):
5076 return obj
5077 if isinstance(key, (list, tuple)):
5078 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5079 key = ...
5080 if key is ...:
5081 obj = (obj.values() if isinstance(obj, dict)
5082 else obj if isinstance(obj, (list, tuple, LazyList))
5083 else str(obj) if traverse_string else [])
5084 _current_depth += 1
5085 depth = max(depth, _current_depth)
5086 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5087 elif callable(key):
5088 if isinstance(obj, (list, tuple, LazyList)):
5089 obj = enumerate(obj)
5090 elif isinstance(obj, dict):
5091 obj = obj.items()
5092 else:
5093 if not traverse_string:
5094 return None
5095 obj = str(obj)
5096 _current_depth += 1
5097 depth = max(depth, _current_depth)
5098 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
5099 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5100 obj = (obj.get(key) if casesense or (key in obj)
5101 else next((v for k, v in obj.items() if _lower(k) == key), None))
5102 else:
5103 if is_user_input:
5104 key = (int_or_none(key) if ':' not in key
5105 else slice(*map(int_or_none, key.split(':'))))
5106 if key == slice(None):
5107 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5108 if not isinstance(key, (int, slice)):
5109 return None
5110 if not isinstance(obj, (list, tuple, LazyList)):
5111 if not traverse_string:
5112 return None
5113 obj = str(obj)
5114 try:
5115 obj = obj[key]
5116 except IndexError:
5117 return None
5118 return obj
5119
5120 if isinstance(expected_type, type):
5121 type_test = lambda val: val if isinstance(val, expected_type) else None
5122 elif expected_type is not None:
5123 type_test = expected_type
5124 else:
5125 type_test = lambda val: val
5126
5127 for path in path_list:
5128 depth = 0
5129 val = _traverse_obj(obj, path)
5130 if val is not None:
5131 if depth:
5132 for _ in range(depth - 1):
5133 val = itertools.chain.from_iterable(v for v in val if v is not None)
5134 val = [v for v in map(type_test, val) if v is not None]
5135 if val:
5136 return val if get_all else val[0]
5137 else:
5138 val = type_test(val)
5139 if val is not None:
5140 return val
5141 return default
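# Illustrative traversals (assumed objects):
#   traverse_obj({'a': [{'b': 1}, {'b': 2}]}, ('a', ..., 'b'))     # [1, 2]
#   traverse_obj({'a': [{'b': 1}]}, ('a', 0, 'c'), ('a', 0, 'b'))  # 1 (first matching path)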
5142
5143
5144 def traverse_dict(dictn, keys, casesense=True):
5145 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5146 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5147 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5148
5149
5150 def variadic(x, allowed_types=(str, bytes, dict)):
5151 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
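# Illustrative use: wraps scalars in a tuple, passes other iterables through:
#   variadic('spam')  # ('spam',)
#   variadic([1, 2])  # [1, 2]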
5152
5153
5154 # create a JSON Web Signature (jws) with HS256 algorithm
5155 # the resulting format is in JWS Compact Serialization
5156 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5157 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5158 def jwt_encode_hs256(payload_data, key, headers={}):
5159 header_data = {
5160 'alg': 'HS256',
5161 'typ': 'JWT',
5162 }
5163 if headers:
5164 header_data.update(headers)
5165 header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
5166 payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
5167 h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
5168 signature_b64 = base64.b64encode(h.digest())
5169 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5170 return token
5171
5172
5173 # can be extended in the future to verify the signature and parse the header, returning the algorithm used if it's not HS256
5174 def jwt_decode_hs256(jwt):
5175 header_b64, payload_b64, signature_b64 = jwt.split('.')
5176 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5177 return payload_data
5178
5179
5180 def supports_terminal_sequences(stream):
5181 if compat_os_name == 'nt':
5182 from .compat import WINDOWS_VT_MODE # Must be imported locally
5183 if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
5184 return False
5185 elif not os.getenv('TERM'):
5186 return False
5187 try:
5188 return stream.isatty()
5189 except BaseException:
5190 return False
5191
5192
5193 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5194
5195
5196 def remove_terminal_sequences(string):
5197 return _terminal_sequences_re.sub('', string)
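# Illustrative use (assumed input):
#   remove_terminal_sequences('\033[32mOK\033[0m')  # 'OK'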
5198
5199
5200 def number_of_digits(number):
5201 return len('%d' % number)
5202
5203
5204 def join_nonempty(*values, delim='-', from_dict=None):
5205 if from_dict is not None:
5206 values = map(from_dict.get, values)
5207 return delim.join(map(str, filter(None, values)))
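# Illustrative joins (falsy values, including 0 and '', are dropped):
#   join_nonempty('720p', None, 'dash')     # '720p-dash'
#   join_nonempty('a', '', 'b', delim='.')  # 'a.b'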
5208
5209
5210 class Config:
5211 own_args = None
5212 filename = None
5213 __initialized = False
5214
5215 def __init__(self, parser, label=None):
5216 self._parser, self.label = parser, label
5217 self._loaded_paths, self.configs = set(), []
5218
5219 def init(self, args=None, filename=None):
5220 assert not self.__initialized
5221 if filename:
5222 location = os.path.realpath(filename)
5223 if location in self._loaded_paths:
5224 return False
5225 self._loaded_paths.add(location)
5226
5227 self.__initialized = True
5228 self.own_args, self.filename = args, filename
5229 for location in self._parser.parse_args(args)[0].config_locations or []:
5230 location = compat_expanduser(location)
5231 if os.path.isdir(location):
5232 location = os.path.join(location, 'yt-dlp.conf')
5233 if not os.path.exists(location):
5234 self._parser.error(f'config location {location} does not exist')
5235 self.append_config(self.read_file(location), location)
5236 return True
5237
5238 def __str__(self):
5239 label = join_nonempty(
5240 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5241 delim=' ')
5242 return join_nonempty(
5243 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5244 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5245 delim='\n')
5246
5247 @staticmethod
5248 def read_file(filename, default=[]):
5249 try:
5250 optionf = open(filename)
5251 except IOError:
5252 return default # silently skip if file is not present
5253 try:
5254 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5255 contents = optionf.read()
5256 if sys.version_info < (3,):
5257 contents = contents.decode(preferredencoding())
5258 res = compat_shlex_split(contents, comments=True)
5259 finally:
5260 optionf.close()
5261 return res
5262
5263 @staticmethod
5264 def hide_login_info(opts):
5265 PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
5266 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5267
5268 def _scrub_eq(o):
5269 m = eqre.match(o)
5270 if m:
5271 return m.group('key') + '=PRIVATE'
5272 else:
5273 return o
5274
5275 opts = list(map(_scrub_eq, opts))
5276 for idx, opt in enumerate(opts):
5277 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5278 opts[idx + 1] = 'PRIVATE'
5279 return opts
5280
5281 def append_config(self, *args, label=None):
5282 config = type(self)(self._parser, label)
5283 config._loaded_paths = self._loaded_paths
5284 if config.init(*args):
5285 self.configs.append(config)
5286
5287 @property
5288 def all_args(self):
5289 for config in reversed(self.configs):
5290 yield from config.all_args
5291 yield from self.own_args or []
5292
5293 def parse_args(self):
5294 return self._parser.parse_args(list(self.all_args))