#!/usr/bin/env python3
# coding: utf-8

from __future__ import unicode_literals

import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.utils
import email.header
import errno
import functools
import gzip
import hashlib
import hmac
import importlib.util
import io
import itertools
import json
import locale
import math
import operator
import os
import platform
import random
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import traceback
import xml.etree.ElementTree
import zlib
import mimetypes

from .compat import (
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_HTTPError,
    compat_basestring,
    compat_chr,
    compat_cookiejar,
    compat_ctypes_WINFUNCTYPE,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_integer_types,
    compat_numeric_types,
    compat_kwargs,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_split,
    compat_shlex_quote,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_urlunparse,
    compat_urllib_parse_quote,
    compat_urllib_parse_quote_plus,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        try:
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        except OSError:
            pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

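# Illustrative doctest-style sketch for xpath_with_ns (the namespace URI is a
# made-up placeholder, not something the code above requires):
#   >>> xpath_with_ns('media:song/media:author', {'media': 'http://example.com/'})
#   '{http://example.com/}song/{http://example.com/}author'

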
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)

def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of all tags with the specified attribute in the passed HTML document, as a list"""

    value = re.escape(value) if escape_value else value

    retlist = []
    for m in re.finditer(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html):
        res = m.group('content')

        if res.startswith('"') or res.startswith("'"):
            res = res[1:-1]

        retlist.append(unescapeHTML(res))

    return retlist

class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1

def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs

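# Illustrative sketch for extract_attributes (note that HTMLParser
# lower-cases attribute names, so 'B' above becomes 'b'):
#   >>> extract_attributes('<e x="y">')
#   {'x': 'y'}
#   >>> extract_attributes('<e x=y>')
#   {'x': 'y'}

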
def parse_list(webpage):
    """Given a string containing a series of HTML <li> elements,
    return a list of dictionaries with their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items

def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()

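# Illustrative sketch for clean_html: <br> turns into a newline and the
# remaining tags are stripped:
#   >>> clean_html('<p>a<br/>b</p>')
#   'a\nb'

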
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return ' '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    if s == '':
        return ''
    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

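# Illustrative sketches for sanitize_filename; outputs follow the
# replace_insane rules above:
#   >>> sanitize_filename('New World record at 0:12:34')
#   'New World record at 0_12_34'
#   >>> sanitize_filename('yes! no?', restricted=True)
#   'yes_no'

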
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
        if sys.version_info < (2, 7) and not drive_or_unc:
            drive_or_unc, _ = os.path.splitunc(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url


def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
    return url, 'Basic ' + auth_payload.decode('utf-8')

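# Illustrative sketch for extract_basic_auth (the credentials and host are
# placeholders): the userinfo is stripped from the URL and returned as a
# ready-to-use Authorization header value:
#   >>> extract_basic_auth('http://user:pass@example.com/x')
#   ('http://example.com/x', 'Basic dXNlcjpwYXNz')

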
def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res

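# Illustrative sketch for orderedSet: duplicates are dropped while the
# first-seen order is preserved:
#   >>> orderedSet([1, 2, 1, 3, 2])
#   [1, 2, 3]

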
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )

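# Illustrative sketch for escapeHTML ('&' is replaced first, so already
# escaped input would be double-escaped):
#   >>> escapeHTML('"a" < \'b\' & c')
#   '&quot;a&quot; &lt; &#39;b&#39; &amp; c'

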
def process_communicate_or_kill(p, *args, **kwargs):
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        return process_communicate_or_kill(self, *args, **kwargs)


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret

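# Illustrative sketches for timetuple_from_msec and formatSeconds:
#   >>> timetuple_from_msec(345244)
#   Time(hours=0, minutes=5, seconds=45, milliseconds=244)
#   >>> formatSeconds(3723)
#   '1:02:03'

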
def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        try:
            ssl_context.load_verify_locations(cadata=cert)
        except ssl.SSLError:
            pass


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                # Create a new context to discard any certificates that were already loaded
                context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
                context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    if ytdl_is_updateable():
        update_cmd = 'type yt-dlp -U to update'
    else:
        update_cmd = 'see https://github.com/yt-dlp/yt-dlp on how to update'
    msg = 'please report this issue on https://github.com/yt-dlp/yt-dlp .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call yt-dlp with the --verbose flag and include its complete output.'

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super(ExtractorError, self).__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            self.msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super(GeoRestrictedError, self).__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure:
                    secure = 'TRUE'
                else:
                    secure = 'FALSE'
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                else:
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                else:
                    expires = ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]), so we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # On python 2 urlh.geturl() may sometimes return redirect URL
        # as byte string instead of unicode. This workaround allows
        # to force it always return unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True)


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                          # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                        # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|               # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                         # optional space
                (?P<sign>\+|-)                               # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})   # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        pass

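# Illustrative sketch for parse_iso8601: the timezone offset is extracted by
# extract_timezone and folded into the returned UNIX timestamp:
#   >>> parse_iso8601('2014-03-23T23:04:26+0100')
#   1395612266

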
def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)

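# Illustrative sketch for unified_strdate:
#   >>> unified_strdate('December 21, 2010')
#   '20101221'

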
def unified_timestamp(date_str, day_first=True):
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    # Python only supports microseconds, so remove nanoseconds
    m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
    if m:
        date_str = m.group(1)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600


def determine_ext(url, default_ext='unknown_video'):
    if url is None or '.' not in url:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext

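# Illustrative sketches for determine_ext (the URL is a placeholder):
#   >>> determine_ext('song.mp3')
#   'mp3'
#   >>> determine_ext('http://example.com/foo/bar.mp4/?download')
#   'mp4'

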
1720 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1721 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1722
1723
1724 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1725 """
1726 Return a datetime object from a string in the format YYYYMMDD or
1727 (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1728
1729 format: string date format used to return datetime object from
1730 precision: round the time portion of a datetime object.
1731 auto|microsecond|second|minute|hour|day.
1732 auto: round to the unit provided in date_str (if applicable).
1733 """
1734 auto_precision = False
1735 if precision == 'auto':
1736 auto_precision = True
1737 precision = 'microsecond'
1738 today = datetime_round(datetime.datetime.now(), precision)
1739 if date_str in ('now', 'today'):
1740 return today
1741 if date_str == 'yesterday':
1742 return today - datetime.timedelta(days=1)
1743 match = re.match(
1744 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
1745 date_str)
1746 if match is not None:
1747 start_time = datetime_from_str(match.group('start'), precision, format)
1748 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1749 unit = match.group('unit')
1750 if unit == 'month' or unit == 'year':
1751 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1752 unit = 'day'
1753 else:
1754 if unit == 'week':
1755 unit = 'day'
1756 time *= 7
1757 delta = datetime.timedelta(**{unit + 's': time})
1758 new_date = start_time + delta
1759 if auto_precision:
1760 return datetime_round(new_date, unit)
1761 return new_date
1762
1763 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
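
# Illustrative uses of the relative-date syntax (assumed behaviour):
#   datetime_from_str('now-1day')                       # yesterday, rounded to the day ('auto')
#   datetime_from_str('now-2hours', precision='second') # two hours ago, rounded to the second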
1764
1765
1766 def date_from_str(date_str, format='%Y%m%d'):
1767 """
1768 Return a date object from a string in the format YYYYMMDD or
1769 (now|today|yesterday|date)[+-][0-9]+(microsecond|second|minute|hour|day|week|month|year)(s)?
1770
1771 format: string date format used to parse date_str
1772 """
1773 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1774
1775
1776 def datetime_add_months(dt, months):
1777 """Increment/Decrement a datetime object by months."""
1778 month = dt.month + months - 1
1779 year = dt.year + month // 12
1780 month = month % 12 + 1
1781 day = min(dt.day, calendar.monthrange(year, month)[1])
1782 return dt.replace(year, month, day)
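
# The day is clamped to the length of the target month (assumed):
#   >>> datetime_add_months(datetime.datetime(2021, 1, 31), 1)
#   datetime.datetime(2021, 2, 28, 0, 0)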
1783
1784
1785 def datetime_round(dt, precision='day'):
1786 """
1787 Round a datetime object's time to a specific precision
1788 """
1789 if precision == 'microsecond':
1790 return dt
1791
1792 unit_seconds = {
1793 'day': 86400,
1794 'hour': 3600,
1795 'minute': 60,
1796 'second': 1,
1797 }
1798 roundto = lambda x, n: ((x + n / 2) // n) * n
1799 timestamp = calendar.timegm(dt.timetuple())
1800 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1801
1802
1803 def hyphenate_date(date_str):
1804 """
1805 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1806 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1807 if match is not None:
1808 return '-'.join(match.groups())
1809 else:
1810 return date_str
1811
1812
1813 class DateRange(object):
1814 """Represents a time interval between two dates"""
1815
1816 def __init__(self, start=None, end=None):
1817 """start and end must be strings in the format accepted by date"""
1818 if start is not None:
1819 self.start = date_from_str(start)
1820 else:
1821 self.start = datetime.datetime.min.date()
1822 if end is not None:
1823 self.end = date_from_str(end)
1824 else:
1825 self.end = datetime.datetime.max.date()
1826 if self.start > self.end:
1827 raise ValueError('Date range: "%s", the start date must be before the end date' % self)
1828
1829 @classmethod
1830 def day(cls, day):
1831 """Returns a range that only contains the given day"""
1832 return cls(day, day)
1833
1834 def __contains__(self, date):
1835 """Check if the date is in the range"""
1836 if not isinstance(date, datetime.date):
1837 date = date_from_str(date)
1838 return self.start <= date <= self.end
1839
1840 def __str__(self):
1841 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
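
# Illustrative usage (assumed):
#   >>> '20210615' in DateRange('20210101', '20211231')
#   True
#   >>> '20220101' in DateRange.day('20210615')
#   False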
1842
1843
1844 def platform_name():
1845 """ Returns the platform name as a compat_str """
1846 res = platform.platform()
1847 if isinstance(res, bytes):
1848 res = res.decode(preferredencoding())
1849
1850 assert isinstance(res, compat_str)
1851 return res
1852
1853
1854 def get_windows_version():
1855 ''' Get Windows version. None if it's not running on Windows '''
1856 if compat_os_name == 'nt':
1857 return version_tuple(platform.win32_ver()[1])
1858 else:
1859 return None
1860
1861
1862 def _windows_write_string(s, out):
1863 """ Returns True if the string was written using special methods,
1864 False if it has yet to be written out."""
1865 # Adapted from http://stackoverflow.com/a/3259271/35070
1866
1867 import ctypes.wintypes
1868
1869 WIN_OUTPUT_IDS = {
1870 1: -11,
1871 2: -12,
1872 }
1873
1874 try:
1875 fileno = out.fileno()
1876 except AttributeError:
1877 # If the output stream doesn't have a fileno, it's virtual
1878 return False
1879 except io.UnsupportedOperation:
1880 # Some strange Windows pseudo files?
1881 return False
1882 if fileno not in WIN_OUTPUT_IDS:
1883 return False
1884
1885 GetStdHandle = compat_ctypes_WINFUNCTYPE(
1886 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
1887 ('GetStdHandle', ctypes.windll.kernel32))
1888 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1889
1890 WriteConsoleW = compat_ctypes_WINFUNCTYPE(
1891 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1892 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1893 ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
1894 written = ctypes.wintypes.DWORD(0)
1895
1896 GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
1897 FILE_TYPE_CHAR = 0x0002
1898 FILE_TYPE_REMOTE = 0x8000
1899 GetConsoleMode = compat_ctypes_WINFUNCTYPE(
1900 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1901 ctypes.POINTER(ctypes.wintypes.DWORD))(
1902 ('GetConsoleMode', ctypes.windll.kernel32))
1903 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1904
1905 def not_a_console(handle):
1906 if handle == INVALID_HANDLE_VALUE or handle is None:
1907 return True
1908 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1909 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1910
1911 if not_a_console(h):
1912 return False
1913
1914 def next_nonbmp_pos(s):
1915 try:
1916 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1917 except StopIteration:
1918 return len(s)
1919
1920 while s:
1921 count = min(next_nonbmp_pos(s), 1024)
1922
1923 ret = WriteConsoleW(
1924 h, s, count if count else 2, ctypes.byref(written), None)
1925 if ret == 0:
1926 raise OSError('Failed to write string')
1927 if not count: # We just wrote a non-BMP character
1928 assert written.value == 2
1929 s = s[1:]
1930 else:
1931 assert written.value > 0
1932 s = s[written.value:]
1933 return True
1934
1935
1936 def write_string(s, out=None, encoding=None):
1937 if out is None:
1938 out = sys.stderr
1939 assert type(s) == compat_str
1940
1941 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1942 if _windows_write_string(s, out):
1943 return
1944
1945 if ('b' in getattr(out, 'mode', '')
1946 or sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1947 byt = s.encode(encoding or preferredencoding(), 'ignore')
1948 out.write(byt)
1949 elif hasattr(out, 'buffer'):
1950 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1951 byt = s.encode(enc, 'ignore')
1952 out.buffer.write(byt)
1953 else:
1954 out.write(s)
1955 out.flush()
1956
1957
1958 def bytes_to_intlist(bs):
1959 if not bs:
1960 return []
1961 if isinstance(bs[0], int): # Python 3
1962 return list(bs)
1963 else:
1964 return [ord(c) for c in bs]
1965
1966
1967 def intlist_to_bytes(xs):
1968 if not xs:
1969 return b''
1970 return compat_struct_pack('%dB' % len(xs), *xs)
1971
1972
1973 # Cross-platform file locking
1974 if sys.platform == 'win32':
1975 import ctypes.wintypes
1976 import msvcrt
1977
1978 class OVERLAPPED(ctypes.Structure):
1979 _fields_ = [
1980 ('Internal', ctypes.wintypes.LPVOID),
1981 ('InternalHigh', ctypes.wintypes.LPVOID),
1982 ('Offset', ctypes.wintypes.DWORD),
1983 ('OffsetHigh', ctypes.wintypes.DWORD),
1984 ('hEvent', ctypes.wintypes.HANDLE),
1985 ]
1986
1987 kernel32 = ctypes.windll.kernel32
1988 LockFileEx = kernel32.LockFileEx
1989 LockFileEx.argtypes = [
1990 ctypes.wintypes.HANDLE, # hFile
1991 ctypes.wintypes.DWORD, # dwFlags
1992 ctypes.wintypes.DWORD, # dwReserved
1993 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1994 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1995 ctypes.POINTER(OVERLAPPED) # Overlapped
1996 ]
1997 LockFileEx.restype = ctypes.wintypes.BOOL
1998 UnlockFileEx = kernel32.UnlockFileEx
1999 UnlockFileEx.argtypes = [
2000 ctypes.wintypes.HANDLE, # hFile
2001 ctypes.wintypes.DWORD, # dwReserved
2002 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2003 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2004 ctypes.POINTER(OVERLAPPED) # Overlapped
2005 ]
2006 UnlockFileEx.restype = ctypes.wintypes.BOOL
2007 whole_low = 0xffffffff
2008 whole_high = 0x7fffffff
2009
2010 def _lock_file(f, exclusive):
2011 overlapped = OVERLAPPED()
2012 overlapped.Offset = 0
2013 overlapped.OffsetHigh = 0
2014 overlapped.hEvent = 0
2015 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2016 handle = msvcrt.get_osfhandle(f.fileno())
2017 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
2018 whole_low, whole_high, f._lock_file_overlapped_p):
2019 raise OSError('Locking file failed: %r' % ctypes.FormatError())
2020
2021 def _unlock_file(f):
2022 assert f._lock_file_overlapped_p
2023 handle = msvcrt.get_osfhandle(f.fileno())
2024 if not UnlockFileEx(handle, 0,
2025 whole_low, whole_high, f._lock_file_overlapped_p):
2026 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2027
2028 else:
2029 # Some platforms, such as Jython, are missing fcntl
2030 try:
2031 import fcntl
2032
2033 def _lock_file(f, exclusive):
2034 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
2035
2036 def _unlock_file(f):
2037 fcntl.flock(f, fcntl.LOCK_UN)
2038 except ImportError:
2039 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
2040
2041 def _lock_file(f, exclusive):
2042 raise IOError(UNSUPPORTED_MSG)
2043
2044 def _unlock_file(f):
2045 raise IOError(UNSUPPORTED_MSG)
2046
2047
2048 class locked_file(object):
2049 def __init__(self, filename, mode, encoding=None):
2050 assert mode in ['r', 'a', 'w']
2051 self.f = io.open(filename, mode, encoding=encoding)
2052 self.mode = mode
2053
2054 def __enter__(self):
2055 exclusive = self.mode != 'r'
2056 try:
2057 _lock_file(self.f, exclusive)
2058 except IOError:
2059 self.f.close()
2060 raise
2061 return self
2062
2063 def __exit__(self, etype, value, traceback):
2064 try:
2065 _unlock_file(self.f)
2066 finally:
2067 self.f.close()
2068
2069 def __iter__(self):
2070 return iter(self.f)
2071
2072 def write(self, *args):
2073 return self.f.write(*args)
2074
2075 def read(self, *args):
2076 return self.f.read(*args)
2077
2078
2079 def get_filesystem_encoding():
2080 encoding = sys.getfilesystemencoding()
2081 return encoding if encoding is not None else 'utf-8'
2082
2083
2084 def shell_quote(args):
2085 quoted_args = []
2086 encoding = get_filesystem_encoding()
2087 for a in args:
2088 if isinstance(a, bytes):
2089 # We may get a filename encoded with 'encodeFilename'
2090 a = a.decode(encoding)
2091 quoted_args.append(compat_shlex_quote(a))
2092 return ' '.join(quoted_args)
2093
2094
2095 def smuggle_url(url, data):
2096 """ Pass additional data in a URL for internal use. """
2097
2098 url, idata = unsmuggle_url(url, {})
2099 data.update(idata)
2100 sdata = compat_urllib_parse_urlencode(
2101 {'__youtubedl_smuggle': json.dumps(data)})
2102 return url + '#' + sdata
2103
2104
2105 def unsmuggle_url(smug_url, default=None):
2106 if '#__youtubedl_smuggle' not in smug_url:
2107 return smug_url, default
2108 url, _, sdata = smug_url.rpartition('#')
2109 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2110 data = json.loads(jsond)
2111 return url, data
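
# The two helpers round-trip (assumed):
#   >>> unsmuggle_url(smuggle_url('https://example.com/v', {'referer': 'x'}))
#   ('https://example.com/v', {'referer': 'x'})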
2112
2113
2114 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2115 """ Formats numbers with decimal sufixes like K, M, etc """
2116 num, factor = float_or_none(num), float(factor)
2117 if num is None:
2118 return None
2119 exponent = 0 if num == 0 else int(math.log(num, factor))
2120 suffix = ['', *'kMGTPEZY'][exponent]
2121 if factor == 1024:
2122 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2123 converted = num / (factor ** exponent)
2124 return fmt % (converted, suffix)
2125
2126
2127 def format_bytes(bytes):
2128 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
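
# Illustrative outputs (assumed):
#   >>> format_decimal_suffix(1234567, '%.1f%s')
#   '1.2M'
#   >>> format_bytes(1536)
#   '1.50KiB'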
2129
2130
2131 def lookup_unit_table(unit_table, s):
2132 units_re = '|'.join(re.escape(u) for u in unit_table)
2133 m = re.match(
2134 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2135 if not m:
2136 return None
2137 num_str = m.group('num').replace(',', '.')
2138 mult = unit_table[m.group('unit')]
2139 return int(float(num_str) * mult)
2140
2141
2142 def parse_filesize(s):
2143 if s is None:
2144 return None
2145
2146 # The lower-case forms are of course incorrect and unofficial,
2147 # but we support those too
2148 _UNIT_TABLE = {
2149 'B': 1,
2150 'b': 1,
2151 'bytes': 1,
2152 'KiB': 1024,
2153 'KB': 1000,
2154 'kB': 1024,
2155 'Kb': 1000,
2156 'kb': 1000,
2157 'kilobytes': 1000,
2158 'kibibytes': 1024,
2159 'MiB': 1024 ** 2,
2160 'MB': 1000 ** 2,
2161 'mB': 1024 ** 2,
2162 'Mb': 1000 ** 2,
2163 'mb': 1000 ** 2,
2164 'megabytes': 1000 ** 2,
2165 'mebibytes': 1024 ** 2,
2166 'GiB': 1024 ** 3,
2167 'GB': 1000 ** 3,
2168 'gB': 1024 ** 3,
2169 'Gb': 1000 ** 3,
2170 'gb': 1000 ** 3,
2171 'gigabytes': 1000 ** 3,
2172 'gibibytes': 1024 ** 3,
2173 'TiB': 1024 ** 4,
2174 'TB': 1000 ** 4,
2175 'tB': 1024 ** 4,
2176 'Tb': 1000 ** 4,
2177 'tb': 1000 ** 4,
2178 'terabytes': 1000 ** 4,
2179 'tebibytes': 1024 ** 4,
2180 'PiB': 1024 ** 5,
2181 'PB': 1000 ** 5,
2182 'pB': 1024 ** 5,
2183 'Pb': 1000 ** 5,
2184 'pb': 1000 ** 5,
2185 'petabytes': 1000 ** 5,
2186 'pebibytes': 1024 ** 5,
2187 'EiB': 1024 ** 6,
2188 'EB': 1000 ** 6,
2189 'eB': 1024 ** 6,
2190 'Eb': 1000 ** 6,
2191 'eb': 1000 ** 6,
2192 'exabytes': 1000 ** 6,
2193 'exbibytes': 1024 ** 6,
2194 'ZiB': 1024 ** 7,
2195 'ZB': 1000 ** 7,
2196 'zB': 1024 ** 7,
2197 'Zb': 1000 ** 7,
2198 'zb': 1000 ** 7,
2199 'zettabytes': 1000 ** 7,
2200 'zebibytes': 1024 ** 7,
2201 'YiB': 1024 ** 8,
2202 'YB': 1000 ** 8,
2203 'yB': 1024 ** 8,
2204 'Yb': 1000 ** 8,
2205 'yb': 1000 ** 8,
2206 'yottabytes': 1000 ** 8,
2207 'yobibytes': 1024 ** 8,
2208 }
2209
2210 return lookup_unit_table(_UNIT_TABLE, s)
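
# Illustrative results (assumed; note that lower-case 'kB' maps to 1024
# in the table above):
#   >>> parse_filesize('1.5 MiB')
#   1572864
#   >>> parse_filesize('500 kB')
#   512000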
2211
2212
2213 def parse_count(s):
2214 if s is None:
2215 return None
2216
2217 s = re.sub(r'^[^\d]+\s', '', s).strip()
2218
2219 if re.match(r'^[\d,.]+$', s):
2220 return str_to_int(s)
2221
2222 _UNIT_TABLE = {
2223 'k': 1000,
2224 'K': 1000,
2225 'm': 1000 ** 2,
2226 'M': 1000 ** 2,
2227 'kk': 1000 ** 2,
2228 'KK': 1000 ** 2,
2229 'b': 1000 ** 3,
2230 'B': 1000 ** 3,
2231 }
2232
2233 ret = lookup_unit_table(_UNIT_TABLE, s)
2234 if ret is not None:
2235 return ret
2236
2237 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2238 if mobj:
2239 return str_to_int(mobj.group(1))
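
# Illustrative results (assumed):
#   >>> parse_count('1.2M')
#   1200000
#   >>> parse_count('1,234 views')
#   1234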
2240
2241
2242 def parse_resolution(s):
2243 if s is None:
2244 return {}
2245
2246 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2247 if mobj:
2248 return {
2249 'width': int(mobj.group('w')),
2250 'height': int(mobj.group('h')),
2251 }
2252
2253 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2254 if mobj:
2255 return {'height': int(mobj.group(1))}
2256
2257 mobj = re.search(r'\b([48])[kK]\b', s)
2258 if mobj:
2259 return {'height': int(mobj.group(1)) * 540}
2260
2261 return {}
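
# Illustrative results (assumed):
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('720p')
#   {'height': 720}
#   >>> parse_resolution('4K')
#   {'height': 2160}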
2262
2263
2264 def parse_bitrate(s):
2265 if not isinstance(s, compat_str):
2266 return
2267 mobj = re.search(r'\b(\d+)\s*kbps', s)
2268 if mobj:
2269 return int(mobj.group(1))
2270
2271
2272 def month_by_name(name, lang='en'):
2273 """ Return the number of a month by (locale-independently) English name """
2274
2275 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2276
2277 try:
2278 return month_names.index(name) + 1
2279 except ValueError:
2280 return None
2281
2282
2283 def month_by_abbreviation(abbrev):
2284 """ Return the number of a month by (locale-independently) English
2285 abbreviation """
2286
2287 try:
2288 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2289 except ValueError:
2290 return None
2291
2292
2293 def fix_xml_ampersands(xml_str):
2294 """Replace all the '&' by '&amp;' in XML"""
2295 return re.sub(
2296 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2297 '&amp;',
2298 xml_str)
2299
2300
2301 def setproctitle(title):
2302 assert isinstance(title, compat_str)
2303
2304 # ctypes in Jython is not complete
2305 # http://bugs.jython.org/issue2148
2306 if sys.platform.startswith('java'):
2307 return
2308
2309 try:
2310 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2311 except OSError:
2312 return
2313 except TypeError:
2314 # LoadLibrary in Windows Python 2.7.13 only expects
2315 # a bytestring, but since unicode_literals turns
2316 # every string into a unicode string, it fails.
2317 return
2318 title_bytes = title.encode('utf-8')
2319 buf = ctypes.create_string_buffer(len(title_bytes))
2320 buf.value = title_bytes
2321 try:
2322 libc.prctl(15, buf, 0, 0, 0)
2323 except AttributeError:
2324 return # Strange libc, just skip this
2325
2326
2327 def remove_start(s, start):
2328 return s[len(start):] if s is not None and s.startswith(start) else s
2329
2330
2331 def remove_end(s, end):
2332 return s[:-len(end)] if s is not None and s.endswith(end) else s
2333
2334
2335 def remove_quotes(s):
2336 if s is None or len(s) < 2:
2337 return s
2338 for quote in ('"', "'", ):
2339 if s[0] == quote and s[-1] == quote:
2340 return s[1:-1]
2341 return s
2342
2343
2344 def get_domain(url):
2345 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2346 return domain.group('domain') if domain else None
2347
2348
2349 def url_basename(url):
2350 path = compat_urlparse.urlparse(url).path
2351 return path.strip('/').split('/')[-1]
2352
2353
2354 def base_url(url):
2355 return re.match(r'https?://[^?#&]+/', url).group()
2356
2357
2358 def urljoin(base, path):
2359 if isinstance(path, bytes):
2360 path = path.decode('utf-8')
2361 if not isinstance(path, compat_str) or not path:
2362 return None
2363 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2364 return path
2365 if isinstance(base, bytes):
2366 base = base.decode('utf-8')
2367 if not isinstance(base, compat_str) or not re.match(
2368 r'^(?:https?:)?//', base):
2369 return None
2370 return compat_urlparse.urljoin(base, path)
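
# Illustrative behaviour (assumed): absolute and protocol-relative paths
# are returned as-is, everything else is resolved against base:
#   >>> urljoin('https://example.com/a/', 'b/c')
#   'https://example.com/a/b/c'
#   >>> urljoin('https://example.com/a/', '//cdn.example.com/x')
#   '//cdn.example.com/x'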
2371
2372
2373 class HEADRequest(compat_urllib_request.Request):
2374 def get_method(self):
2375 return 'HEAD'
2376
2377
2378 class PUTRequest(compat_urllib_request.Request):
2379 def get_method(self):
2380 return 'PUT'
2381
2382
2383 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2384 if get_attr:
2385 if v is not None:
2386 v = getattr(v, get_attr, None)
2387 if v == '':
2388 v = None
2389 if v is None:
2390 return default
2391 try:
2392 return int(v) * invscale // scale
2393 except (ValueError, TypeError, OverflowError):
2394 return default
2395
2396
2397 def str_or_none(v, default=None):
2398 return default if v is None else compat_str(v)
2399
2400
2401 def str_to_int(int_str):
2402 """ A more relaxed version of int_or_none """
2403 if isinstance(int_str, compat_integer_types):
2404 return int_str
2405 elif isinstance(int_str, compat_str):
2406 int_str = re.sub(r'[,\.\+]', '', int_str)
2407 return int_or_none(int_str)
2408
2409
2410 def float_or_none(v, scale=1, invscale=1, default=None):
2411 if v is None:
2412 return default
2413 try:
2414 return float(v) * invscale / scale
2415 except (ValueError, TypeError):
2416 return default
2417
2418
2419 def bool_or_none(v, default=None):
2420 return v if isinstance(v, bool) else default
2421
2422
2423 def strip_or_none(v, default=None):
2424 return v.strip() if isinstance(v, compat_str) else default
2425
2426
2427 def url_or_none(url):
2428 if not url or not isinstance(url, compat_str):
2429 return None
2430 url = url.strip()
2431 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2432
2433
2434 def strftime_or_none(timestamp, date_format, default=None):
2435 datetime_object = None
2436 try:
2437 if isinstance(timestamp, compat_numeric_types): # unix timestamp
2438 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2439 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2440 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2441 return datetime_object.strftime(date_format)
2442 except (ValueError, TypeError, AttributeError):
2443 return default
2444
2445
2446 def parse_duration(s):
2447 if not isinstance(s, compat_basestring):
2448 return None
2449 s = s.strip()
2450 if not s:
2451 return None
2452
2453 days, hours, mins, secs, ms = [None] * 5
2454 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
2455 if m:
2456 days, hours, mins, secs, ms = m.groups()
2457 else:
2458 m = re.match(
2459 r'''(?ix)(?:P?
2460 (?:
2461 [0-9]+\s*y(?:ears?)?\s*
2462 )?
2463 (?:
2464 [0-9]+\s*m(?:onths?)?\s*
2465 )?
2466 (?:
2467 [0-9]+\s*w(?:eeks?)?\s*
2468 )?
2469 (?:
2470 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
2471 )?
2472 T)?
2473 (?:
2474 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
2475 )?
2476 (?:
2477 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
2478 )?
2479 (?:
2480 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2481 )?Z?$''', s)
2482 if m:
2483 days, hours, mins, secs, ms = m.groups()
2484 else:
2485 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2486 if m:
2487 hours, mins = m.groups()
2488 else:
2489 return None
2490
2491 duration = 0
2492 if secs:
2493 duration += float(secs)
2494 if mins:
2495 duration += float(mins) * 60
2496 if hours:
2497 duration += float(hours) * 60 * 60
2498 if days:
2499 duration += float(days) * 24 * 60 * 60
2500 if ms:
2501 duration += float(ms)
2502 return duration
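
# Illustrative results (assumed):
#   >>> parse_duration('1:02:03')
#   3723.0
#   >>> parse_duration('2h 30min')
#   9000.0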
2503
2504
2505 def prepend_extension(filename, ext, expected_real_ext=None):
2506 name, real_ext = os.path.splitext(filename)
2507 return (
2508 '{0}.{1}{2}'.format(name, ext, real_ext)
2509 if not expected_real_ext or real_ext[1:] == expected_real_ext
2510 else '{0}.{1}'.format(filename, ext))
2511
2512
2513 def replace_extension(filename, ext, expected_real_ext=None):
2514 name, real_ext = os.path.splitext(filename)
2515 return '{0}.{1}'.format(
2516 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2517 ext)
2518
2519
2520 def check_executable(exe, args=[]):
2521 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2522 args can be a list of arguments for a short output (like -version) """
2523 try:
2524 Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
2525 except OSError:
2526 return False
2527 return exe
2528
2529
2530 def _get_exe_version_output(exe, args):
2531 try:
2532 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2533 # SIGTTOU if yt-dlp is run in the background.
2534 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2535 out, _ = Popen(
2536 [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2537 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
2538 except OSError:
2539 return False
2540 if isinstance(out, bytes): # Python 2.x
2541 out = out.decode('ascii', 'ignore')
2542 return out
2543
2544
2545 def detect_exe_version(output, version_re=None, unrecognized='present'):
2546 assert isinstance(output, compat_str)
2547 if version_re is None:
2548 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2549 m = re.search(version_re, output)
2550 if m:
2551 return m.group(1)
2552 else:
2553 return unrecognized
2554
2555
2556 def get_exe_version(exe, args=['--version'],
2557 version_re=None, unrecognized='present'):
2558 """ Returns the version of the specified executable,
2559 or False if the executable is not present """
2560 out = _get_exe_version_output(exe, args)
2561 return detect_exe_version(out, version_re, unrecognized) if out else False
2562
2563
2564 class LazyList(collections.abc.Sequence):
2565 ''' Lazy immutable list from an iterable
2566 Note that slices of a LazyList are lists and not LazyList'''
2567
2568 class IndexError(IndexError):
2569 pass
2570
2571 def __init__(self, iterable, *, reverse=False, _cache=None):
2572 self.__iterable = iter(iterable)
2573 self.__cache = [] if _cache is None else _cache
2574 self.__reversed = reverse
2575
2576 def __iter__(self):
2577 if self.__reversed:
2578 # We need to consume the entire iterable to iterate in reverse
2579 yield from self.exhaust()
2580 return
2581 yield from self.__cache
2582 for item in self.__iterable:
2583 self.__cache.append(item)
2584 yield item
2585
2586 def __exhaust(self):
2587 self.__cache.extend(self.__iterable)
2588 # Discard the emptied iterable to make it pickle-able
2589 self.__iterable = []
2590 return self.__cache
2591
2592 def exhaust(self):
2593 ''' Evaluate the entire iterable '''
2594 return self.__exhaust()[::-1 if self.__reversed else 1]
2595
2596 @staticmethod
2597 def __reverse_index(x):
2598 return None if x is None else -(x + 1)
2599
2600 def __getitem__(self, idx):
2601 if isinstance(idx, slice):
2602 if self.__reversed:
2603 idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
2604 start, stop, step = idx.start, idx.stop, idx.step or 1
2605 elif isinstance(idx, int):
2606 if self.__reversed:
2607 idx = self.__reverse_index(idx)
2608 start, stop, step = idx, idx, 0
2609 else:
2610 raise TypeError('indices must be integers or slices')
2611 if ((start or 0) < 0 or (stop or 0) < 0
2612 or (start is None and step < 0)
2613 or (stop is None and step > 0)):
2614 # We need to consume the entire iterable to be able to slice from the end
2615 # Obviously, never use this with infinite iterables
2616 self.__exhaust()
2617 try:
2618 return self.__cache[idx]
2619 except IndexError as e:
2620 raise self.IndexError(e) from e
2621 n = max(start or 0, stop or 0) - len(self.__cache) + 1
2622 if n > 0:
2623 self.__cache.extend(itertools.islice(self.__iterable, n))
2624 try:
2625 return self.__cache[idx]
2626 except IndexError as e:
2627 raise self.IndexError(e) from e
2628
2629 def __bool__(self):
2630 try:
2631 self[-1] if self.__reversed else self[0]
2632 except self.IndexError:
2633 return False
2634 return True
2635
2636 def __len__(self):
2637 self.__exhaust()
2638 return len(self.__cache)
2639
2640 def __reversed__(self):
2641 return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)
2642
2643 def __copy__(self):
2644 return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)
2645
2646 def __repr__(self):
2647 # repr and str should mimic a list. So we exhaust the iterable
2648 return repr(self.exhaust())
2649
2650 def __str__(self):
2651 return repr(self.exhaust())
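
# Illustrative usage (assumed): items are pulled from the iterable only as
# far as the requested index; negative indices force full evaluation:
#   >>> lst = LazyList(range(10))
#   >>> lst[3]
#   3
#   >>> lst[-2]
#   8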
2652
2653
2654 class PagedList:
2655
2656 class IndexError(IndexError):
2657 pass
2658
2659 def __len__(self):
2660 # This is only useful for tests
2661 return len(self.getslice())
2662
2663 def __init__(self, pagefunc, pagesize, use_cache=True):
2664 self._pagefunc = pagefunc
2665 self._pagesize = pagesize
2666 self._use_cache = use_cache
2667 self._cache = {}
2668
2669 def getpage(self, pagenum):
2670 page_results = self._cache.get(pagenum)
2671 if page_results is None:
2672 page_results = list(self._pagefunc(pagenum))
2673 if self._use_cache:
2674 self._cache[pagenum] = page_results
2675 return page_results
2676
2677 def getslice(self, start=0, end=None):
2678 return list(self._getslice(start, end))
2679
2680 def _getslice(self, start, end):
2681 raise NotImplementedError('This method must be implemented by subclasses')
2682
2683 def __getitem__(self, idx):
2684 # NOTE: cache must be enabled if this is used
2685 if not isinstance(idx, int) or idx < 0:
2686 raise TypeError('indices must be non-negative integers')
2687 entries = self.getslice(idx, idx + 1)
2688 if not entries:
2689 raise self.IndexError()
2690 return entries[0]
2691
2692
2693 class OnDemandPagedList(PagedList):
2694 def _getslice(self, start, end):
2695 for pagenum in itertools.count(start // self._pagesize):
2696 firstid = pagenum * self._pagesize
2697 nextfirstid = pagenum * self._pagesize + self._pagesize
2698 if start >= nextfirstid:
2699 continue
2700
2701 startv = (
2702 start % self._pagesize
2703 if firstid <= start < nextfirstid
2704 else 0)
2705 endv = (
2706 ((end - 1) % self._pagesize) + 1
2707 if (end is not None and firstid <= end <= nextfirstid)
2708 else None)
2709
2710 page_results = self.getpage(pagenum)
2711 if startv != 0 or endv is not None:
2712 page_results = page_results[startv:endv]
2713 yield from page_results
2714
2715 # A little optimization - if the current page is not "full", i.e. does
2716 # not contain page_size videos, then we can assume that this page
2717 # is the last one - there are no more IDs on further pages -
2718 # so there is no need to query again.
2719 if len(page_results) + startv < self._pagesize:
2720 break
2721
2722 # If we got the whole page, but the next page is not interesting,
2723 # break out early as well
2724 if end == nextfirstid:
2725 break
2726
2727
2728 class InAdvancePagedList(PagedList):
2729 def __init__(self, pagefunc, pagecount, pagesize):
2730 self._pagecount = pagecount
2731 PagedList.__init__(self, pagefunc, pagesize, True)
2732
2733 def _getslice(self, start, end):
2734 start_page = start // self._pagesize
2735 end_page = (
2736 self._pagecount if end is None else (end // self._pagesize + 1))
2737 skip_elems = start - start_page * self._pagesize
2738 only_more = None if end is None else end - start
2739 for pagenum in range(start_page, end_page):
2740 page_results = self.getpage(pagenum)
2741 if skip_elems:
2742 page_results = page_results[skip_elems:]
2743 skip_elems = None
2744 if only_more is not None:
2745 if len(page_results) < only_more:
2746 only_more -= len(page_results)
2747 else:
2748 yield from page_results[:only_more]
2749 break
2750 yield from page_results
2751
2752
2753 def uppercase_escape(s):
2754 unicode_escape = codecs.getdecoder('unicode_escape')
2755 return re.sub(
2756 r'\\U[0-9a-fA-F]{8}',
2757 lambda m: unicode_escape(m.group(0))[0],
2758 s)
2759
2760
2761 def lowercase_escape(s):
2762 unicode_escape = codecs.getdecoder('unicode_escape')
2763 return re.sub(
2764 r'\\u[0-9a-fA-F]{4}',
2765 lambda m: unicode_escape(m.group(0))[0],
2766 s)
2767
2768
2769 def escape_rfc3986(s):
2770 """Escape non-ASCII characters as suggested by RFC 3986"""
2771 if sys.version_info < (3, 0) and isinstance(s, compat_str):
2772 s = s.encode('utf-8')
2773 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2774
2775
2776 def escape_url(url):
2777 """Escape URL as suggested by RFC 3986"""
2778 url_parsed = compat_urllib_parse_urlparse(url)
2779 return url_parsed._replace(
2780 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2781 path=escape_rfc3986(url_parsed.path),
2782 params=escape_rfc3986(url_parsed.params),
2783 query=escape_rfc3986(url_parsed.query),
2784 fragment=escape_rfc3986(url_parsed.fragment)
2785 ).geturl()
2786
2787
2788 def parse_qs(url):
2789 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2790
2791
2792 def read_batch_urls(batch_fd):
2793 def fixup(url):
2794 if not isinstance(url, compat_str):
2795 url = url.decode('utf-8', 'replace')
2796 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2797 for bom in BOM_UTF8:
2798 if url.startswith(bom):
2799 url = url[len(bom):]
2800 url = url.lstrip()
2801 if not url or url.startswith(('#', ';', ']')):
2802 return False
2803 # "#" cannot be stripped out since it is part of the URI
2804 # However, it can be safely stripped out if it follows a whitespace
2805 return re.split(r'\s#', url, 1)[0].rstrip()
2806
2807 with contextlib.closing(batch_fd) as fd:
2808 return [url for url in map(fixup, fd) if url]
2809
2810
2811 def urlencode_postdata(*args, **kargs):
2812 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2813
2814
2815 def update_url_query(url, query):
2816 if not query:
2817 return url
2818 parsed_url = compat_urlparse.urlparse(url)
2819 qs = compat_parse_qs(parsed_url.query)
2820 qs.update(query)
2821 return compat_urlparse.urlunparse(parsed_url._replace(
2822 query=compat_urllib_parse_urlencode(qs, True)))
2823
2824
2825 def update_Request(req, url=None, data=None, headers={}, query={}):
2826 req_headers = req.headers.copy()
2827 req_headers.update(headers)
2828 req_data = data or req.data
2829 req_url = update_url_query(url or req.get_full_url(), query)
2830 req_get_method = req.get_method()
2831 if req_get_method == 'HEAD':
2832 req_type = HEADRequest
2833 elif req_get_method == 'PUT':
2834 req_type = PUTRequest
2835 else:
2836 req_type = compat_urllib_request.Request
2837 new_req = req_type(
2838 req_url, data=req_data, headers=req_headers,
2839 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2840 if hasattr(req, 'timeout'):
2841 new_req.timeout = req.timeout
2842 return new_req
2843
2844
2845 def _multipart_encode_impl(data, boundary):
2846 content_type = 'multipart/form-data; boundary=%s' % boundary
2847
2848 out = b''
2849 for k, v in data.items():
2850 out += b'--' + boundary.encode('ascii') + b'\r\n'
2851 if isinstance(k, compat_str):
2852 k = k.encode('utf-8')
2853 if isinstance(v, compat_str):
2854 v = v.encode('utf-8')
2855 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2856 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2857 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2858 if boundary.encode('ascii') in content:
2859 raise ValueError('Boundary overlaps with data')
2860 out += content
2861
2862 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2863
2864 return out, content_type
2865
2866
2867 def multipart_encode(data, boundary=None):
2868 '''
2869 Encode a dict to RFC 7578-compliant form-data
2870
2871 data:
2872 A dict where keys and values can be either Unicode or bytes-like
2873 objects.
2874 boundary:
2875 If specified, it must be a Unicode object to be used as the boundary.
2876 Otherwise a random boundary is generated.
2877
2878 Reference: https://tools.ietf.org/html/rfc7578
2879 '''
2880 has_specified_boundary = boundary is not None
2881
2882 while True:
2883 if boundary is None:
2884 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2885
2886 try:
2887 out, content_type = _multipart_encode_impl(data, boundary)
2888 break
2889 except ValueError:
2890 if has_specified_boundary:
2891 raise
2892 boundary = None
2893
2894 return out, content_type
2895
2896
2897 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
2898 if isinstance(key_or_keys, (list, tuple)):
2899 for key in key_or_keys:
2900 if key not in d or d[key] is None or skip_false_values and not d[key]:
2901 continue
2902 return d[key]
2903 return default
2904 return d.get(key_or_keys, default)
2905
2906
2907 def try_get(src, getter, expected_type=None):
2908 for get in variadic(getter):
2909 try:
2910 v = get(src)
2911 except (AttributeError, KeyError, TypeError, IndexError):
2912 pass
2913 else:
2914 if expected_type is None or isinstance(v, expected_type):
2915 return v
2916
2917
2918 def merge_dicts(*dicts):
2919 merged = {}
2920 for a_dict in dicts:
2921 for k, v in a_dict.items():
2922 if v is None:
2923 continue
2924 if (k not in merged
2925 or (isinstance(v, compat_str) and v
2926 and isinstance(merged[k], compat_str)
2927 and not merged[k])):
2928 merged[k] = v
2929 return merged
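
# The first non-None value wins, except that a non-empty string may replace
# an empty one (assumed):
#   >>> merge_dicts({'a': None, 'b': ''}, {'a': 1, 'b': 'x'})
#   {'b': 'x', 'a': 1}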
2930
2931
2932 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2933 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2934
2935
2936 US_RATINGS = {
2937 'G': 0,
2938 'PG': 10,
2939 'PG-13': 13,
2940 'R': 16,
2941 'NC': 18,
2942 }
2943
2944
2945 TV_PARENTAL_GUIDELINES = {
2946 'TV-Y': 0,
2947 'TV-Y7': 7,
2948 'TV-G': 0,
2949 'TV-PG': 0,
2950 'TV-14': 14,
2951 'TV-MA': 17,
2952 }
2953
2954
2955 def parse_age_limit(s):
2956 if type(s) == int:
2957 return s if 0 <= s <= 21 else None
2958 if not isinstance(s, compat_basestring):
2959 return None
2960 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
2961 if m:
2962 return int(m.group('age'))
2963 s = s.upper()
2964 if s in US_RATINGS:
2965 return US_RATINGS[s]
2966 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
2967 if m:
2968 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
2969 return None
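
# Illustrative results (assumed):
#   >>> parse_age_limit('18+')
#   18
#   >>> parse_age_limit('PG-13')
#   13
#   >>> parse_age_limit('TV-MA')
#   17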
2970
2971
2972 def strip_jsonp(code):
2973 return re.sub(
2974 r'''(?sx)^
2975 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
2976 (?:\s*&&\s*(?P=func_name))?
2977 \s*\(\s*(?P<callback_data>.*)\);?
2978 \s*?(?://[^\n]*)*$''',
2979 r'\g<callback_data>', code)
2980
2981
2982 def js_to_json(code, vars={}):
2983 # vars is a dict of var, val pairs to substitute
2984 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
2985 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
2986 INTEGER_TABLE = (
2987 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
2988 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
2989 )
2990
2991 def fix_kv(m):
2992 v = m.group(0)
2993 if v in ('true', 'false', 'null'):
2994 return v
2995 elif v in ('undefined', 'void 0'):
2996 return 'null'
2997 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
2998 return ""
2999
3000 if v[0] in ("'", '"'):
3001 v = re.sub(r'(?s)\\.|"', lambda m: {
3002 '"': '\\"',
3003 "\\'": "'",
3004 '\\\n': '',
3005 '\\x': '\\u00',
3006 }.get(m.group(0), m.group(0)), v[1:-1])
3007 else:
3008 for regex, base in INTEGER_TABLE:
3009 im = re.match(regex, v)
3010 if im:
3011 i = int(im.group(1), base)
3012 return '"%d":' % i if v.endswith(':') else '%d' % i
3013
3014 if v in vars:
3015 return vars[v]
3016
3017 return '"%s"' % v
3018
3019 return re.sub(r'''(?sx)
3020 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3021 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3022 {comment}|,(?={skip}[\]}}])|
3023 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3024 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3025 [0-9]+(?={skip}:)|
3026 !+
3027 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
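
# Illustrative conversion (assumed):
#   >>> js_to_json("{foo: 'bar', baz: 0x10, qux: undefined}")
#   '{"foo": "bar", "baz": 16, "qux": null}'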
3028
3029
3030 def qualities(quality_ids):
3031 """ Get a numeric quality value out of a list of possible values """
3032 def q(qid):
3033 try:
3034 return quality_ids.index(qid)
3035 except ValueError:
3036 return -1
3037 return q
3038
3039
3040 POSTPROCESS_WHEN = {'pre_process', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
3041
3042
3043 DEFAULT_OUTTMPL = {
3044 'default': '%(title)s [%(id)s].%(ext)s',
3045 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3046 }
3047 OUTTMPL_TYPES = {
3048 'chapter': None,
3049 'subtitle': None,
3050 'thumbnail': None,
3051 'description': 'description',
3052 'annotation': 'annotations.xml',
3053 'infojson': 'info.json',
3054 'link': None,
3055 'pl_thumbnail': None,
3056 'pl_description': 'description',
3057 'pl_infojson': 'info.json',
3058 }
3059
3060 # As of [1] format syntax is:
3061 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3062 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3063 STR_FORMAT_RE_TMPL = r'''(?x)
3064 (?<!%)(?P<prefix>(?:%%)*)
3065 %
3066 (?P<has_key>\((?P<key>{0})\))?
3067 (?P<format>
3068 (?P<conversion>[#0\-+ ]+)?
3069 (?P<min_width>\d+)?
3070 (?P<precision>\.\d+)?
3071 (?P<len_mod>[hlL])? # unused in python
3072 {1} # conversion type
3073 )
3074 '''
3075
3076
3077 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3078
3079
3080 def limit_length(s, length):
3081 """ Add ellipses to overly long strings """
3082 if s is None:
3083 return None
3084 ELLIPSES = '...'
3085 if len(s) > length:
3086 return s[:length - len(ELLIPSES)] + ELLIPSES
3087 return s
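
# Illustrative (assumed):
#   >>> limit_length('foobarbaz', 8)
#   'fooba...'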
3088
3089
3090 def version_tuple(v):
3091 return tuple(int(e) for e in re.split(r'[-.]', v))
3092
3093
3094 def is_outdated_version(version, limit, assume_new=True):
3095 if not version:
3096 return not assume_new
3097 try:
3098 return version_tuple(version) < version_tuple(limit)
3099 except ValueError:
3100 return not assume_new
3101
3102
3103 def ytdl_is_updateable():
3104 """ Returns if yt-dlp can be updated with -U """
3105
3106 from .update import is_non_updateable
3107
3108 return not is_non_updateable()
3109
3110
3111 def args_to_str(args):
3112 # Get a short string representation for a subprocess command
3113 return ' '.join(compat_shlex_quote(a) for a in args)
3114
3115
3116 def error_to_compat_str(err):
3117 err_str = str(err)
3118 # On Python 2, an error byte string must be decoded with the proper
3119 # encoding rather than ASCII
3120 if sys.version_info[0] < 3:
3121 err_str = err_str.decode(preferredencoding())
3122 return err_str
3123
3124
3125 def mimetype2ext(mt):
3126 if mt is None:
3127 return None
3128
3129 mt, _, params = mt.partition(';')
3130 mt = mt.strip()
3131
3132 FULL_MAP = {
3133 'audio/mp4': 'm4a',
3134 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3 as
3135 # it's the most popular one
3136 'audio/mpeg': 'mp3',
3137 'audio/x-wav': 'wav',
3138 'audio/wav': 'wav',
3139 'audio/wave': 'wav',
3140 }
3141
3142 ext = FULL_MAP.get(mt)
3143 if ext is not None:
3144 return ext
3145
3146 SUBTYPE_MAP = {
3147 '3gpp': '3gp',
3148 'smptett+xml': 'tt',
3149 'ttaf+xml': 'dfxp',
3150 'ttml+xml': 'ttml',
3151 'x-flv': 'flv',
3152 'x-mp4-fragmented': 'mp4',
3153 'x-ms-sami': 'sami',
3154 'x-ms-wmv': 'wmv',
3155 'mpegurl': 'm3u8',
3156 'x-mpegurl': 'm3u8',
3157 'vnd.apple.mpegurl': 'm3u8',
3158 'dash+xml': 'mpd',
3159 'f4m+xml': 'f4m',
3160 'hds+xml': 'f4m',
3161 'vnd.ms-sstr+xml': 'ism',
3162 'quicktime': 'mov',
3163 'mp2t': 'ts',
3164 'x-wav': 'wav',
3165 'filmstrip+json': 'fs',
3166 'svg+xml': 'svg',
3167 }
3168
3169 _, _, subtype = mt.rpartition('/')
3170 ext = SUBTYPE_MAP.get(subtype.lower())
3171 if ext is not None:
3172 return ext
3173
3174 SUFFIX_MAP = {
3175 'json': 'json',
3176 'xml': 'xml',
3177 'zip': 'zip',
3178 'gzip': 'gz',
3179 }
3180
3181 _, _, suffix = subtype.partition('+')
3182 ext = SUFFIX_MAP.get(suffix)
3183 if ext is not None:
3184 return ext
3185
3186 return subtype.replace('+', '.')
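
# Lookup order is full type, then subtype, then '+suffix' (assumed results):
#   >>> mimetype2ext('audio/mpeg')
#   'mp3'
#   >>> mimetype2ext('application/dash+xml')
#   'mpd'
#   >>> mimetype2ext('application/ld+json')
#   'json'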
3187
3188
3189 def ext2mimetype(ext_or_url):
3190 if not ext_or_url:
3191 return None
3192 if '.' not in ext_or_url:
3193 ext_or_url = f'file.{ext_or_url}'
3194 return mimetypes.guess_type(ext_or_url)[0]
3195
3196
3197 def parse_codecs(codecs_str):
3198 # http://tools.ietf.org/html/rfc6381
3199 if not codecs_str:
3200 return {}
3201 split_codecs = list(filter(None, map(
3202 str.strip, codecs_str.strip().strip(',').split(','))))
3203 vcodec, acodec, tcodec, hdr = None, None, None, None
3204 for full_codec in split_codecs:
3205 parts = full_codec.split('.')
3206 codec = parts[0].replace('0', '')
3207 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3208 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3209 if not vcodec:
3210 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3211 if codec in ('dvh1', 'dvhe'):
3212 hdr = 'DV'
3213 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3214 hdr = 'HDR10'
3215 elif full_codec.replace('0', '').startswith('vp9.2'):
3216 hdr = 'HDR10'
3217 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3218 if not acodec:
3219 acodec = full_codec
3220 elif codec in ('stpp', 'wvtt',):
3221 if not tcodec:
3222 tcodec = full_codec
3223 else:
3224 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
3225 if vcodec or acodec or tcodec:
3226 return {
3227 'vcodec': vcodec or 'none',
3228 'acodec': acodec or 'none',
3229 'dynamic_range': hdr,
3230 **({'tcodec': tcodec} if tcodec is not None else {}),
3231 }
3232 elif len(split_codecs) == 2:
3233 return {
3234 'vcodec': split_codecs[0],
3235 'acodec': split_codecs[1],
3236 }
3237 return {}
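
# Illustrative result for a typical RFC 6381 codecs string (assumed):
#   >>> parse_codecs('avc1.64001f, mp4a.40.2')
#   {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}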
3238
3239
3240 def urlhandle_detect_ext(url_handle):
3241 getheader = url_handle.headers.get
3242
3243 cd = getheader('Content-Disposition')
3244 if cd:
3245 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3246 if m:
3247 e = determine_ext(m.group('filename'), default_ext=None)
3248 if e:
3249 return e
3250
3251 return mimetype2ext(getheader('Content-Type'))
3252
3253
3254 def encode_data_uri(data, mime_type):
3255 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3256
3257
3258 def age_restricted(content_limit, age_limit):
3259 """ Returns True iff the content should be blocked """
3260
3261 if age_limit is None: # No limit set
3262 return False
3263 if content_limit is None:
3264 return False # Content available for everyone
3265 return age_limit < content_limit
3266
3267
3268 def is_html(first_bytes):
3269 """ Detect whether a file contains HTML by examining its first bytes. """
3270
3271 BOMS = [
3272 (b'\xef\xbb\xbf', 'utf-8'),
3273 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3274 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3275 (b'\xff\xfe', 'utf-16-le'),
3276 (b'\xfe\xff', 'utf-16-be'),
3277 ]
3278 for bom, enc in BOMS:
3279 if first_bytes.startswith(bom):
3280 s = first_bytes[len(bom):].decode(enc, 'replace')
3281 break
3282 else:
3283 s = first_bytes.decode('utf-8', 'replace')
3284
3285 return re.match(r'^\s*<', s)
3286
3287
3288 def determine_protocol(info_dict):
3289 protocol = info_dict.get('protocol')
3290 if protocol is not None:
3291 return protocol
3292
3293 url = sanitize_url(info_dict['url'])
3294 if url.startswith('rtmp'):
3295 return 'rtmp'
3296 elif url.startswith('mms'):
3297 return 'mms'
3298 elif url.startswith('rtsp'):
3299 return 'rtsp'
3300
3301 ext = determine_ext(url)
3302 if ext == 'm3u8':
3303 return 'm3u8'
3304 elif ext == 'f4m':
3305 return 'f4m'
3306
3307 return compat_urllib_parse_urlparse(url).scheme
3308
3309
3310 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3311 """ Render a list of rows, each as a list of values.
3312 Text after a \t will be right aligned """
3313 def width(string):
3314 return len(remove_terminal_sequences(string).replace('\t', ''))
3315
3316 def get_max_lens(table):
3317 return [max(width(str(v)) for v in col) for col in zip(*table)]
3318
3319 def filter_using_list(row, filterArray):
3320 return [col for (take, col) in zip(filterArray, row) if take]
3321
3322 if hide_empty:
3323 max_lens = get_max_lens(data)
3324 header_row = filter_using_list(header_row, max_lens)
3325 data = [filter_using_list(row, max_lens) for row in data]
3326
3327 table = [header_row] + data
3328 max_lens = get_max_lens(table)
3329 extra_gap += 1
3330 if delim:
3331 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3332 table[1][-1] = table[1][-1][:-extra_gap] # Remove extra_gap from end of delimiter
3333 for row in table:
3334 for pos, text in enumerate(map(str, row)):
3335 if '\t' in text:
3336 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3337 else:
3338 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3339 ret = '\n'.join(''.join(row).rstrip() for row in table)
3340 return ret
3341
3342
3343 def _match_one(filter_part, dct, incomplete):
3344 # TODO: Generalize code with YoutubeDL._build_format_filter
3345 STRING_OPERATORS = {
3346 '*=': operator.contains,
3347 '^=': lambda attr, value: attr.startswith(value),
3348 '$=': lambda attr, value: attr.endswith(value),
3349 '~=': lambda attr, value: re.search(value, attr),
3350 }
3351 COMPARISON_OPERATORS = {
3352 **STRING_OPERATORS,
3353 '<=': operator.le, # "<=" must be defined above "<"
3354 '<': operator.lt,
3355 '>=': operator.ge,
3356 '>': operator.gt,
3357 '=': operator.eq,
3358 }
3359
3360 operator_rex = re.compile(r'''(?x)\s*
3361 (?P<key>[a-z_]+)
3362 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3363 (?:
3364 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3365 (?P<strval>.+?)
3366 )
3367 \s*$
3368 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3369 m = operator_rex.search(filter_part)
3370 if m:
3371 m = m.groupdict()
3372 unnegated_op = COMPARISON_OPERATORS[m['op']]
3373 if m['negation']:
3374 op = lambda attr, value: not unnegated_op(attr, value)
3375 else:
3376 op = unnegated_op
3377 comparison_value = m['quotedstrval'] or m['strval']  # the regex defines no 'intval' group
3378 if m['quote']:
3379 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3380 actual_value = dct.get(m['key'])
3381 numeric_comparison = None
3382 if isinstance(actual_value, compat_numeric_types):
3383 # If the original field is a string and the matching comparison value is
3384 # a number, we should respect the origin of the original field
3385 # and process the comparison value as a string (see
3386 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3387 try:
3388 numeric_comparison = int(comparison_value)
3389 except ValueError:
3390 numeric_comparison = parse_filesize(comparison_value)
3391 if numeric_comparison is None:
3392 numeric_comparison = parse_filesize(f'{comparison_value}B')
3393 if numeric_comparison is None:
3394 numeric_comparison = parse_duration(comparison_value)
3395 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3396 raise ValueError('Operator %s only supports string values!' % m['op'])
3397 if actual_value is None:
3398 return incomplete or m['none_inclusive']
3399 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3400
3401 UNARY_OPERATORS = {
3402 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3403 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3404 }
3405 operator_rex = re.compile(r'''(?x)\s*
3406 (?P<op>%s)\s*(?P<key>[a-z_]+)
3407 \s*$
3408 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3409 m = operator_rex.search(filter_part)
3410 if m:
3411 op = UNARY_OPERATORS[m.group('op')]
3412 actual_value = dct.get(m.group('key'))
3413 if incomplete and actual_value is None:
3414 return True
3415 return op(actual_value)
3416
3417 raise ValueError('Invalid filter part %r' % filter_part)
3418
3419
3420 def match_str(filter_str, dct, incomplete=False):
3421 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
3422 When incomplete, all conditions passes on missing fields
3423 """
3424 return all(
3425 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3426 for filter_part in re.split(r'(?<!\\)&', filter_str))
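
# Illustrative filters (assumed); '&' separates conditions that must all hold:
#   >>> match_str('duration > 60 & description', {'duration': 100, 'description': 'x'})
#   True
#   >>> match_str('!is_live', {'is_live': True})
#   False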
3427
3428
3429 def match_filter_func(filter_str):
3430 def _match_func(info_dict, *args, **kwargs):
3431 if match_str(filter_str, info_dict, *args, **kwargs):
3432 return None
3433 else:
3434 video_title = info_dict.get('title', info_dict.get('id', 'video'))
3435 return '%s does not pass filter %s, skipping ...' % (video_title, filter_str)
3436 return _match_func
3437
3438
3439 def parse_dfxp_time_expr(time_expr):
3440 if not time_expr:
3441 return
3442
3443 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3444 if mobj:
3445 return float(mobj.group('time_offset'))
3446
3447 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3448 if mobj:
3449 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3450
3451
3452 def srt_subtitles_timecode(seconds):
3453 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3454
3455
3456 def ass_subtitles_timecode(seconds):
3457 time = timetuple_from_msec(seconds * 1000)
3458 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3459
3460
3461 def dfxp2srt(dfxp_data):
3462 '''
3463 @param dfxp_data A bytes-like object containing DFXP data
3464 @returns A unicode object containing converted SRT data
3465 '''
3466 LEGACY_NAMESPACES = (
3467 (b'http://www.w3.org/ns/ttml', [
3468 b'http://www.w3.org/2004/11/ttaf1',
3469 b'http://www.w3.org/2006/04/ttaf1',
3470 b'http://www.w3.org/2006/10/ttaf1',
3471 ]),
3472 (b'http://www.w3.org/ns/ttml#styling', [
3473 b'http://www.w3.org/ns/ttml#style',
3474 ]),
3475 )
3476
3477 SUPPORTED_STYLING = [
3478 'color',
3479 'fontFamily',
3480 'fontSize',
3481 'fontStyle',
3482 'fontWeight',
3483 'textDecoration'
3484 ]
3485
3486 _x = functools.partial(xpath_with_ns, ns_map={
3487 'xml': 'http://www.w3.org/XML/1998/namespace',
3488 'ttml': 'http://www.w3.org/ns/ttml',
3489 'tts': 'http://www.w3.org/ns/ttml#styling',
3490 })
3491
3492 styles = {}
3493 default_style = {}
3494
3495 class TTMLPElementParser(object):
3496 _out = ''
3497 _unclosed_elements = []
3498 _applied_styles = []
3499
3500 def start(self, tag, attrib):
3501 if tag in (_x('ttml:br'), 'br'):
3502 self._out += '\n'
3503 else:
3504 unclosed_elements = []
3505 style = {}
3506 element_style_id = attrib.get('style')
3507 if default_style:
3508 style.update(default_style)
3509 if element_style_id:
3510 style.update(styles.get(element_style_id, {}))
3511 for prop in SUPPORTED_STYLING:
3512 prop_val = attrib.get(_x('tts:' + prop))
3513 if prop_val:
3514 style[prop] = prop_val
3515 if style:
3516 font = ''
3517 for k, v in sorted(style.items()):
3518 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3519 continue
3520 if k == 'color':
3521 font += ' color="%s"' % v
3522 elif k == 'fontSize':
3523 font += ' size="%s"' % v
3524 elif k == 'fontFamily':
3525 font += ' face="%s"' % v
3526 elif k == 'fontWeight' and v == 'bold':
3527 self._out += '<b>'
3528 unclosed_elements.append('b')
3529 elif k == 'fontStyle' and v == 'italic':
3530 self._out += '<i>'
3531 unclosed_elements.append('i')
3532 elif k == 'textDecoration' and v == 'underline':
3533 self._out += '<u>'
3534 unclosed_elements.append('u')
3535 if font:
3536 self._out += '<font' + font + '>'
3537 unclosed_elements.append('font')
3538 applied_style = {}
3539 if self._applied_styles:
3540 applied_style.update(self._applied_styles[-1])
3541 applied_style.update(style)
3542 self._applied_styles.append(applied_style)
3543 self._unclosed_elements.append(unclosed_elements)
3544
3545 def end(self, tag):
3546 if tag not in (_x('ttml:br'), 'br'):
3547 unclosed_elements = self._unclosed_elements.pop()
3548 for element in reversed(unclosed_elements):
3549 self._out += '</%s>' % element
3550 if unclosed_elements and self._applied_styles:
3551 self._applied_styles.pop()
3552
3553 def data(self, data):
3554 self._out += data
3555
3556 def close(self):
3557 return self._out.strip()
3558
3559 def parse_node(node):
3560 target = TTMLPElementParser()
3561 parser = xml.etree.ElementTree.XMLParser(target=target)
3562 parser.feed(xml.etree.ElementTree.tostring(node))
3563 return parser.close()
3564
3565 for k, v in LEGACY_NAMESPACES:
3566 for ns in v:
3567 dfxp_data = dfxp_data.replace(ns, k)
3568
3569 dfxp = compat_etree_fromstring(dfxp_data)
3570 out = []
3571 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3572
3573 if not paras:
3574 raise ValueError('Invalid dfxp/TTML subtitle')
3575
3576 repeat = False
3577 while True:
3578 for style in dfxp.findall(_x('.//ttml:style')):
3579 style_id = style.get('id') or style.get(_x('xml:id'))
3580 if not style_id:
3581 continue
3582 parent_style_id = style.get('style')
3583 if parent_style_id:
3584 if parent_style_id not in styles:
3585 repeat = True
3586 continue
3587 styles[style_id] = styles[parent_style_id].copy()
3588 for prop in SUPPORTED_STYLING:
3589 prop_val = style.get(_x('tts:' + prop))
3590 if prop_val:
3591 styles.setdefault(style_id, {})[prop] = prop_val
3592 if repeat:
3593 repeat = False
3594 else:
3595 break
3596
3597 for p in ('body', 'div'):
3598 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3599 if ele is None:
3600 continue
3601 style = styles.get(ele.get('style'))
3602 if not style:
3603 continue
3604 default_style.update(style)
3605
3606 for para, index in zip(paras, itertools.count(1)):
3607 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3608 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3609 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3610 if begin_time is None:
3611 continue
3612 if not end_time:
3613 if not dur:
3614 continue
3615 end_time = begin_time + dur
3616 out.append('%d\n%s --> %s\n%s\n\n' % (
3617 index,
3618 srt_subtitles_timecode(begin_time),
3619 srt_subtitles_timecode(end_time),
3620 parse_node(para)))
3621
3622 return ''.join(out)
3623
3624
3625 def cli_option(params, command_option, param):
3626 param = params.get(param)
3627 if param:
3628 param = compat_str(param)
3629 return [command_option, param] if param is not None else []
3630
3631
3632 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3633 param = params.get(param)
3634 if param is None:
3635 return []
3636 assert isinstance(param, bool)
3637 if separator:
3638 return [command_option + separator + (true_value if param else false_value)]
3639 return [command_option, true_value if param else false_value]
3640
3641
3642 def cli_valueless_option(params, command_option, param, expected_value=True):
3643 param = params.get(param)
3644 return [command_option] if param == expected_value else []
3645
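# Editor's note, an illustrative sketch of the three cli_* helpers above
# (the params keys and option names here are hypothetical):
#
#     >>> cli_option({'proxy': 'socks5://127.0.0.1:1080'}, '--proxy', 'proxy')
#     ['--proxy', 'socks5://127.0.0.1:1080']
#     >>> cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#     ['--no-check-certificate', 'true']
#     >>> cli_bool_option({'nocheckcertificate': False}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '=')
#     ['--check-certificate=true']
#     >>> cli_valueless_option({'quiet': True}, '--silent', 'quiet')
#     ['--silent']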
3646
3647 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3648 if isinstance(argdict, (list, tuple)): # for backward compatibility
3649 if use_compat:
3650 return argdict
3651 else:
3652 argdict = None
3653 if argdict is None:
3654 return default
3655 assert isinstance(argdict, dict)
3656
3657 assert isinstance(keys, (list, tuple))
3658 for key_list in keys:
3659 arg_list = list(filter(
3660 lambda x: x is not None,
3661 [argdict.get(key.lower()) for key in variadic(key_list)]))
3662 if arg_list:
3663 return [arg for args in arg_list for arg in args]
3664 return default
3665
3666
3667 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3668 main_key, exe = main_key.lower(), exe.lower()
3669 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3670 keys = [f'{root_key}{k}' for k in (keys or [''])]
3671 if root_key in keys:
3672 if main_key != exe:
3673 keys.append((main_key, exe))
3674 keys.append('default')
3675 else:
3676 use_compat = False
3677 return cli_configuration_args(argdict, keys, default, use_compat)
3678
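# Editor's note, an illustrative sketch (hypothetical argdict): for
# main_key='postprocessor' and exe='ffmpeg', _configuration_args builds the
# root key 'postprocessor+ffmpeg'; with the default keys=[''], it then falls
# back to ('postprocessor', 'ffmpeg') and finally 'default'. The underlying
# lookup works like this:
#
#     >>> cli_configuration_args(
#     ...     {'default': ['-v'], 'ffmpeg': ['-loglevel', 'error']},
#     ...     [('ffmpeg',), 'default'])
#     ['-loglevel', 'error']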
3679
3680 class ISO639Utils(object):
3681 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3682 _lang_map = {
3683 'aa': 'aar',
3684 'ab': 'abk',
3685 'ae': 'ave',
3686 'af': 'afr',
3687 'ak': 'aka',
3688 'am': 'amh',
3689 'an': 'arg',
3690 'ar': 'ara',
3691 'as': 'asm',
3692 'av': 'ava',
3693 'ay': 'aym',
3694 'az': 'aze',
3695 'ba': 'bak',
3696 'be': 'bel',
3697 'bg': 'bul',
3698 'bh': 'bih',
3699 'bi': 'bis',
3700 'bm': 'bam',
3701 'bn': 'ben',
3702 'bo': 'bod',
3703 'br': 'bre',
3704 'bs': 'bos',
3705 'ca': 'cat',
3706 'ce': 'che',
3707 'ch': 'cha',
3708 'co': 'cos',
3709 'cr': 'cre',
3710 'cs': 'ces',
3711 'cu': 'chu',
3712 'cv': 'chv',
3713 'cy': 'cym',
3714 'da': 'dan',
3715 'de': 'deu',
3716 'dv': 'div',
3717 'dz': 'dzo',
3718 'ee': 'ewe',
3719 'el': 'ell',
3720 'en': 'eng',
3721 'eo': 'epo',
3722 'es': 'spa',
3723 'et': 'est',
3724 'eu': 'eus',
3725 'fa': 'fas',
3726 'ff': 'ful',
3727 'fi': 'fin',
3728 'fj': 'fij',
3729 'fo': 'fao',
3730 'fr': 'fra',
3731 'fy': 'fry',
3732 'ga': 'gle',
3733 'gd': 'gla',
3734 'gl': 'glg',
3735 'gn': 'grn',
3736 'gu': 'guj',
3737 'gv': 'glv',
3738 'ha': 'hau',
3739 'he': 'heb',
3740 'iw': 'heb', # Replaced by he in 1989 revision
3741 'hi': 'hin',
3742 'ho': 'hmo',
3743 'hr': 'hrv',
3744 'ht': 'hat',
3745 'hu': 'hun',
3746 'hy': 'hye',
3747 'hz': 'her',
3748 'ia': 'ina',
3749 'id': 'ind',
3750 'in': 'ind', # Replaced by id in 1989 revision
3751 'ie': 'ile',
3752 'ig': 'ibo',
3753 'ii': 'iii',
3754 'ik': 'ipk',
3755 'io': 'ido',
3756 'is': 'isl',
3757 'it': 'ita',
3758 'iu': 'iku',
3759 'ja': 'jpn',
3760 'jv': 'jav',
3761 'ka': 'kat',
3762 'kg': 'kon',
3763 'ki': 'kik',
3764 'kj': 'kua',
3765 'kk': 'kaz',
3766 'kl': 'kal',
3767 'km': 'khm',
3768 'kn': 'kan',
3769 'ko': 'kor',
3770 'kr': 'kau',
3771 'ks': 'kas',
3772 'ku': 'kur',
3773 'kv': 'kom',
3774 'kw': 'cor',
3775 'ky': 'kir',
3776 'la': 'lat',
3777 'lb': 'ltz',
3778 'lg': 'lug',
3779 'li': 'lim',
3780 'ln': 'lin',
3781 'lo': 'lao',
3782 'lt': 'lit',
3783 'lu': 'lub',
3784 'lv': 'lav',
3785 'mg': 'mlg',
3786 'mh': 'mah',
3787 'mi': 'mri',
3788 'mk': 'mkd',
3789 'ml': 'mal',
3790 'mn': 'mon',
3791 'mr': 'mar',
3792 'ms': 'msa',
3793 'mt': 'mlt',
3794 'my': 'mya',
3795 'na': 'nau',
3796 'nb': 'nob',
3797 'nd': 'nde',
3798 'ne': 'nep',
3799 'ng': 'ndo',
3800 'nl': 'nld',
3801 'nn': 'nno',
3802 'no': 'nor',
3803 'nr': 'nbl',
3804 'nv': 'nav',
3805 'ny': 'nya',
3806 'oc': 'oci',
3807 'oj': 'oji',
3808 'om': 'orm',
3809 'or': 'ori',
3810 'os': 'oss',
3811 'pa': 'pan',
3812 'pi': 'pli',
3813 'pl': 'pol',
3814 'ps': 'pus',
3815 'pt': 'por',
3816 'qu': 'que',
3817 'rm': 'roh',
3818 'rn': 'run',
3819 'ro': 'ron',
3820 'ru': 'rus',
3821 'rw': 'kin',
3822 'sa': 'san',
3823 'sc': 'srd',
3824 'sd': 'snd',
3825 'se': 'sme',
3826 'sg': 'sag',
3827 'si': 'sin',
3828 'sk': 'slk',
3829 'sl': 'slv',
3830 'sm': 'smo',
3831 'sn': 'sna',
3832 'so': 'som',
3833 'sq': 'sqi',
3834 'sr': 'srp',
3835 'ss': 'ssw',
3836 'st': 'sot',
3837 'su': 'sun',
3838 'sv': 'swe',
3839 'sw': 'swa',
3840 'ta': 'tam',
3841 'te': 'tel',
3842 'tg': 'tgk',
3843 'th': 'tha',
3844 'ti': 'tir',
3845 'tk': 'tuk',
3846 'tl': 'tgl',
3847 'tn': 'tsn',
3848 'to': 'ton',
3849 'tr': 'tur',
3850 'ts': 'tso',
3851 'tt': 'tat',
3852 'tw': 'twi',
3853 'ty': 'tah',
3854 'ug': 'uig',
3855 'uk': 'ukr',
3856 'ur': 'urd',
3857 'uz': 'uzb',
3858 've': 'ven',
3859 'vi': 'vie',
3860 'vo': 'vol',
3861 'wa': 'wln',
3862 'wo': 'wol',
3863 'xh': 'xho',
3864 'yi': 'yid',
3865 'ji': 'yid', # Replaced by yi in 1989 revision
3866 'yo': 'yor',
3867 'za': 'zha',
3868 'zh': 'zho',
3869 'zu': 'zul',
3870 }
3871
3872 @classmethod
3873 def short2long(cls, code):
3874 """Convert language code from ISO 639-1 to ISO 639-2/T"""
3875 return cls._lang_map.get(code[:2])
3876
3877 @classmethod
3878 def long2short(cls, code):
3879 """Convert language code from ISO 639-2/T to ISO 639-1"""
3880 for short_name, long_name in cls._lang_map.items():
3881 if long_name == code:
3882 return short_name
3883
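# Editor's note, illustrative examples:
#
#     >>> ISO639Utils.short2long('en')
#     'eng'
#     >>> ISO639Utils.short2long('en-US')  # only the first two letters are used
#     'eng'
#     >>> ISO639Utils.long2short('deu')
#     'de'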
3884
3885 class ISO3166Utils(object):
3886 # From http://data.okfn.org/data/core/country-list
3887 _country_map = {
3888 'AF': 'Afghanistan',
3889 'AX': 'Åland Islands',
3890 'AL': 'Albania',
3891 'DZ': 'Algeria',
3892 'AS': 'American Samoa',
3893 'AD': 'Andorra',
3894 'AO': 'Angola',
3895 'AI': 'Anguilla',
3896 'AQ': 'Antarctica',
3897 'AG': 'Antigua and Barbuda',
3898 'AR': 'Argentina',
3899 'AM': 'Armenia',
3900 'AW': 'Aruba',
3901 'AU': 'Australia',
3902 'AT': 'Austria',
3903 'AZ': 'Azerbaijan',
3904 'BS': 'Bahamas',
3905 'BH': 'Bahrain',
3906 'BD': 'Bangladesh',
3907 'BB': 'Barbados',
3908 'BY': 'Belarus',
3909 'BE': 'Belgium',
3910 'BZ': 'Belize',
3911 'BJ': 'Benin',
3912 'BM': 'Bermuda',
3913 'BT': 'Bhutan',
3914 'BO': 'Bolivia, Plurinational State of',
3915 'BQ': 'Bonaire, Sint Eustatius and Saba',
3916 'BA': 'Bosnia and Herzegovina',
3917 'BW': 'Botswana',
3918 'BV': 'Bouvet Island',
3919 'BR': 'Brazil',
3920 'IO': 'British Indian Ocean Territory',
3921 'BN': 'Brunei Darussalam',
3922 'BG': 'Bulgaria',
3923 'BF': 'Burkina Faso',
3924 'BI': 'Burundi',
3925 'KH': 'Cambodia',
3926 'CM': 'Cameroon',
3927 'CA': 'Canada',
3928 'CV': 'Cape Verde',
3929 'KY': 'Cayman Islands',
3930 'CF': 'Central African Republic',
3931 'TD': 'Chad',
3932 'CL': 'Chile',
3933 'CN': 'China',
3934 'CX': 'Christmas Island',
3935 'CC': 'Cocos (Keeling) Islands',
3936 'CO': 'Colombia',
3937 'KM': 'Comoros',
3938 'CG': 'Congo',
3939 'CD': 'Congo, the Democratic Republic of the',
3940 'CK': 'Cook Islands',
3941 'CR': 'Costa Rica',
3942 'CI': 'Côte d\'Ivoire',
3943 'HR': 'Croatia',
3944 'CU': 'Cuba',
3945 'CW': 'Curaçao',
3946 'CY': 'Cyprus',
3947 'CZ': 'Czech Republic',
3948 'DK': 'Denmark',
3949 'DJ': 'Djibouti',
3950 'DM': 'Dominica',
3951 'DO': 'Dominican Republic',
3952 'EC': 'Ecuador',
3953 'EG': 'Egypt',
3954 'SV': 'El Salvador',
3955 'GQ': 'Equatorial Guinea',
3956 'ER': 'Eritrea',
3957 'EE': 'Estonia',
3958 'ET': 'Ethiopia',
3959 'FK': 'Falkland Islands (Malvinas)',
3960 'FO': 'Faroe Islands',
3961 'FJ': 'Fiji',
3962 'FI': 'Finland',
3963 'FR': 'France',
3964 'GF': 'French Guiana',
3965 'PF': 'French Polynesia',
3966 'TF': 'French Southern Territories',
3967 'GA': 'Gabon',
3968 'GM': 'Gambia',
3969 'GE': 'Georgia',
3970 'DE': 'Germany',
3971 'GH': 'Ghana',
3972 'GI': 'Gibraltar',
3973 'GR': 'Greece',
3974 'GL': 'Greenland',
3975 'GD': 'Grenada',
3976 'GP': 'Guadeloupe',
3977 'GU': 'Guam',
3978 'GT': 'Guatemala',
3979 'GG': 'Guernsey',
3980 'GN': 'Guinea',
3981 'GW': 'Guinea-Bissau',
3982 'GY': 'Guyana',
3983 'HT': 'Haiti',
3984 'HM': 'Heard Island and McDonald Islands',
3985 'VA': 'Holy See (Vatican City State)',
3986 'HN': 'Honduras',
3987 'HK': 'Hong Kong',
3988 'HU': 'Hungary',
3989 'IS': 'Iceland',
3990 'IN': 'India',
3991 'ID': 'Indonesia',
3992 'IR': 'Iran, Islamic Republic of',
3993 'IQ': 'Iraq',
3994 'IE': 'Ireland',
3995 'IM': 'Isle of Man',
3996 'IL': 'Israel',
3997 'IT': 'Italy',
3998 'JM': 'Jamaica',
3999 'JP': 'Japan',
4000 'JE': 'Jersey',
4001 'JO': 'Jordan',
4002 'KZ': 'Kazakhstan',
4003 'KE': 'Kenya',
4004 'KI': 'Kiribati',
4005 'KP': 'Korea, Democratic People\'s Republic of',
4006 'KR': 'Korea, Republic of',
4007 'KW': 'Kuwait',
4008 'KG': 'Kyrgyzstan',
4009 'LA': 'Lao People\'s Democratic Republic',
4010 'LV': 'Latvia',
4011 'LB': 'Lebanon',
4012 'LS': 'Lesotho',
4013 'LR': 'Liberia',
4014 'LY': 'Libya',
4015 'LI': 'Liechtenstein',
4016 'LT': 'Lithuania',
4017 'LU': 'Luxembourg',
4018 'MO': 'Macao',
4019 'MK': 'Macedonia, the Former Yugoslav Republic of',
4020 'MG': 'Madagascar',
4021 'MW': 'Malawi',
4022 'MY': 'Malaysia',
4023 'MV': 'Maldives',
4024 'ML': 'Mali',
4025 'MT': 'Malta',
4026 'MH': 'Marshall Islands',
4027 'MQ': 'Martinique',
4028 'MR': 'Mauritania',
4029 'MU': 'Mauritius',
4030 'YT': 'Mayotte',
4031 'MX': 'Mexico',
4032 'FM': 'Micronesia, Federated States of',
4033 'MD': 'Moldova, Republic of',
4034 'MC': 'Monaco',
4035 'MN': 'Mongolia',
4036 'ME': 'Montenegro',
4037 'MS': 'Montserrat',
4038 'MA': 'Morocco',
4039 'MZ': 'Mozambique',
4040 'MM': 'Myanmar',
4041 'NA': 'Namibia',
4042 'NR': 'Nauru',
4043 'NP': 'Nepal',
4044 'NL': 'Netherlands',
4045 'NC': 'New Caledonia',
4046 'NZ': 'New Zealand',
4047 'NI': 'Nicaragua',
4048 'NE': 'Niger',
4049 'NG': 'Nigeria',
4050 'NU': 'Niue',
4051 'NF': 'Norfolk Island',
4052 'MP': 'Northern Mariana Islands',
4053 'NO': 'Norway',
4054 'OM': 'Oman',
4055 'PK': 'Pakistan',
4056 'PW': 'Palau',
4057 'PS': 'Palestine, State of',
4058 'PA': 'Panama',
4059 'PG': 'Papua New Guinea',
4060 'PY': 'Paraguay',
4061 'PE': 'Peru',
4062 'PH': 'Philippines',
4063 'PN': 'Pitcairn',
4064 'PL': 'Poland',
4065 'PT': 'Portugal',
4066 'PR': 'Puerto Rico',
4067 'QA': 'Qatar',
4068 'RE': 'Réunion',
4069 'RO': 'Romania',
4070 'RU': 'Russian Federation',
4071 'RW': 'Rwanda',
4072 'BL': 'Saint Barthélemy',
4073 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4074 'KN': 'Saint Kitts and Nevis',
4075 'LC': 'Saint Lucia',
4076 'MF': 'Saint Martin (French part)',
4077 'PM': 'Saint Pierre and Miquelon',
4078 'VC': 'Saint Vincent and the Grenadines',
4079 'WS': 'Samoa',
4080 'SM': 'San Marino',
4081 'ST': 'Sao Tome and Principe',
4082 'SA': 'Saudi Arabia',
4083 'SN': 'Senegal',
4084 'RS': 'Serbia',
4085 'SC': 'Seychelles',
4086 'SL': 'Sierra Leone',
4087 'SG': 'Singapore',
4088 'SX': 'Sint Maarten (Dutch part)',
4089 'SK': 'Slovakia',
4090 'SI': 'Slovenia',
4091 'SB': 'Solomon Islands',
4092 'SO': 'Somalia',
4093 'ZA': 'South Africa',
4094 'GS': 'South Georgia and the South Sandwich Islands',
4095 'SS': 'South Sudan',
4096 'ES': 'Spain',
4097 'LK': 'Sri Lanka',
4098 'SD': 'Sudan',
4099 'SR': 'Suriname',
4100 'SJ': 'Svalbard and Jan Mayen',
4101 'SZ': 'Swaziland',
4102 'SE': 'Sweden',
4103 'CH': 'Switzerland',
4104 'SY': 'Syrian Arab Republic',
4105 'TW': 'Taiwan, Province of China',
4106 'TJ': 'Tajikistan',
4107 'TZ': 'Tanzania, United Republic of',
4108 'TH': 'Thailand',
4109 'TL': 'Timor-Leste',
4110 'TG': 'Togo',
4111 'TK': 'Tokelau',
4112 'TO': 'Tonga',
4113 'TT': 'Trinidad and Tobago',
4114 'TN': 'Tunisia',
4115 'TR': 'Turkey',
4116 'TM': 'Turkmenistan',
4117 'TC': 'Turks and Caicos Islands',
4118 'TV': 'Tuvalu',
4119 'UG': 'Uganda',
4120 'UA': 'Ukraine',
4121 'AE': 'United Arab Emirates',
4122 'GB': 'United Kingdom',
4123 'US': 'United States',
4124 'UM': 'United States Minor Outlying Islands',
4125 'UY': 'Uruguay',
4126 'UZ': 'Uzbekistan',
4127 'VU': 'Vanuatu',
4128 'VE': 'Venezuela, Bolivarian Republic of',
4129 'VN': 'Viet Nam',
4130 'VG': 'Virgin Islands, British',
4131 'VI': 'Virgin Islands, U.S.',
4132 'WF': 'Wallis and Futuna',
4133 'EH': 'Western Sahara',
4134 'YE': 'Yemen',
4135 'ZM': 'Zambia',
4136 'ZW': 'Zimbabwe',
4137 }
4138
4139 @classmethod
4140 def short2full(cls, code):
4141 """Convert an ISO 3166-2 country code to the corresponding full name"""
4142 return cls._country_map.get(code.upper())
4143
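# Editor's note, an illustrative example (the lookup is case-insensitive):
#
#     >>> ISO3166Utils.short2full('de')
#     'Germany'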
4144
4145 class GeoUtils(object):
4146 # Major IPv4 address blocks per country
4147 _country_ip_map = {
4148 'AD': '46.172.224.0/19',
4149 'AE': '94.200.0.0/13',
4150 'AF': '149.54.0.0/17',
4151 'AG': '209.59.64.0/18',
4152 'AI': '204.14.248.0/21',
4153 'AL': '46.99.0.0/16',
4154 'AM': '46.70.0.0/15',
4155 'AO': '105.168.0.0/13',
4156 'AP': '182.50.184.0/21',
4157 'AQ': '23.154.160.0/24',
4158 'AR': '181.0.0.0/12',
4159 'AS': '202.70.112.0/20',
4160 'AT': '77.116.0.0/14',
4161 'AU': '1.128.0.0/11',
4162 'AW': '181.41.0.0/18',
4163 'AX': '185.217.4.0/22',
4164 'AZ': '5.197.0.0/16',
4165 'BA': '31.176.128.0/17',
4166 'BB': '65.48.128.0/17',
4167 'BD': '114.130.0.0/16',
4168 'BE': '57.0.0.0/8',
4169 'BF': '102.178.0.0/15',
4170 'BG': '95.42.0.0/15',
4171 'BH': '37.131.0.0/17',
4172 'BI': '154.117.192.0/18',
4173 'BJ': '137.255.0.0/16',
4174 'BL': '185.212.72.0/23',
4175 'BM': '196.12.64.0/18',
4176 'BN': '156.31.0.0/16',
4177 'BO': '161.56.0.0/16',
4178 'BQ': '161.0.80.0/20',
4179 'BR': '191.128.0.0/12',
4180 'BS': '24.51.64.0/18',
4181 'BT': '119.2.96.0/19',
4182 'BW': '168.167.0.0/16',
4183 'BY': '178.120.0.0/13',
4184 'BZ': '179.42.192.0/18',
4185 'CA': '99.224.0.0/11',
4186 'CD': '41.243.0.0/16',
4187 'CF': '197.242.176.0/21',
4188 'CG': '160.113.0.0/16',
4189 'CH': '85.0.0.0/13',
4190 'CI': '102.136.0.0/14',
4191 'CK': '202.65.32.0/19',
4192 'CL': '152.172.0.0/14',
4193 'CM': '102.244.0.0/14',
4194 'CN': '36.128.0.0/10',
4195 'CO': '181.240.0.0/12',
4196 'CR': '201.192.0.0/12',
4197 'CU': '152.206.0.0/15',
4198 'CV': '165.90.96.0/19',
4199 'CW': '190.88.128.0/17',
4200 'CY': '31.153.0.0/16',
4201 'CZ': '88.100.0.0/14',
4202 'DE': '53.0.0.0/8',
4203 'DJ': '197.241.0.0/17',
4204 'DK': '87.48.0.0/12',
4205 'DM': '192.243.48.0/20',
4206 'DO': '152.166.0.0/15',
4207 'DZ': '41.96.0.0/12',
4208 'EC': '186.68.0.0/15',
4209 'EE': '90.190.0.0/15',
4210 'EG': '156.160.0.0/11',
4211 'ER': '196.200.96.0/20',
4212 'ES': '88.0.0.0/11',
4213 'ET': '196.188.0.0/14',
4214 'EU': '2.16.0.0/13',
4215 'FI': '91.152.0.0/13',
4216 'FJ': '144.120.0.0/16',
4217 'FK': '80.73.208.0/21',
4218 'FM': '119.252.112.0/20',
4219 'FO': '88.85.32.0/19',
4220 'FR': '90.0.0.0/9',
4221 'GA': '41.158.0.0/15',
4222 'GB': '25.0.0.0/8',
4223 'GD': '74.122.88.0/21',
4224 'GE': '31.146.0.0/16',
4225 'GF': '161.22.64.0/18',
4226 'GG': '62.68.160.0/19',
4227 'GH': '154.160.0.0/12',
4228 'GI': '95.164.0.0/16',
4229 'GL': '88.83.0.0/19',
4230 'GM': '160.182.0.0/15',
4231 'GN': '197.149.192.0/18',
4232 'GP': '104.250.0.0/19',
4233 'GQ': '105.235.224.0/20',
4234 'GR': '94.64.0.0/13',
4235 'GT': '168.234.0.0/16',
4236 'GU': '168.123.0.0/16',
4237 'GW': '197.214.80.0/20',
4238 'GY': '181.41.64.0/18',
4239 'HK': '113.252.0.0/14',
4240 'HN': '181.210.0.0/16',
4241 'HR': '93.136.0.0/13',
4242 'HT': '148.102.128.0/17',
4243 'HU': '84.0.0.0/14',
4244 'ID': '39.192.0.0/10',
4245 'IE': '87.32.0.0/12',
4246 'IL': '79.176.0.0/13',
4247 'IM': '5.62.80.0/20',
4248 'IN': '117.192.0.0/10',
4249 'IO': '203.83.48.0/21',
4250 'IQ': '37.236.0.0/14',
4251 'IR': '2.176.0.0/12',
4252 'IS': '82.221.0.0/16',
4253 'IT': '79.0.0.0/10',
4254 'JE': '87.244.64.0/18',
4255 'JM': '72.27.0.0/17',
4256 'JO': '176.29.0.0/16',
4257 'JP': '133.0.0.0/8',
4258 'KE': '105.48.0.0/12',
4259 'KG': '158.181.128.0/17',
4260 'KH': '36.37.128.0/17',
4261 'KI': '103.25.140.0/22',
4262 'KM': '197.255.224.0/20',
4263 'KN': '198.167.192.0/19',
4264 'KP': '175.45.176.0/22',
4265 'KR': '175.192.0.0/10',
4266 'KW': '37.36.0.0/14',
4267 'KY': '64.96.0.0/15',
4268 'KZ': '2.72.0.0/13',
4269 'LA': '115.84.64.0/18',
4270 'LB': '178.135.0.0/16',
4271 'LC': '24.92.144.0/20',
4272 'LI': '82.117.0.0/19',
4273 'LK': '112.134.0.0/15',
4274 'LR': '102.183.0.0/16',
4275 'LS': '129.232.0.0/17',
4276 'LT': '78.56.0.0/13',
4277 'LU': '188.42.0.0/16',
4278 'LV': '46.109.0.0/16',
4279 'LY': '41.252.0.0/14',
4280 'MA': '105.128.0.0/11',
4281 'MC': '88.209.64.0/18',
4282 'MD': '37.246.0.0/16',
4283 'ME': '178.175.0.0/17',
4284 'MF': '74.112.232.0/21',
4285 'MG': '154.126.0.0/17',
4286 'MH': '117.103.88.0/21',
4287 'MK': '77.28.0.0/15',
4288 'ML': '154.118.128.0/18',
4289 'MM': '37.111.0.0/17',
4290 'MN': '49.0.128.0/17',
4291 'MO': '60.246.0.0/16',
4292 'MP': '202.88.64.0/20',
4293 'MQ': '109.203.224.0/19',
4294 'MR': '41.188.64.0/18',
4295 'MS': '208.90.112.0/22',
4296 'MT': '46.11.0.0/16',
4297 'MU': '105.16.0.0/12',
4298 'MV': '27.114.128.0/18',
4299 'MW': '102.70.0.0/15',
4300 'MX': '187.192.0.0/11',
4301 'MY': '175.136.0.0/13',
4302 'MZ': '197.218.0.0/15',
4303 'NA': '41.182.0.0/16',
4304 'NC': '101.101.0.0/18',
4305 'NE': '197.214.0.0/18',
4306 'NF': '203.17.240.0/22',
4307 'NG': '105.112.0.0/12',
4308 'NI': '186.76.0.0/15',
4309 'NL': '145.96.0.0/11',
4310 'NO': '84.208.0.0/13',
4311 'NP': '36.252.0.0/15',
4312 'NR': '203.98.224.0/19',
4313 'NU': '49.156.48.0/22',
4314 'NZ': '49.224.0.0/14',
4315 'OM': '5.36.0.0/15',
4316 'PA': '186.72.0.0/15',
4317 'PE': '186.160.0.0/14',
4318 'PF': '123.50.64.0/18',
4319 'PG': '124.240.192.0/19',
4320 'PH': '49.144.0.0/13',
4321 'PK': '39.32.0.0/11',
4322 'PL': '83.0.0.0/11',
4323 'PM': '70.36.0.0/20',
4324 'PR': '66.50.0.0/16',
4325 'PS': '188.161.0.0/16',
4326 'PT': '85.240.0.0/13',
4327 'PW': '202.124.224.0/20',
4328 'PY': '181.120.0.0/14',
4329 'QA': '37.210.0.0/15',
4330 'RE': '102.35.0.0/16',
4331 'RO': '79.112.0.0/13',
4332 'RS': '93.86.0.0/15',
4333 'RU': '5.136.0.0/13',
4334 'RW': '41.186.0.0/16',
4335 'SA': '188.48.0.0/13',
4336 'SB': '202.1.160.0/19',
4337 'SC': '154.192.0.0/11',
4338 'SD': '102.120.0.0/13',
4339 'SE': '78.64.0.0/12',
4340 'SG': '8.128.0.0/10',
4341 'SI': '188.196.0.0/14',
4342 'SK': '78.98.0.0/15',
4343 'SL': '102.143.0.0/17',
4344 'SM': '89.186.32.0/19',
4345 'SN': '41.82.0.0/15',
4346 'SO': '154.115.192.0/18',
4347 'SR': '186.179.128.0/17',
4348 'SS': '105.235.208.0/21',
4349 'ST': '197.159.160.0/19',
4350 'SV': '168.243.0.0/16',
4351 'SX': '190.102.0.0/20',
4352 'SY': '5.0.0.0/16',
4353 'SZ': '41.84.224.0/19',
4354 'TC': '65.255.48.0/20',
4355 'TD': '154.68.128.0/19',
4356 'TG': '196.168.0.0/14',
4357 'TH': '171.96.0.0/13',
4358 'TJ': '85.9.128.0/18',
4359 'TK': '27.96.24.0/21',
4360 'TL': '180.189.160.0/20',
4361 'TM': '95.85.96.0/19',
4362 'TN': '197.0.0.0/11',
4363 'TO': '175.176.144.0/21',
4364 'TR': '78.160.0.0/11',
4365 'TT': '186.44.0.0/15',
4366 'TV': '202.2.96.0/19',
4367 'TW': '120.96.0.0/11',
4368 'TZ': '156.156.0.0/14',
4369 'UA': '37.52.0.0/14',
4370 'UG': '102.80.0.0/13',
4371 'US': '6.0.0.0/8',
4372 'UY': '167.56.0.0/13',
4373 'UZ': '84.54.64.0/18',
4374 'VA': '212.77.0.0/19',
4375 'VC': '207.191.240.0/21',
4376 'VE': '186.88.0.0/13',
4377 'VG': '66.81.192.0/20',
4378 'VI': '146.226.0.0/16',
4379 'VN': '14.160.0.0/11',
4380 'VU': '202.80.32.0/20',
4381 'WF': '117.20.32.0/21',
4382 'WS': '202.4.32.0/19',
4383 'YE': '134.35.0.0/16',
4384 'YT': '41.242.116.0/22',
4385 'ZA': '41.0.0.0/11',
4386 'ZM': '102.144.0.0/13',
4387 'ZW': '102.177.192.0/18',
4388 }
4389
4390 @classmethod
4391 def random_ipv4(cls, code_or_block):
4392 if len(code_or_block) == 2:
4393 block = cls._country_ip_map.get(code_or_block.upper())
4394 if not block:
4395 return None
4396 else:
4397 block = code_or_block
4398 addr, preflen = block.split('/')
4399 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4400 addr_max = addr_min | (0xffffffff >> int(preflen))
4401 return compat_str(socket.inet_ntoa(
4402 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4403
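# Editor's note, a worked example: for the documentation block
# '203.0.113.0/24' (TEST-NET-3), addr_min is 0xcb007100 and
# addr_max = addr_min | (0xffffffff >> 24) = 0xcb0071ff, so random_ipv4
# returns a uniformly random address in 203.0.113.0-203.0.113.255.
# A two-letter argument is first resolved through _country_ip_map:
#
#     >>> GeoUtils.random_ipv4('DE')   # doctest: +SKIP (result is random)
#     '53.187.42.7'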
4404
4405 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4406 def __init__(self, proxies=None):
4407 # Set default handlers
4408 for type in ('http', 'https'):
4409 setattr(self, '%s_open' % type,
4410 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4411 meth(r, proxy, type))
4412 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4413
4414 def proxy_open(self, req, proxy, type):
4415 req_proxy = req.headers.get('Ytdl-request-proxy')
4416 if req_proxy is not None:
4417 proxy = req_proxy
4418 del req.headers['Ytdl-request-proxy']
4419
4420 if proxy == '__noproxy__':
4421 return None # No Proxy
4422 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4423 req.add_header('Ytdl-socks-proxy', proxy)
4424 # yt-dlp's http/https handlers wrap the socket with SOCKS themselves
4425 return None
4426 return compat_urllib_request.ProxyHandler.proxy_open(
4427 self, req, proxy, type)
4428
4429
4430 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4431 # released into Public Domain
4432 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4433
4434 def long_to_bytes(n, blocksize=0):
4435 """long_to_bytes(n:long, blocksize:int) : string
4436 Convert a long integer to a byte string.
4437
4438 If optional blocksize is given and greater than zero, pad the front of the
4439 byte string with binary zeros so that the length is a multiple of
4440 blocksize.
4441 """
4442 # after much testing, this algorithm was deemed to be the fastest
4443 s = b''
4444 n = int(n)
4445 while n > 0:
4446 s = compat_struct_pack('>I', n & 0xffffffff) + s
4447 n = n >> 32
4448 # strip off leading zeros
4449 for i in range(len(s)):
4450 if s[i] != b'\000'[0]:
4451 break
4452 else:
4453 # only happens when n == 0
4454 s = b'\000'
4455 i = 0
4456 s = s[i:]
4457 # add back some pad bytes. this could be done more efficiently w.r.t. the
4458 # de-padding being done above, but sigh...
4459 if blocksize > 0 and len(s) % blocksize:
4460 s = (blocksize - len(s) % blocksize) * b'\000' + s
4461 return s
4462
4463
4464 def bytes_to_long(s):
4465 """bytes_to_long(string) : long
4466 Convert a byte string to a long integer.
4467
4468 This is (essentially) the inverse of long_to_bytes().
4469 """
4470 acc = 0
4471 length = len(s)
4472 if length % 4:
4473 extra = (4 - length % 4)
4474 s = b'\000' * extra + s
4475 length = length + extra
4476 for i in range(0, length, 4):
4477 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4478 return acc
4479
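# Editor's note, illustrative round-trip examples:
#
#     >>> bytes_to_long(b'\x01\x00')
#     256
#     >>> long_to_bytes(256)
#     b'\x01\x00'
#     >>> long_to_bytes(1, blocksize=4)   # front-padded to a multiple of 4
#     b'\x00\x00\x00\x01'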
4480
4481 def ohdave_rsa_encrypt(data, exponent, modulus):
4482 '''
4483 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4484
4485 Input:
4486 data: data to encrypt, bytes-like object
4487 exponent, modulus: parameter e and N of RSA algorithm, both integer
4488 Output: hex string of encrypted data
4489
4490 Limitation: supports one block encryption only
4491 '''
4492
4493 payload = int(binascii.hexlify(data[::-1]), 16)
4494 encrypted = pow(payload, exponent, modulus)
4495 return '%x' % encrypted
4496
4497
4498 def pkcs1pad(data, length):
4499 """
4500 Padding input data with PKCS#1 scheme
4501
4502 @param {int[]} data input data
4503 @param {int} length target length
4504 @returns {int[]} padded data
4505 """
4506 if len(data) > length - 11:
4507 raise ValueError('Input data too long for PKCS#1 padding')
4508
4509 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # padding bytes must be non-zero (RFC 8017 §7.2.1)
4510 return [0, 2] + pseudo_random + [0] + data
4511
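# Editor's note, an illustrative example: padding a 3-byte message to a
# 16-byte block yields [0, 2, <10 pseudo-random padding bytes>, 0, *data], so
# the payload always sits after the first zero byte following the 0x02 marker:
#
#     >>> padded = pkcs1pad([1, 2, 3], 16)
#     >>> padded[:2], padded[-4:], len(padded)
#     ([0, 2], [0, 1, 2, 3], 16)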
4512
4513 def encode_base_n(num, n, table=None):
4514 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4515 if not table:
4516 table = FULL_TABLE[:n]
4517
4518 if n > len(table):
4519 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4520
4521 if num == 0:
4522 return table[0]
4523
4524 ret = ''
4525 while num:
4526 ret = table[num % n] + ret
4527 num = num // n
4528 return ret
4529
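# Editor's note, illustrative examples:
#
#     >>> encode_base_n(255, 16)
#     'ff'
#     >>> encode_base_n(35, 36)
#     'z'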
4530
4531 def decode_packed_codes(code):
4532 mobj = re.search(PACKED_CODES_RE, code)
4533 obfuscated_code, base, count, symbols = mobj.groups()
4534 base = int(base)
4535 count = int(count)
4536 symbols = symbols.split('|')
4537 symbol_table = {}
4538
4539 while count:
4540 count -= 1
4541 base_n_count = encode_base_n(count, base)
4542 symbol_table[base_n_count] = symbols[count] or base_n_count
4543
4544 return re.sub(
4545 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4546 obfuscated_code)
4547
4548
4549 def caesar(s, alphabet, shift):
4550 if shift == 0:
4551 return s
4552 l = len(alphabet)
4553 return ''.join(
4554 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4555 for c in s)
4556
4557
4558 def rot47(s):
4559 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4560
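# Editor's note, illustrative examples; since 2 * 47 == 94 (the size of the
# printable-ASCII alphabet above), rot47 is its own inverse:
#
#     >>> caesar('abc', 'abcdefghijklmnopqrstuvwxyz', 1)
#     'bcd'
#     >>> rot47('Hello')
#     'w6==@'
#     >>> rot47(rot47('Hello'))
#     'Hello'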
4561
4562 def parse_m3u8_attributes(attrib):
4563 info = {}
4564 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4565 if val.startswith('"'):
4566 val = val[1:-1]
4567 info[key] = val
4568 return info
4569
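# Editor's note, an illustrative example (quoted values may contain commas):
#
#     >>> parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.4d401f"')
#     {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.4d401f'}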
4570
4571 def urshift(val, n):
4572 return val >> n if val >= 0 else (val + 0x100000000) >> n
4573
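# Editor's note, an illustrative example: urshift emulates an unsigned 32-bit
# right shift (JavaScript's >>>), which Python's arithmetic >> does not do for
# negative numbers:
#
#     >>> -1 >> 1, urshift(-1, 1)
#     (-1, 2147483647)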
4574
4575 # Based on png2str() written by @gdkchan and improved by @yokrysty
4576 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4577 def decode_png(png_data):
4578 # Reference: https://www.w3.org/TR/PNG/
4579 header = png_data[8:]
4580
4581 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4582 raise IOError('Not a valid PNG file.')
4583
4584 int_map = {1: '>B', 2: '>H', 4: '>I'}
4585 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4586
4587 chunks = []
4588
4589 while header:
4590 length = unpack_integer(header[:4])
4591 header = header[4:]
4592
4593 chunk_type = header[:4]
4594 header = header[4:]
4595
4596 chunk_data = header[:length]
4597 header = header[length:]
4598
4599 header = header[4:] # Skip CRC
4600
4601 chunks.append({
4602 'type': chunk_type,
4603 'length': length,
4604 'data': chunk_data
4605 })
4606
4607 ihdr = chunks[0]['data']
4608
4609 width = unpack_integer(ihdr[:4])
4610 height = unpack_integer(ihdr[4:8])
4611
4612 idat = b''
4613
4614 for chunk in chunks:
4615 if chunk['type'] == b'IDAT':
4616 idat += chunk['data']
4617
4618 if not idat:
4619 raise IOError('Unable to read PNG data.')
4620
4621 decompressed_data = bytearray(zlib.decompress(idat))
4622
4623 stride = width * 3
4624 pixels = []
4625
4626 def _get_pixel(idx):
4627 x = idx % stride
4628 y = idx // stride
4629 return pixels[y][x]
4630
4631 for y in range(height):
4632 basePos = y * (1 + stride)
4633 filter_type = decompressed_data[basePos]
4634
4635 current_row = []
4636
4637 pixels.append(current_row)
4638
4639 for x in range(stride):
4640 color = decompressed_data[1 + basePos + x]
4641 basex = y * stride + x
4642 left = 0
4643 up = 0
4644
4645 if x > 2:
4646 left = _get_pixel(basex - 3)
4647 if y > 0:
4648 up = _get_pixel(basex - stride)
4649
4650 if filter_type == 1: # Sub
4651 color = (color + left) & 0xff
4652 elif filter_type == 2: # Up
4653 color = (color + up) & 0xff
4654 elif filter_type == 3: # Average
4655 color = (color + ((left + up) >> 1)) & 0xff
4656 elif filter_type == 4: # Paeth
4657 a = left
4658 b = up
4659 c = 0
4660
4661 if x > 2 and y > 0:
4662 c = _get_pixel(basex - stride - 3)
4663
4664 p = a + b - c
4665
4666 pa = abs(p - a)
4667 pb = abs(p - b)
4668 pc = abs(p - c)
4669
4670 if pa <= pb and pa <= pc:
4671 color = (color + a) & 0xff
4672 elif pb <= pc:
4673 color = (color + b) & 0xff
4674 else:
4675 color = (color + c) & 0xff
4676
4677 current_row.append(color)
4678
4679 return width, height, pixels
4680
4681
4682 def write_xattr(path, key, value):
4683 # This mess below finds the best xattr tool for the job
4684 try:
4685 # try the pyxattr module...
4686 import xattr
4687
4688 if hasattr(xattr, 'set'): # pyxattr
4689 # Unicode arguments are not supported in python-pyxattr until
4690 # version 0.5.0
4691 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4692 pyxattr_required_version = '0.5.0'
4693 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
4694 # TODO: fallback to CLI tools
4695 raise XAttrUnavailableError(
4696 'python-pyxattr is detected but is too old. '
4697 'yt-dlp requires %s or above while your version is %s. '
4698 'Falling back to other xattr implementations' % (
4699 pyxattr_required_version, xattr.__version__))
4700
4701 setxattr = xattr.set
4702 else: # xattr
4703 setxattr = xattr.setxattr
4704
4705 try:
4706 setxattr(path, key, value)
4707 except EnvironmentError as e:
4708 raise XAttrMetadataError(e.errno, e.strerror)
4709
4710 except ImportError:
4711 if compat_os_name == 'nt':
4712 # Write xattrs to NTFS Alternate Data Streams:
4713 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4714 assert ':' not in key
4715 assert os.path.exists(path)
4716
4717 ads_fn = path + ':' + key
4718 try:
4719 with open(ads_fn, 'wb') as f:
4720 f.write(value)
4721 except EnvironmentError as e:
4722 raise XAttrMetadataError(e.errno, e.strerror)
4723 else:
4724 user_has_setfattr = check_executable('setfattr', ['--version'])
4725 user_has_xattr = check_executable('xattr', ['-h'])
4726
4727 if user_has_setfattr or user_has_xattr:
4728
4729 value = value.decode('utf-8')
4730 if user_has_setfattr:
4731 executable = 'setfattr'
4732 opts = ['-n', key, '-v', value]
4733 elif user_has_xattr:
4734 executable = 'xattr'
4735 opts = ['-w', key, value]
4736
4737 cmd = ([encodeFilename(executable, True)]
4738 + [encodeArgument(o) for o in opts]
4739 + [encodeFilename(path, True)])
4740
4741 try:
4742 p = Popen(
4743 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4744 except EnvironmentError as e:
4745 raise XAttrMetadataError(e.errno, e.strerror)
4746 stdout, stderr = p.communicate_or_kill()
4747 stderr = stderr.decode('utf-8', 'replace')
4748 if p.returncode != 0:
4749 raise XAttrMetadataError(p.returncode, stderr)
4750
4751 else:
4752 # On Unix, but neither pyxattr, setfattr nor xattr could be found.
4753 if sys.platform.startswith('linux'):
4754 raise XAttrUnavailableError(
4755 "Couldn't find a tool to set the xattrs. "
4756 "Install either the python 'pyxattr' or 'xattr' "
4757 "modules, or the GNU 'attr' package "
4758 "(which contains the 'setfattr' tool).")
4759 else:
4760 raise XAttrUnavailableError(
4761 "Couldn't find a tool to set the xattrs. "
4762 "Install either the python 'xattr' module, "
4763 "or the 'xattr' binary.")
4764
4765
4766 def random_birthday(year_field, month_field, day_field):
4767 start_date = datetime.date(1950, 1, 1)
4768 end_date = datetime.date(1995, 12, 31)
4769 offset = random.randint(0, (end_date - start_date).days)
4770 random_date = start_date + datetime.timedelta(offset)
4771 return {
4772 year_field: str(random_date.year),
4773 month_field: str(random_date.month),
4774 day_field: str(random_date.day),
4775 }
4776
4777
4778 # Templates for internet shortcut files, which are plain text files.
4779 DOT_URL_LINK_TEMPLATE = '''
4780 [InternetShortcut]
4781 URL=%(url)s
4782 '''.lstrip()
4783
4784 DOT_WEBLOC_LINK_TEMPLATE = '''
4785 <?xml version="1.0" encoding="UTF-8"?>
4786 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4787 <plist version="1.0">
4788 <dict>
4789 \t<key>URL</key>
4790 \t<string>%(url)s</string>
4791 </dict>
4792 </plist>
4793 '''.lstrip()
4794
4795 DOT_DESKTOP_LINK_TEMPLATE = '''
4796 [Desktop Entry]
4797 Encoding=UTF-8
4798 Name=%(filename)s
4799 Type=Link
4800 URL=%(url)s
4801 Icon=text-html
4802 '''.lstrip()
4803
4804 LINK_TEMPLATES = {
4805 'url': DOT_URL_LINK_TEMPLATE,
4806 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4807 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4808 }
4809
4810
4811 def iri_to_uri(iri):
4812 """
4813 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4814
4815 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4816 """
4817
4818 iri_parts = compat_urllib_parse_urlparse(iri)
4819
4820 if '[' in iri_parts.netloc:
4821 raise ValueError('IPv6 URIs are not, yet, supported.')
4822 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4823
4824 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4825
4826 net_location = ''
4827 if iri_parts.username:
4828 net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
4829 if iri_parts.password is not None:
4830 net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
4831 net_location += '@'
4832
4833 net_location += iri_parts.hostname.encode('idna').decode('utf-8') # Punycode for Unicode hostnames.
4834 # The 'idna' encoding produces ASCII text.
4835 if iri_parts.port is not None and iri_parts.port != 80:
4836 net_location += ':' + str(iri_parts.port)
4837
4838 return compat_urllib_parse_urlunparse(
4839 (iri_parts.scheme,
4840 net_location,
4841
4842 compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
4843
4844 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
4845 compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
4846
4847 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
4848 compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
4849
4850 compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4851
4852 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
4853
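# Editor's note, an illustrative example (hypothetical host): the hostname is
# punycoded while path and query are percent-encoded as UTF-8:
#
#     >>> iri_to_uri('https://münchen.example/straße?q=ä')
#     'https://xn--mnchen-3ya.example/stra%C3%9Fe?q=%C3%A4'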
4854
4855 def to_high_limit_path(path):
4856 if sys.platform in ['win32', 'cygwin']:
4857 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
4858 return r'\\?\ '.rstrip() + os.path.abspath(path)  # the rstrip dance is needed since a raw string cannot end with a backslash
4859
4860 return path
4861
4862
4863 def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
4864 if field is None:
4865 val = obj if obj is not None else default
4866 else:
4867 val = obj.get(field, default)
4868 if func and val not in ignore:
4869 val = func(val)
4870 return template % val if val not in ignore else default
4871
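# Editor's note, illustrative examples:
#
#     >>> format_field({'width': 1920}, 'width', '%dpx')
#     '1920px'
#     >>> format_field({'width': None}, 'width', '%dpx', default='unknown')
#     'unknown'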
4872
4873 def clean_podcast_url(url):
4874 return re.sub(r'''(?x)
4875 (?:
4876 (?:
4877 chtbl\.com/track|
4878 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
4879 play\.podtrac\.com
4880 )/[^/]+|
4881 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
4882 flex\.acast\.com|
4883 pd(?:
4884 cn\.co| # https://podcorn.com/analytics-prefix/
4885 st\.fm # https://podsights.com/docs/
4886 )/e
4887 )/''', '', url)
4888
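# Editor's note, an illustrative example (hypothetical URL): the tracking
# prefix is stripped, leaving the direct media URL:
#
#     >>> clean_podcast_url('https://chtbl.com/track/12345/traffic.example.com/episode.mp3')
#     'https://traffic.example.com/episode.mp3'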
4889
4890 _HEX_TABLE = '0123456789abcdef'
4891
4892
4893 def random_uuidv4():
4894 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
4895
4896
4897 def make_dir(path, to_screen=None):
4898 try:
4899 dn = os.path.dirname(path)
4900 if dn and not os.path.exists(dn):
4901 os.makedirs(dn)
4902 return True
4903 except (OSError, IOError) as err:
4904 if callable(to_screen):
4905 to_screen('unable to create directory ' + error_to_compat_str(err))
4906 return False
4907
4908
4909 def get_executable_path():
4910 from zipimport import zipimporter
4911 if hasattr(sys, 'frozen'): # Running from PyInstaller
4912 path = os.path.dirname(sys.executable)
4913 elif isinstance(globals().get('__loader__'), zipimporter): # Running from ZIP
4914 path = os.path.join(os.path.dirname(__file__), '../..')
4915 else:
4916 path = os.path.join(os.path.dirname(__file__), '..')
4917 return os.path.abspath(path)
4918
4919
4920 def load_plugins(name, suffix, namespace):
4921 classes = {}
4922 try:
4923 plugins_spec = importlib.util.spec_from_file_location(
4924 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
4925 plugins = importlib.util.module_from_spec(plugins_spec)
4926 sys.modules[plugins_spec.name] = plugins
4927 plugins_spec.loader.exec_module(plugins)
4928 for name in dir(plugins):
4929 if name in namespace:
4930 continue
4931 if not name.endswith(suffix):
4932 continue
4933 klass = getattr(plugins, name)
4934 classes[name] = namespace[name] = klass
4935 except FileNotFoundError:
4936 pass
4937 return classes
4938
4939
4940 def traverse_obj(
4941 obj, *path_list, default=None, expected_type=None, get_all=True,
4942 casesense=True, is_user_input=False, traverse_string=False):
4943 ''' Traverse nested list/dict/tuple
4944 @param path_list A list of paths which are checked one by one.
4945 Each path is a list of keys where each key is a string,
4946 a function, a tuple of strings/None or "...".
3947 When a function is given, it takes the key as argument and
3948 returns whether the key matches or not. When a tuple is given,
3949 all the keys given in the tuple are traversed.
3950 "..." traverses all the keys in the object, and
3951 "None" returns the object without traversal.
4952 @param default Default value to return
4953 @param expected_type Only accept final value of this type (Can also be any callable)
4954 @param get_all Return all the values obtained from a path or only the first one
4955 @param casesense Whether to consider dictionary keys as case sensitive
4956 @param is_user_input Whether the keys are generated from user input. If True,
4957 strings are converted to int/slice if necessary
4958 @param traverse_string Whether to traverse inside strings. If True, any
4959 non-compatible object will also be converted into a string
4960 # TODO: Write tests
4961 '''
4962 if not casesense:
4963 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
4964 path_list = (map(_lower, variadic(path)) for path in path_list)
4965
4966 def _traverse_obj(obj, path, _current_depth=0):
4967 nonlocal depth
4968 path = tuple(variadic(path))
4969 for i, key in enumerate(path):
4970 if None in (key, obj):
4971 return obj
4972 if isinstance(key, (list, tuple)):
4973 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
4974 key = ...
4975 if key is ...:
4976 obj = (obj.values() if isinstance(obj, dict)
4977 else obj if isinstance(obj, (list, tuple, LazyList))
4978 else str(obj) if traverse_string else [])
4979 _current_depth += 1
4980 depth = max(depth, _current_depth)
4981 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
4982 elif callable(key):
4983 if isinstance(obj, (list, tuple, LazyList)):
4984 obj = enumerate(obj)
4985 elif isinstance(obj, dict):
4986 obj = obj.items()
4987 else:
4988 if not traverse_string:
4989 return None
4990 obj = str(obj)
4991 _current_depth += 1
4992 depth = max(depth, _current_depth)
4993 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
4994 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
4995 obj = (obj.get(key) if casesense or (key in obj)
4996 else next((v for k, v in obj.items() if _lower(k) == key), None))
4997 else:
4998 if is_user_input:
4999 key = (int_or_none(key) if ':' not in key
5000 else slice(*map(int_or_none, key.split(':'))))
5001 if key == slice(None):
5002 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5003 if not isinstance(key, (int, slice)):
5004 return None
5005 if not isinstance(obj, (list, tuple, LazyList)):
5006 if not traverse_string:
5007 return None
5008 obj = str(obj)
5009 try:
5010 obj = obj[key]
5011 except IndexError:
5012 return None
5013 return obj
5014
5015 if isinstance(expected_type, type):
5016 type_test = lambda val: val if isinstance(val, expected_type) else None
5017 elif expected_type is not None:
5018 type_test = expected_type
5019 else:
5020 type_test = lambda val: val
5021
5022 for path in path_list:
5023 depth = 0
5024 val = _traverse_obj(obj, path)
5025 if val is not None:
5026 if depth:
5027 for _ in range(depth - 1):
5028 val = itertools.chain.from_iterable(v for v in val if v is not None)
5029 val = [v for v in map(type_test, val) if v is not None]
5030 if val:
5031 return val if get_all else val[0]
5032 else:
5033 val = type_test(val)
5034 if val is not None:
5035 return val
5036 return default
5037
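# Editor's note, illustrative examples for traverse_obj (the docstring above
# still carries a "TODO: Write tests"):
#
#     >>> d = {'playlist': [{'id': 1, 'title': 'a'}, {'id': 2}]}
#     >>> traverse_obj(d, ('playlist', 0, 'title'))
#     'a'
#     >>> traverse_obj(d, ('playlist', ..., 'id'))   # '...' visits every entry
#     [1, 2]
#     >>> traverse_obj(d, ('playlist', 1, 'title'), ('playlist', 0, 'title'))  # first path that yields a value wins
#     'a'
#     >>> traverse_obj(d, ('playlist', ..., 'title'), get_all=False)
#     'a'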
5038
5039 # Deprecated
5040 def traverse_dict(dictn, keys, casesense=True):
5041 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5042 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5043 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5044
5045
5046 def variadic(x, allowed_types=(str, bytes, dict)):
5047 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5048
5049
5050 # create a JSON Web Signature (jws) with HS256 algorithm
5051 # the resulting format is in JWS Compact Serialization
5052 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5053 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5054 def jwt_encode_hs256(payload_data, key, headers={}):
5055 header_data = {
5056 'alg': 'HS256',
5057 'typ': 'JWT',
5058 }
5059 if headers:
5060 header_data.update(headers)
5061 header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
5062 payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
5063 h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
5064 signature_b64 = base64.b64encode(h.digest())
5065 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5066 return token
5067
5068
5069 # This can be extended in the future to verify the signature, parse the header and return the algorithm used if it is not HS256
5070 def jwt_decode_hs256(jwt):
5071 header_b64, payload_b64, signature_b64 = jwt.split('.')
5072 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5073 return payload_data
5074
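# Editor's note: jwt_encode_hs256 emits padded standard base64 rather than the
# unpadded base64url that RFC 7515 specifies, so strict consumers may reject
# its tokens; jwt_decode_hs256 happens to accept either alphabet. An
# illustrative round trip:
#
#     >>> token = jwt_encode_hs256({'uid': 1}, 'secret')
#     >>> jwt_decode_hs256(token.decode('utf-8'))
#     {'uid': 1}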
5075
5076 def supports_terminal_sequences(stream):
5077 if compat_os_name == 'nt':
5078 from .compat import WINDOWS_VT_MODE # Must be imported locally
5079 if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
5080 return False
5081 elif not os.getenv('TERM'):
5082 return False
5083 try:
5084 return stream.isatty()
5085 except BaseException:
5086 return False
5087
5088
5089 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5090
5091
5092 def remove_terminal_sequences(string):
5093 return _terminal_sequences_re.sub('', string)
5094
5095
5096 def number_of_digits(number):
5097 return len('%d' % number)
5098
5099
5100 def join_nonempty(*values, delim='-', from_dict=None):
5101 if from_dict is not None:
5102 values = map(from_dict.get, values)
5103 return delim.join(map(str, filter(None, values)))
5104
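# Editor's note, illustrative examples (note that filter(None, ...) also drops
# falsy values such as 0 and ''):
#
#     >>> join_nonempty('mp4', None, '', 'dash')
#     'mp4-dash'
#     >>> join_nonempty('height', 'width', delim='x', from_dict={'height': 1080, 'width': 1920})
#     '1080x1920'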
5105
5106 class Config:
5107 own_args = None
5108 filename = None
5109 __initialized = False
5110
5111 def __init__(self, parser, label=None):
5112 self._parser, self.label = parser, label
5113 self._loaded_paths, self.configs = set(), []
5114
5115 def init(self, args=None, filename=None):
5116 assert not self.__initialized
5117 if filename:
5118 location = os.path.realpath(filename)
5119 if location in self._loaded_paths:
5120 return False
5121 self._loaded_paths.add(location)
5122
5123 self.__initialized = True
5124 self.own_args, self.filename = args, filename
5125 for location in self._parser.parse_args(args)[0].config_locations or []:
5126 location = compat_expanduser(location)
5127 if os.path.isdir(location):
5128 location = os.path.join(location, 'yt-dlp.conf')
5129 if not os.path.exists(location):
5130 self._parser.error(f'config location {location} does not exist')
5131 self.append_config(self.read_file(location), location)
5132 return True
5133
5134 def __str__(self):
5135 label = join_nonempty(
5136 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5137 delim=' ')
5138 return join_nonempty(
5139 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5140 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5141 delim='\n')
5142
5143 @staticmethod
5144 def read_file(filename, default=[]):
5145 try:
5146 optionf = open(filename)
5147 except IOError:
5148 return default # silently skip if file is not present
5149 try:
5150 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5151 contents = optionf.read()
5152 if sys.version_info < (3,):
5153 contents = contents.decode(preferredencoding())
5154 res = compat_shlex_split(contents, comments=True)
5155 finally:
5156 optionf.close()
5157 return res
5158
5159 @staticmethod
5160 def hide_login_info(opts):
5161 PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
5162 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5163
5164 def _scrub_eq(o):
5165 m = eqre.match(o)
5166 if m:
5167 return m.group('key') + '=PRIVATE'
5168 else:
5169 return o
5170
5171 opts = list(map(_scrub_eq, opts))
5172 for idx, opt in enumerate(opts):
5173 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5174 opts[idx + 1] = 'PRIVATE'
5175 return opts
5176
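# Editor's note, an illustrative example: credentials are scrubbed both in
# '--opt=value' and in '--opt value' form:
#
#     >>> Config.hide_login_info(['-u', 'me@example.com', '--password=hunter2'])
#     ['-u', 'PRIVATE', '--password=PRIVATE']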
5177 def append_config(self, *args, label=None):
5178 config = type(self)(self._parser, label)
5179 config._loaded_paths = self._loaded_paths
5180 if config.init(*args):
5181 self.configs.append(config)
5182
5183 @property
5184 def all_args(self):
5185 for config in reversed(self.configs):
5186 yield from config.all_args
5187 yield from self.own_args or []
5188
5189 def parse_args(self):
5190 return self._parser.parse_args(list(self.all_args))