#!/usr/bin/env python3
# coding: utf-8

from __future__ import unicode_literals

import asyncio
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.utils
import email.header
import errno
import functools
import gzip
import hashlib
import hmac
import importlib.util
import io
import itertools
import json
import locale
import math
import operator
import os
import platform
import random
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import traceback
import xml.etree.ElementTree
import zlib
import mimetypes

from .compat import (
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_HTTPError,
    compat_basestring,
    compat_chr,
    compat_cookiejar,
    compat_ctypes_WINFUNCTYPE,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_integer_types,
    compat_numeric_types,
    compat_kwargs,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_split,
    compat_shlex_quote,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_urlunparse,
    compat_urllib_parse_quote,
    compat_urllib_parse_quote_plus,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_websockets,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non-ASCII characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        try:
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        except OSError:
            pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


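# Illustrative sketch of the default/fatal semantics above, using an assumed document:
#   >>> doc = compat_etree_fromstring('<root><a>hello</a></root>')
#   >>> xpath_text(doc, './a')
#   'hello'
#   >>> xpath_text(doc, './b', default='n/a')  # missing element, non-fatal
#   'n/a'

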
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_html_by_id(id, html):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = r'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
        ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )


class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')


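# Illustrative sketch (assumed input); nested same-name tags are balanced by the parser:
#   >>> get_element_text_and_html_by_tag('div', '<div><div>inner</div></div>')
#   ('<div>inner</div>', '<div><div>inner</div></div>')

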
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of their attribute dictionaries"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()


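# Illustrative sketch of the cleanup above (assumed input):
#   >>> clean_html('<p>Hello&amp;<br/>world </p>')
#   'Hello&\nworld'

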
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = locked_file(filename, open_mode, block=False).open()
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = locked_file(filename, open_mode, block=False).open()
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


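# Illustrative sketch (assumed RFC 2822 input):
#   >>> timeconvert('Wed, 14 Feb 2018 07:00:00 GMT')
#   1518591600

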
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return ' '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    if s == '':
        return ''
    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result


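# Illustrative sketch of the substitutions above (assumed inputs):
#   >>> sanitize_filename('foo: bar', restricted=True)
#   'foo_-_bar'
#   >>> sanitize_filename('12:34')  # timestamps keep their digits, ':' becomes '_'
#   '12_34'

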
def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
        if sys.version_info < (2, 7) and not drive_or_unc:
            drive_or_unc, _ = os.path.splitunc(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


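# Illustrative sketch (takes effect on Windows only; elsewhere the path is
# returned unchanged unless force=True):
#   >>> sanitize_path(r'C:\foo:bar\baz.')  # on win32
#   'C:\\foo#bar\\baz#'

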
def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url


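# Illustrative sketch of the fixups above:
#   >>> sanitize_url('//cdn.example.com/video.mp4')
#   'http://cdn.example.com/video.mp4'
#   >>> sanitize_url('rmtp://media.example.com/live')
#   'rtmp://media.example.com/live'

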
def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
    return url, 'Basic ' + auth_payload.decode('utf-8')


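# Illustrative sketch (assumed input): credentials are stripped from the netloc
# and returned as a ready-made Authorization header value.
#   >>> extract_basic_auth('http://user:pass@example.com/feed')
#   ('http://example.com/feed', 'Basic dXNlcjpwYXNz')

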
def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)


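# Illustrative sketch of the entity handling above:
#   >>> unescapeHTML('&amp; &#38; &#x26;')
#   '& & &'
#   >>> unescapeHTML('&eacute;')
#   'é'

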
def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        return process_communicate_or_kill(self, *args, **kwargs)


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret


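# Illustrative sketch (assumed inputs):
#   >>> formatSeconds(3661)
#   '1:01:01'
#   >>> formatSeconds(61.5, msec=True)
#   '1:01.500'

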
def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        try:
            ssl_context.load_verify_locations(cadata=cert)
        except ssl.SSLError:
            pass


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                # Create a new context to discard any certificates that were already loaded
                context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
                context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp , '
           'filling out the "Broken site" issue template properly. '
           'Confirm you are on the latest version using -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super(ExtractorError, self).__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            self.msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super(GeoRestrictedError, self).__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


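# Illustrative sketch: the internal marker header suppresses compression and is
# itself removed before the request is sent.
#   >>> handle_youtubedl_headers({'Youtubedl-no-compression': 'True', 'Accept-Encoding': 'gzip'})
#   {}

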
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters; however, this is not
        # always respected by websites: some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around the aforementioned issue we will replace the request's original URL with
        # a percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

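    # Each stored line maps onto _CookieFileEntry; fields are tab-separated
    # (illustrative values):
    #   .example.com  TRUE  /  FALSE  1893456000  token  abc123
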
    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure:
                    secure = 'TRUE'
                else:
                    secure = 'FALSE'
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                else:
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                else:
                    expires = ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either the `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]), so we need to force the latter to be recognized as
        # session cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when a user does not check the 'Remember me' check box
        # while logging in on a site, some important cookies are stored as
        # session cookies, so that not recognizing them will result in a failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on the next HTTP request in a row if there are non-ASCII
        # characters in the Set-Cookie HTTP header of the last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode the Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # On python 2 urlh.geturl() may sometimes return the redirect URL
        # as a byte string instead of unicode. This workaround forces
        # it to always return unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True)


1707 def extract_timezone(date_str):
1708 m = re.search(
1709 r'''(?x)
1710 ^.{8,}? # >=8 char non-TZ prefix, if present
1711 (?P<tz>Z| # just the UTC Z, or
1712 (?:(?<=.\b\d{4}|\b\d{2}:\d\d)| # preceded by 4 digits or hh:mm or
1713 (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
1714 [ ]? # optional space
1715 (?P<sign>\+|-) # +/-
1716 (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}) # hh[:]mm
1717 $)
1718 ''', date_str)
1719 if not m:
1720 timezone = datetime.timedelta()
1721 else:
1722 date_str = date_str[:-len(m.group('tz'))]
1723 if not m.group('sign'):
1724 timezone = datetime.timedelta()
1725 else:
1726 sign = 1 if m.group('sign') == '+' else -1
1727 timezone = datetime.timedelta(
1728 hours=sign * int(m.group('hours')),
1729 minutes=sign * int(m.group('minutes')))
1730 return timezone, date_str
1731
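# Illustrative usage (doctest-style sketch, not executed): the offset is split
# off the string and returned as a timedelta, e.g.
#   >>> extract_timezone('2022-01-01T12:00:00+01:00')
#   (datetime.timedelta(seconds=3600), '2022-01-01T12:00:00')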
1732
1733 def parse_iso8601(date_str, delimiter='T', timezone=None):
1734 """ Return a UNIX timestamp from the given date """
1735
1736 if date_str is None:
1737 return None
1738
1739 date_str = re.sub(r'\.[0-9]+', '', date_str)
1740
1741 if timezone is None:
1742 timezone, date_str = extract_timezone(date_str)
1743
1744 try:
1745 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1746 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1747 return calendar.timegm(dt.timetuple())
1748 except ValueError:
1749 pass
1750
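# Illustrative usage (not executed): fractional seconds are stripped and the
# offset from extract_timezone() is applied before converting, e.g.
#   >>> parse_iso8601('2022-01-01T12:00:00+01:00')
#   1641034800  # i.e. 2022-01-01 11:00:00 UTC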
1751
1752 def date_formats(day_first=True):
1753 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1754
1755
1756 def unified_strdate(date_str, day_first=True):
1757 """Return a string with the date in the format YYYYMMDD"""
1758
1759 if date_str is None:
1760 return None
1761 upload_date = None
1762 # Replace commas
1763 date_str = date_str.replace(',', ' ')
1764 # Remove AM/PM + timezone
1765 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1766 _, date_str = extract_timezone(date_str)
1767
1768 for expression in date_formats(day_first):
1769 try:
1770 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1771 except ValueError:
1772 pass
1773 if upload_date is None:
1774 timetuple = email.utils.parsedate_tz(date_str)
1775 if timetuple:
1776 try:
1777 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1778 except ValueError:
1779 pass
1780 if upload_date is not None:
1781 return compat_str(upload_date)
1782
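# Illustrative usage (not executed): commas, AM/PM markers and timezones are
# stripped before trying the known date formats, e.g.
#   >>> unified_strdate('December 21, 2010')
#   '20101221'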
1783
1784 def unified_timestamp(date_str, day_first=True):
1785 if date_str is None:
1786 return None
1787
1788 date_str = re.sub(r'[,|]', '', date_str)
1789
1790 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1791 timezone, date_str = extract_timezone(date_str)
1792
1793 # Remove AM/PM + timezone
1794 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1795
1796 # Remove unrecognized timezones from ISO 8601 alike timestamps
1797 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1798 if m:
1799 date_str = date_str[:-len(m.group('tz'))]
1800
1801 # Python only supports microseconds, so remove nanoseconds
1802 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1803 if m:
1804 date_str = m.group(1)
1805
1806 for expression in date_formats(day_first):
1807 try:
1808 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1809 return calendar.timegm(dt.timetuple())
1810 except ValueError:
1811 pass
1812 timetuple = email.utils.parsedate_tz(date_str)
1813 if timetuple:
1814 return calendar.timegm(timetuple) + pm_delta * 3600
1815
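# Illustrative usage (not executed): like unified_strdate(), but returns a
# UNIX timestamp instead of a YYYYMMDD string, e.g.
#   >>> unified_timestamp('December 21, 2010')
#   1292889600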
1816
1817 def determine_ext(url, default_ext='unknown_video'):
1818 if url is None or '.' not in url:
1819 return default_ext
1820 guess = url.partition('?')[0].rpartition('.')[2]
1821 if re.match(r'^[A-Za-z0-9]+$', guess):
1822 return guess
1823 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1824 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1825 return guess.rstrip('/')
1826 else:
1827 return default_ext
1828
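# Illustrative usage (not executed): the query string is discarded, and a
# trailing slash is tolerated for known extensions, e.g.
#   >>> determine_ext('http://example.com/foo/bar.mp4/?download')
#   'mp4'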
1829
1830 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1831 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1832
1833
1834 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1835 """
1836 Return a datetime object from a string in the format YYYYMMDD or
1837 (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1838
1839 format: string date format used to return datetime object from
1840 precision: round the time portion of a datetime object.
1841 auto|microsecond|second|minute|hour|day.
1842 auto: round to the unit provided in date_str (if applicable).
1843 """
1844 auto_precision = False
1845 if precision == 'auto':
1846 auto_precision = True
1847 precision = 'microsecond'
1848 today = datetime_round(datetime.datetime.utcnow(), precision)
1849 if date_str in ('now', 'today'):
1850 return today
1851 if date_str == 'yesterday':
1852 return today - datetime.timedelta(days=1)
1853 match = re.match(
1854 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
1855 date_str)
1856 if match is not None:
1857 start_time = datetime_from_str(match.group('start'), precision, format)
1858 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1859 unit = match.group('unit')
1860 if unit == 'month' or unit == 'year':
1861 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1862 unit = 'day'
1863 else:
1864 if unit == 'week':
1865 unit = 'day'
1866 time *= 7
1867 delta = datetime.timedelta(**{unit + 's': time})
1868 new_date = start_time + delta
1869 if auto_precision:
1870 return datetime_round(new_date, unit)
1871 return new_date
1872
1873 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1874
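# Illustrative usage (a sketch, not executed; the first result depends on the
# current time and on the half-up rounding of datetime_round below):
#   >>> datetime_from_str('now-1week')  # one week ago, rounded to day precision
#   >>> datetime_from_str('20220101+1month', precision='day')
#   datetime.datetime(2022, 2, 1, 0, 0)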
1875
1876 def date_from_str(date_str, format='%Y%m%d', strict=False):
1877 """
1878 Return a datetime object from a string in the format YYYYMMDD or
1879 (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1880
1881 If "strict", only (now|today)[+-][0-9](day|week|month|year)(s)? is allowed
1882
1883 format: string date format used to return datetime object from
1884 """
1885 if strict and not re.fullmatch(r'\d{8}|(now|today)[+-]\d+(day|week|month|year)(s)?', date_str):
1886 raise ValueError(f'Invalid date format {date_str}')
1887 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1888
1889
1890 def datetime_add_months(dt, months):
1891 """Increment/Decrement a datetime object by months."""
1892 month = dt.month + months - 1
1893 year = dt.year + month // 12
1894 month = month % 12 + 1
1895 day = min(dt.day, calendar.monthrange(year, month)[1])
1896 return dt.replace(year, month, day)
1897
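# Illustrative usage (not executed): the day is clamped to the target month,
# so adding one month to Jan 31 of a leap year yields Feb 29, e.g.
#   >>> datetime_add_months(datetime.datetime(2020, 1, 31), 1)
#   datetime.datetime(2020, 2, 29, 0, 0)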
1898
1899 def datetime_round(dt, precision='day'):
1900 """
1901 Round a datetime object's time to a specific precision
1902 """
1903 if precision == 'microsecond':
1904 return dt
1905
1906 unit_seconds = {
1907 'day': 86400,
1908 'hour': 3600,
1909 'minute': 60,
1910 'second': 1,
1911 }
1912 roundto = lambda x, n: ((x + n / 2) // n) * n
1913 timestamp = calendar.timegm(dt.timetuple())
1914 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1915
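# Illustrative usage (not executed): rounding is half-up on the UNIX
# timestamp, so noon rounds forward to the next day, e.g.
#   >>> datetime_round(datetime.datetime(2020, 1, 1, 12), precision='day')
#   datetime.datetime(2020, 1, 2, 0, 0)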
1916
1917 def hyphenate_date(date_str):
1918 """
1919 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1920 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1921 if match is not None:
1922 return '-'.join(match.groups())
1923 else:
1924 return date_str
1925
1926
1927 class DateRange(object):
1928 """Represents a time interval between two dates"""
1929
1930 def __init__(self, start=None, end=None):
1931 """start and end must be strings in the format accepted by date"""
1932 if start is not None:
1933 self.start = date_from_str(start, strict=True)
1934 else:
1935 self.start = datetime.datetime.min.date()
1936 if end is not None:
1937 self.end = date_from_str(end, strict=True)
1938 else:
1939 self.end = datetime.datetime.max.date()
1940 if self.start > self.end:
1941 raise ValueError('Date range: "%s"; the start date must be before the end date' % self)
1942
1943 @classmethod
1944 def day(cls, day):
1945 """Returns a range that only contains the given day"""
1946 return cls(day, day)
1947
1948 def __contains__(self, date):
1949 """Check if the date is in the range"""
1950 if not isinstance(date, datetime.date):
1951 date = date_from_str(date)
1952 return self.start <= date <= self.end
1953
1954 def __str__(self):
1955 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1956
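# Illustrative usage (not executed):
#   >>> '20220115' in DateRange('20220101', '20220131')
#   True
#   >>> '20220115' in DateRange.day('20220101')
#   False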
1957
1958 def platform_name():
1959 """ Returns the platform name as a compat_str """
1960 res = platform.platform()
1961 if isinstance(res, bytes):
1962 res = res.decode(preferredencoding())
1963
1964 assert isinstance(res, compat_str)
1965 return res
1966
1967
1968 def get_windows_version():
1969 ''' Get Windows version. None if it's not running on Windows '''
1970 if compat_os_name == 'nt':
1971 return version_tuple(platform.win32_ver()[1])
1972 else:
1973 return None
1974
1975
1976 def _windows_write_string(s, out):
1977 """ Returns True if the string was written using special methods,
1978 False if it has yet to be written out."""
1979 # Adapted from http://stackoverflow.com/a/3259271/35070
1980
1981 import ctypes.wintypes
1982
1983 WIN_OUTPUT_IDS = {
1984 1: -11,
1985 2: -12,
1986 }
1987
1988 try:
1989 fileno = out.fileno()
1990 except AttributeError:
1991 # If the output stream doesn't have a fileno, it's virtual
1992 return False
1993 except io.UnsupportedOperation:
1994 # Some strange Windows pseudo files?
1995 return False
1996 if fileno not in WIN_OUTPUT_IDS:
1997 return False
1998
1999 GetStdHandle = compat_ctypes_WINFUNCTYPE(
2000 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
2001 ('GetStdHandle', ctypes.windll.kernel32))
2002 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
2003
2004 WriteConsoleW = compat_ctypes_WINFUNCTYPE(
2005 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
2006 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
2007 ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
2008 written = ctypes.wintypes.DWORD(0)
2009
2010 GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
2011 FILE_TYPE_CHAR = 0x0002
2012 FILE_TYPE_REMOTE = 0x8000
2013 GetConsoleMode = compat_ctypes_WINFUNCTYPE(
2014 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
2015 ctypes.POINTER(ctypes.wintypes.DWORD))(
2016 ('GetConsoleMode', ctypes.windll.kernel32))
2017 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
2018
2019 def not_a_console(handle):
2020 if handle == INVALID_HANDLE_VALUE or handle is None:
2021 return True
2022 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
2023 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
2024
2025 if not_a_console(h):
2026 return False
2027
2028 def next_nonbmp_pos(s):
2029 try:
2030 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
2031 except StopIteration:
2032 return len(s)
2033
2034 while s:
2035 count = min(next_nonbmp_pos(s), 1024)
2036
2037 ret = WriteConsoleW(
2038 h, s, count if count else 2, ctypes.byref(written), None)
2039 if ret == 0:
2040 raise OSError('Failed to write string')
2041 if not count: # We just wrote a non-BMP character
2042 assert written.value == 2
2043 s = s[1:]
2044 else:
2045 assert written.value > 0
2046 s = s[written.value:]
2047 return True
2048
2049
2050 def write_string(s, out=None, encoding=None):
2051 if out is None:
2052 out = sys.stderr
2053 assert type(s) == compat_str
2054
2055 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
2056 if _windows_write_string(s, out):
2057 return
2058
2059 if ('b' in getattr(out, 'mode', '')
2060 or sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
2061 byt = s.encode(encoding or preferredencoding(), 'ignore')
2062 out.write(byt)
2063 elif hasattr(out, 'buffer'):
2064 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2065 byt = s.encode(enc, 'ignore')
2066 out.buffer.write(byt)
2067 else:
2068 out.write(s)
2069 out.flush()
2070
2071
2072 def bytes_to_intlist(bs):
2073 if not bs:
2074 return []
2075 if isinstance(bs[0], int): # Python 3
2076 return list(bs)
2077 else:
2078 return [ord(c) for c in bs]
2079
2080
2081 def intlist_to_bytes(xs):
2082 if not xs:
2083 return b''
2084 return compat_struct_pack('%dB' % len(xs), *xs)
2085
2086
2087 # Cross-platform file locking
2088 if sys.platform == 'win32':
2089 import ctypes.wintypes
2090 import msvcrt
2091
2092 class OVERLAPPED(ctypes.Structure):
2093 _fields_ = [
2094 ('Internal', ctypes.wintypes.LPVOID),
2095 ('InternalHigh', ctypes.wintypes.LPVOID),
2096 ('Offset', ctypes.wintypes.DWORD),
2097 ('OffsetHigh', ctypes.wintypes.DWORD),
2098 ('hEvent', ctypes.wintypes.HANDLE),
2099 ]
2100
2101 kernel32 = ctypes.windll.kernel32
2102 LockFileEx = kernel32.LockFileEx
2103 LockFileEx.argtypes = [
2104 ctypes.wintypes.HANDLE, # hFile
2105 ctypes.wintypes.DWORD, # dwFlags
2106 ctypes.wintypes.DWORD, # dwReserved
2107 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2108 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2109 ctypes.POINTER(OVERLAPPED) # Overlapped
2110 ]
2111 LockFileEx.restype = ctypes.wintypes.BOOL
2112 UnlockFileEx = kernel32.UnlockFileEx
2113 UnlockFileEx.argtypes = [
2114 ctypes.wintypes.HANDLE, # hFile
2115 ctypes.wintypes.DWORD, # dwReserved
2116 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2117 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2118 ctypes.POINTER(OVERLAPPED) # Overlapped
2119 ]
2120 UnlockFileEx.restype = ctypes.wintypes.BOOL
2121 whole_low = 0xffffffff
2122 whole_high = 0x7fffffff
2123
2124 def _lock_file(f, exclusive, block): # todo: block unused on win32
2125 overlapped = OVERLAPPED()
2126 overlapped.Offset = 0
2127 overlapped.OffsetHigh = 0
2128 overlapped.hEvent = 0
2129 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2130 handle = msvcrt.get_osfhandle(f.fileno())
2131 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
2132 whole_low, whole_high, f._lock_file_overlapped_p):
2133 raise OSError('Locking file failed: %r' % ctypes.FormatError())
2134
2135 def _unlock_file(f):
2136 assert f._lock_file_overlapped_p
2137 handle = msvcrt.get_osfhandle(f.fileno())
2138 if not UnlockFileEx(handle, 0,
2139 whole_low, whole_high, f._lock_file_overlapped_p):
2140 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2141
2142 else:
2143 # Some platforms, such as Jython, are missing fcntl
2144 try:
2145 import fcntl
2146
2147 def _lock_file(f, exclusive, block):
2148 fcntl.flock(f,
2149 fcntl.LOCK_SH if not exclusive
2150 else fcntl.LOCK_EX if block
2151 else fcntl.LOCK_EX | fcntl.LOCK_NB)
2152
2153 def _unlock_file(f):
2154 fcntl.flock(f, fcntl.LOCK_UN)
2155
2156 except ImportError:
2157 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
2158
2159 def _lock_file(f, exclusive, block):
2160 raise IOError(UNSUPPORTED_MSG)
2161
2162 def _unlock_file(f):
2163 raise IOError(UNSUPPORTED_MSG)
2164
2165
2166 class locked_file(object):
2167 def __init__(self, filename, mode, block=True, encoding=None):
2168 assert mode in ['r', 'rb', 'a', 'ab', 'w', 'wb']
2169 self.f = io.open(filename, mode, encoding=encoding)
2170 self.mode = mode
2171 self.block = block
2172
2173 def __enter__(self):
2174 exclusive = 'r' not in self.mode
2175 try:
2176 _lock_file(self.f, exclusive, self.block)
2177 except IOError:
2178 self.f.close()
2179 raise
2180 return self
2181
2182 def __exit__(self, etype, value, traceback):
2183 try:
2184 _unlock_file(self.f)
2185 finally:
2186 self.f.close()
2187
2188 def __iter__(self):
2189 return iter(self.f)
2190
2191 def write(self, *args):
2192 return self.f.write(*args)
2193
2194 def read(self, *args):
2195 return self.f.read(*args)
2196
2197 def flush(self):
2198 self.f.flush()
2199
2200 def open(self):
2201 return self.__enter__()
2202
2203 def close(self, *args):
2204 self.__exit__(None, None, None)
2205
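# Illustrative usage (a sketch, not executed; 'progress.txt' is a hypothetical
# file): the lock is acquired on __enter__ and released, together with the
# file, on __exit__, e.g.
#   with locked_file('progress.txt', 'a') as f:
#       f.write('...')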
2206
2207 def get_filesystem_encoding():
2208 encoding = sys.getfilesystemencoding()
2209 return encoding if encoding is not None else 'utf-8'
2210
2211
2212 def shell_quote(args):
2213 quoted_args = []
2214 encoding = get_filesystem_encoding()
2215 for a in args:
2216 if isinstance(a, bytes):
2217 # We may get a filename encoded with 'encodeFilename'
2218 a = a.decode(encoding)
2219 quoted_args.append(compat_shlex_quote(a))
2220 return ' '.join(quoted_args)
2221
2222
2223 def smuggle_url(url, data):
2224 """ Pass additional data in a URL for internal use. """
2225
2226 url, idata = unsmuggle_url(url, {})
2227 data.update(idata)
2228 sdata = compat_urllib_parse_urlencode(
2229 {'__youtubedl_smuggle': json.dumps(data)})
2230 return url + '#' + sdata
2231
2232
2233 def unsmuggle_url(smug_url, default=None):
2234 if '#__youtubedl_smuggle' not in smug_url:
2235 return smug_url, default
2236 url, _, sdata = smug_url.rpartition('#')
2237 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2238 data = json.loads(jsond)
2239 return url, data
2240
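# Illustrative round-trip (not executed):
#   >>> url = smuggle_url('http://example.com/video', {'referer': 'http://example.com'})
#   >>> unsmuggle_url(url)
#   ('http://example.com/video', {'referer': 'http://example.com'})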
2241
2242 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2243 """ Formats numbers with decimal sufixes like K, M, etc """
2244 num, factor = float_or_none(num), float(factor)
2245 if num is None:
2246 return None
2247 exponent = 0 if num == 0 else int(math.log(num, factor))
2248 suffix = ['', *'kMGTPEZY'][exponent]
2249 if factor == 1024:
2250 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2251 converted = num / (factor ** exponent)
2252 return fmt % (converted, suffix)
2253
2254
2255 def format_bytes(bytes):
2256 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2257
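# Illustrative usage (not executed): factor=1000 gives SI suffixes and
# factor=1024 binary ones, e.g.
#   >>> format_decimal_suffix(12300, '%.1f%s')
#   '12.3k'
#   >>> format_bytes(1048576)
#   '1.00MiB'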
2258
2259 def lookup_unit_table(unit_table, s):
2260 units_re = '|'.join(re.escape(u) for u in unit_table)
2261 m = re.match(
2262 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2263 if not m:
2264 return None
2265 num_str = m.group('num').replace(',', '.')
2266 mult = unit_table[m.group('unit')]
2267 return int(float(num_str) * mult)
2268
2269
2270 def parse_filesize(s):
2271 if s is None:
2272 return None
2273
2274 # The lower-case forms are of course incorrect and unofficial,
2275 # but we support those too
2276 _UNIT_TABLE = {
2277 'B': 1,
2278 'b': 1,
2279 'bytes': 1,
2280 'KiB': 1024,
2281 'KB': 1000,
2282 'kB': 1024,
2283 'Kb': 1000,
2284 'kb': 1000,
2285 'kilobytes': 1000,
2286 'kibibytes': 1024,
2287 'MiB': 1024 ** 2,
2288 'MB': 1000 ** 2,
2289 'mB': 1024 ** 2,
2290 'Mb': 1000 ** 2,
2291 'mb': 1000 ** 2,
2292 'megabytes': 1000 ** 2,
2293 'mebibytes': 1024 ** 2,
2294 'GiB': 1024 ** 3,
2295 'GB': 1000 ** 3,
2296 'gB': 1024 ** 3,
2297 'Gb': 1000 ** 3,
2298 'gb': 1000 ** 3,
2299 'gigabytes': 1000 ** 3,
2300 'gibibytes': 1024 ** 3,
2301 'TiB': 1024 ** 4,
2302 'TB': 1000 ** 4,
2303 'tB': 1024 ** 4,
2304 'Tb': 1000 ** 4,
2305 'tb': 1000 ** 4,
2306 'terabytes': 1000 ** 4,
2307 'tebibytes': 1024 ** 4,
2308 'PiB': 1024 ** 5,
2309 'PB': 1000 ** 5,
2310 'pB': 1024 ** 5,
2311 'Pb': 1000 ** 5,
2312 'pb': 1000 ** 5,
2313 'petabytes': 1000 ** 5,
2314 'pebibytes': 1024 ** 5,
2315 'EiB': 1024 ** 6,
2316 'EB': 1000 ** 6,
2317 'eB': 1024 ** 6,
2318 'Eb': 1000 ** 6,
2319 'eb': 1000 ** 6,
2320 'exabytes': 1000 ** 6,
2321 'exbibytes': 1024 ** 6,
2322 'ZiB': 1024 ** 7,
2323 'ZB': 1000 ** 7,
2324 'zB': 1024 ** 7,
2325 'Zb': 1000 ** 7,
2326 'zb': 1000 ** 7,
2327 'zettabytes': 1000 ** 7,
2328 'zebibytes': 1024 ** 7,
2329 'YiB': 1024 ** 8,
2330 'YB': 1000 ** 8,
2331 'yB': 1024 ** 8,
2332 'Yb': 1000 ** 8,
2333 'yb': 1000 ** 8,
2334 'yottabytes': 1000 ** 8,
2335 'yobibytes': 1024 ** 8,
2336 }
2337
2338 return lookup_unit_table(_UNIT_TABLE, s)
2339
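# Illustrative usage (not executed): both SI and binary units are accepted,
# and ',' is treated as a decimal separator, e.g.
#   >>> parse_filesize('1.5GB')
#   1500000000
#   >>> parse_filesize('5 MiB')
#   5242880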
2340
2341 def parse_count(s):
2342 if s is None:
2343 return None
2344
2345 s = re.sub(r'^[^\d]+\s', '', s).strip()
2346
2347 if re.match(r'^[\d,.]+$', s):
2348 return str_to_int(s)
2349
2350 _UNIT_TABLE = {
2351 'k': 1000,
2352 'K': 1000,
2353 'm': 1000 ** 2,
2354 'M': 1000 ** 2,
2355 'kk': 1000 ** 2,
2356 'KK': 1000 ** 2,
2357 'b': 1000 ** 3,
2358 'B': 1000 ** 3,
2359 }
2360
2361 ret = lookup_unit_table(_UNIT_TABLE, s)
2362 if ret is not None:
2363 return ret
2364
2365 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2366 if mobj:
2367 return str_to_int(mobj.group(1))
2368
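# Illustrative usage (not executed):
#   >>> parse_count('1,234')
#   1234
#   >>> parse_count('1.5M')
#   1500000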
2369
2370 def parse_resolution(s):
2371 if s is None:
2372 return {}
2373
2374 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2375 if mobj:
2376 return {
2377 'width': int(mobj.group('w')),
2378 'height': int(mobj.group('h')),
2379 }
2380
2381 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2382 if mobj:
2383 return {'height': int(mobj.group(1))}
2384
2385 mobj = re.search(r'\b([48])[kK]\b', s)
2386 if mobj:
2387 return {'height': int(mobj.group(1)) * 540}
2388
2389 return {}
2390
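# Illustrative usage (not executed):
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('720p')
#   {'height': 720}
#   >>> parse_resolution('4k')
#   {'height': 2160}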
2391
2392 def parse_bitrate(s):
2393 if not isinstance(s, compat_str):
2394 return
2395 mobj = re.search(r'\b(\d+)\s*kbps', s)
2396 if mobj:
2397 return int(mobj.group(1))
2398
2399
2400 def month_by_name(name, lang='en'):
2401 """ Return the number of a month by (locale-independently) English name """
2402
2403 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2404
2405 try:
2406 return month_names.index(name) + 1
2407 except ValueError:
2408 return None
2409
2410
2411 def month_by_abbreviation(abbrev):
2412 """ Return the number of a month by (locale-independently) English
2413 abbreviations """
2414
2415 try:
2416 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2417 except ValueError:
2418 return None
2419
2420
2421 def fix_xml_ampersands(xml_str):
2422 """Replace all the '&' by '&amp;' in XML"""
2423 return re.sub(
2424 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2425 '&amp;',
2426 xml_str)
2427
2428
2429 def setproctitle(title):
2430 assert isinstance(title, compat_str)
2431
2432 # ctypes in Jython is not complete
2433 # http://bugs.jython.org/issue2148
2434 if sys.platform.startswith('java'):
2435 return
2436
2437 try:
2438 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2439 except OSError:
2440 return
2441 except TypeError:
2442 # LoadLibrary in Windows Python 2.7.13 only expects
2443 # a bytestring, but since unicode_literals turns
2444 # every string into a unicode string, it fails.
2445 return
2446 title_bytes = title.encode('utf-8')
2447 buf = ctypes.create_string_buffer(len(title_bytes))
2448 buf.value = title_bytes
2449 try:
2450 libc.prctl(15, buf, 0, 0, 0)
2451 except AttributeError:
2452 return # Strange libc, just skip this
2453
2454
2455 def remove_start(s, start):
2456 return s[len(start):] if s is not None and s.startswith(start) else s
2457
2458
2459 def remove_end(s, end):
2460 return s[:-len(end)] if s is not None and s.endswith(end) else s
2461
2462
2463 def remove_quotes(s):
2464 if s is None or len(s) < 2:
2465 return s
2466 for quote in ('"', "'", ):
2467 if s[0] == quote and s[-1] == quote:
2468 return s[1:-1]
2469 return s
2470
2471
2472 def get_domain(url):
2473 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2474 return domain.group('domain') if domain else None
2475
2476
2477 def url_basename(url):
2478 path = compat_urlparse.urlparse(url).path
2479 return path.strip('/').split('/')[-1]
2480
2481
2482 def base_url(url):
2483 return re.match(r'https?://[^?#&]+/', url).group()
2484
2485
2486 def urljoin(base, path):
2487 if isinstance(path, bytes):
2488 path = path.decode('utf-8')
2489 if not isinstance(path, compat_str) or not path:
2490 return None
2491 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2492 return path
2493 if isinstance(base, bytes):
2494 base = base.decode('utf-8')
2495 if not isinstance(base, compat_str) or not re.match(
2496 r'^(?:https?:)?//', base):
2497 return None
2498 return compat_urlparse.urljoin(base, path)
2499
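# Illustrative usage (not executed): relative paths are resolved against the
# base, while absolute and scheme-relative URLs are returned unchanged, e.g.
#   >>> urljoin('https://example.com/a/', 'b/c.mp4')
#   'https://example.com/a/b/c.mp4'
#   >>> urljoin('https://example.com/a/', '//cdn.example.com/c.mp4')
#   '//cdn.example.com/c.mp4'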
2500
2501 class HEADRequest(compat_urllib_request.Request):
2502 def get_method(self):
2503 return 'HEAD'
2504
2505
2506 class PUTRequest(compat_urllib_request.Request):
2507 def get_method(self):
2508 return 'PUT'
2509
2510
2511 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2512 if get_attr and v is not None:
2513 v = getattr(v, get_attr, None)
2514 try:
2515 return int(v) * invscale // scale
2516 except (ValueError, TypeError, OverflowError):
2517 return default
2518
2519
2520 def str_or_none(v, default=None):
2521 return default if v is None else compat_str(v)
2522
2523
2524 def str_to_int(int_str):
2525 """ A more relaxed version of int_or_none """
2526 if isinstance(int_str, compat_integer_types):
2527 return int_str
2528 elif isinstance(int_str, compat_str):
2529 int_str = re.sub(r'[,\.\+]', '', int_str)
2530 return int_or_none(int_str)
2531
2532
2533 def float_or_none(v, scale=1, invscale=1, default=None):
2534 if v is None:
2535 return default
2536 try:
2537 return float(v) * invscale / scale
2538 except (ValueError, TypeError):
2539 return default
2540
2541
2542 def bool_or_none(v, default=None):
2543 return v if isinstance(v, bool) else default
2544
2545
2546 def strip_or_none(v, default=None):
2547 return v.strip() if isinstance(v, compat_str) else default
2548
2549
2550 def url_or_none(url):
2551 if not url or not isinstance(url, compat_str):
2552 return None
2553 url = url.strip()
2554 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2555
2556
2557 def strftime_or_none(timestamp, date_format, default=None):
2558 datetime_object = None
2559 try:
2560 if isinstance(timestamp, compat_numeric_types): # unix timestamp
2561 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2562 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2563 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2564 return datetime_object.strftime(date_format)
2565 except (ValueError, TypeError, AttributeError):
2566 return default
2567
2568
2569 def parse_duration(s):
2570 if not isinstance(s, compat_basestring):
2571 return None
2572 s = s.strip()
2573 if not s:
2574 return None
2575
2576 days, hours, mins, secs, ms = [None] * 5
2577 m = re.match(r'''(?x)
2578 (?P<before_secs>
2579 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2580 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2581 (?P<ms>[.:][0-9]+)?Z?$
2582 ''', s)
2583 if m:
2584 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2585 else:
2586 m = re.match(
2587 r'''(?ix)(?:P?
2588 (?:
2589 [0-9]+\s*y(?:ears?)?\s*
2590 )?
2591 (?:
2592 [0-9]+\s*m(?:onths?)?\s*
2593 )?
2594 (?:
2595 [0-9]+\s*w(?:eeks?)?\s*
2596 )?
2597 (?:
2598 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
2599 )?
2600 T)?
2601 (?:
2602 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
2603 )?
2604 (?:
2605 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
2606 )?
2607 (?:
2608 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2609 )?Z?$''', s)
2610 if m:
2611 days, hours, mins, secs, ms = m.groups()
2612 else:
2613 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2614 if m:
2615 hours, mins = m.groups()
2616 else:
2617 return None
2618
2619 duration = 0
2620 if secs:
2621 duration += float(secs)
2622 if mins:
2623 duration += float(mins) * 60
2624 if hours:
2625 duration += float(hours) * 60 * 60
2626 if days:
2627 duration += float(days) * 24 * 60 * 60
2628 if ms:
2629 duration += float(ms.replace(':', '.'))
2630 return duration
2631
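# Illustrative usage (not executed): colon-separated, verbose and ISO 8601
# style durations are all understood, e.g.
#   >>> parse_duration('1:23:45')
#   5025.0
#   >>> parse_duration('2h 30m')
#   9000.0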
2632
2633 def prepend_extension(filename, ext, expected_real_ext=None):
2634 name, real_ext = os.path.splitext(filename)
2635 return (
2636 '{0}.{1}{2}'.format(name, ext, real_ext)
2637 if not expected_real_ext or real_ext[1:] == expected_real_ext
2638 else '{0}.{1}'.format(filename, ext))
2639
2640
2641 def replace_extension(filename, ext, expected_real_ext=None):
2642 name, real_ext = os.path.splitext(filename)
2643 return '{0}.{1}'.format(
2644 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2645 ext)
2646
2647
2648 def check_executable(exe, args=[]):
2649 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2650 args can be a list of arguments for a short output (like -version) """
2651 try:
2652 Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
2653 except OSError:
2654 return False
2655 return exe
2656
2657
2658 def _get_exe_version_output(exe, args):
2659 try:
2660 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2661 # SIGTTOU if yt-dlp is run in the background.
2662 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2663 out, _ = Popen(
2664 [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2665 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
2666 except OSError:
2667 return False
2668 if isinstance(out, bytes): # Python 2.x
2669 out = out.decode('ascii', 'ignore')
2670 return out
2671
2672
2673 def detect_exe_version(output, version_re=None, unrecognized='present'):
2674 assert isinstance(output, compat_str)
2675 if version_re is None:
2676 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2677 m = re.search(version_re, output)
2678 if m:
2679 return m.group(1)
2680 else:
2681 return unrecognized
2682
2683
2684 def get_exe_version(exe, args=['--version'],
2685 version_re=None, unrecognized='present'):
2686 """ Returns the version of the specified executable,
2687 or False if the executable is not present """
2688 out = _get_exe_version_output(exe, args)
2689 return detect_exe_version(out, version_re, unrecognized) if out else False
2690
2691
2692 class LazyList(collections.abc.Sequence):
2693 ''' Lazy immutable list from an iterable
2694 Note that slices of a LazyList are lists and not LazyList'''
2695
2696 class IndexError(IndexError):
2697 pass
2698
2699 def __init__(self, iterable, *, reverse=False, _cache=None):
2700 self.__iterable = iter(iterable)
2701 self.__cache = [] if _cache is None else _cache
2702 self.__reversed = reverse
2703
2704 def __iter__(self):
2705 if self.__reversed:
2706 # We need to consume the entire iterable to iterate in reverse
2707 yield from self.exhaust()
2708 return
2709 yield from self.__cache
2710 for item in self.__iterable:
2711 self.__cache.append(item)
2712 yield item
2713
2714 def __exhaust(self):
2715 self.__cache.extend(self.__iterable)
2716 # Discard the emptied iterable to make it pickle-able
2717 self.__iterable = []
2718 return self.__cache
2719
2720 def exhaust(self):
2721 ''' Evaluate the entire iterable '''
2722 return self.__exhaust()[::-1 if self.__reversed else 1]
2723
2724 @staticmethod
2725 def __reverse_index(x):
2726 return None if x is None else -(x + 1)
2727
2728 def __getitem__(self, idx):
2729 if isinstance(idx, slice):
2730 if self.__reversed:
2731 idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
2732 start, stop, step = idx.start, idx.stop, idx.step or 1
2733 elif isinstance(idx, int):
2734 if self.__reversed:
2735 idx = self.__reverse_index(idx)
2736 start, stop, step = idx, idx, 0
2737 else:
2738 raise TypeError('indices must be integers or slices')
2739 if ((start or 0) < 0 or (stop or 0) < 0
2740 or (start is None and step < 0)
2741 or (stop is None and step > 0)):
2742 # We need to consume the entire iterable to be able to slice from the end
2743 # Obviously, never use this with infinite iterables
2744 self.__exhaust()
2745 try:
2746 return self.__cache[idx]
2747 except IndexError as e:
2748 raise self.IndexError(e) from e
2749 n = max(start or 0, stop or 0) - len(self.__cache) + 1
2750 if n > 0:
2751 self.__cache.extend(itertools.islice(self.__iterable, n))
2752 try:
2753 return self.__cache[idx]
2754 except IndexError as e:
2755 raise self.IndexError(e) from e
2756
2757 def __bool__(self):
2758 try:
2759 self[-1] if self.__reversed else self[0]
2760 except self.IndexError:
2761 return False
2762 return True
2763
2764 def __len__(self):
2765 self.__exhaust()
2766 return len(self.__cache)
2767
2768 def __reversed__(self):
2769 return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)
2770
2771 def __copy__(self):
2772 return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)
2773
2774 def __repr__(self):
2775 # repr and str should mimic a list. So we exhaust the iterable
2776 return repr(self.exhaust())
2777
2778 def __str__(self):
2779 return repr(self.exhaust())
2780
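# Illustrative usage (not executed): items are pulled from the iterable only
# as needed, so even an infinite iterator can be sliced, e.g.
#   >>> LazyList(itertools.count())[:5]
#   [0, 1, 2, 3, 4]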
2781
2782 class PagedList:
2783
2784 class IndexError(IndexError):
2785 pass
2786
2787 def __len__(self):
2788 # This is only useful for tests
2789 return len(self.getslice())
2790
2791 def __init__(self, pagefunc, pagesize, use_cache=True):
2792 self._pagefunc = pagefunc
2793 self._pagesize = pagesize
2794 self._use_cache = use_cache
2795 self._cache = {}
2796
2797 def getpage(self, pagenum):
2798 page_results = self._cache.get(pagenum)
2799 if page_results is None:
2800 page_results = list(self._pagefunc(pagenum))
2801 if self._use_cache:
2802 self._cache[pagenum] = page_results
2803 return page_results
2804
2805 def getslice(self, start=0, end=None):
2806 return list(self._getslice(start, end))
2807
2808 def _getslice(self, start, end):
2809 raise NotImplementedError('This method must be implemented by subclasses')
2810
2811 def __getitem__(self, idx):
2812 # NOTE: cache must be enabled if this is used
2813 if not isinstance(idx, int) or idx < 0:
2814 raise TypeError('indices must be non-negative integers')
2815 entries = self.getslice(idx, idx + 1)
2816 if not entries:
2817 raise self.IndexError()
2818 return entries[0]
2819
2820
2821 class OnDemandPagedList(PagedList):
2822 def _getslice(self, start, end):
2823 for pagenum in itertools.count(start // self._pagesize):
2824 firstid = pagenum * self._pagesize
2825 nextfirstid = pagenum * self._pagesize + self._pagesize
2826 if start >= nextfirstid:
2827 continue
2828
2829 startv = (
2830 start % self._pagesize
2831 if firstid <= start < nextfirstid
2832 else 0)
2833 endv = (
2834 ((end - 1) % self._pagesize) + 1
2835 if (end is not None and firstid <= end <= nextfirstid)
2836 else None)
2837
2838 page_results = self.getpage(pagenum)
2839 if startv != 0 or endv is not None:
2840 page_results = page_results[startv:endv]
2841 yield from page_results
2842
2843 # A little optimization - if the current page is not "full", i.e. does
2844 # not contain page_size videos, then we can assume that this page
2845 # is the last one - there are no more ids on further pages -
2846 # i.e. no need to query again.
2847 if len(page_results) + startv < self._pagesize:
2848 break
2849
2850 # If we got the whole page, but the next page is not interesting,
2851 # break out early as well
2852 if end == nextfirstid:
2853 break
2854
2855
2856 class InAdvancePagedList(PagedList):
2857 def __init__(self, pagefunc, pagecount, pagesize):
2858 self._pagecount = pagecount
2859 PagedList.__init__(self, pagefunc, pagesize, True)
2860
2861 def _getslice(self, start, end):
2862 start_page = start // self._pagesize
2863 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2864 skip_elems = start - start_page * self._pagesize
2865 only_more = None if end is None else end - start
2866 for pagenum in range(start_page, end_page):
2867 page_results = self.getpage(pagenum)
2868 if skip_elems:
2869 page_results = page_results[skip_elems:]
2870 skip_elems = None
2871 if only_more is not None:
2872 if len(page_results) < only_more:
2873 only_more -= len(page_results)
2874 else:
2875 yield from page_results[:only_more]
2876 break
2877 yield from page_results
2878
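# Illustrative usage of the PagedList family (a sketch, not executed; the page
# function below is hypothetical): pages of two items each are fetched on
# demand and only as far as the requested slice needs, e.g.
#   >>> pl = OnDemandPagedList(lambda n: range(n * 2, n * 2 + 2), 2)
#   >>> pl.getslice(1, 4)
#   [1, 2, 3]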
2879
2880 def uppercase_escape(s):
2881 unicode_escape = codecs.getdecoder('unicode_escape')
2882 return re.sub(
2883 r'\\U[0-9a-fA-F]{8}',
2884 lambda m: unicode_escape(m.group(0))[0],
2885 s)
2886
2887
2888 def lowercase_escape(s):
2889 unicode_escape = codecs.getdecoder('unicode_escape')
2890 return re.sub(
2891 r'\\u[0-9a-fA-F]{4}',
2892 lambda m: unicode_escape(m.group(0))[0],
2893 s)
2894
2895
2896 def escape_rfc3986(s):
2897 """Escape non-ASCII characters as suggested by RFC 3986"""
2898 if sys.version_info < (3, 0) and isinstance(s, compat_str):
2899 s = s.encode('utf-8')
2900 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2901
2902
2903 def escape_url(url):
2904 """Escape URL as suggested by RFC 3986"""
2905 url_parsed = compat_urllib_parse_urlparse(url)
2906 return url_parsed._replace(
2907 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2908 path=escape_rfc3986(url_parsed.path),
2909 params=escape_rfc3986(url_parsed.params),
2910 query=escape_rfc3986(url_parsed.query),
2911 fragment=escape_rfc3986(url_parsed.fragment)
2912 ).geturl()
2913
2914
2915 def parse_qs(url):
2916 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2917
2918
2919 def read_batch_urls(batch_fd):
2920 def fixup(url):
2921 if not isinstance(url, compat_str):
2922 url = url.decode('utf-8', 'replace')
2923 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2924 for bom in BOM_UTF8:
2925 if url.startswith(bom):
2926 url = url[len(bom):]
2927 url = url.lstrip()
2928 if not url or url.startswith(('#', ';', ']')):
2929 return False
2930 # "#" cannot be stripped out since it is part of the URI
2931 # However, it can be safely stripped out if it follows a whitespace
2932 return re.split(r'\s#', url, 1)[0].rstrip()
2933
2934 with contextlib.closing(batch_fd) as fd:
2935 return [url for url in map(fixup, fd) if url]
2936
2937
2938 def urlencode_postdata(*args, **kargs):
2939 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2940
2941
2942 def update_url_query(url, query):
2943 if not query:
2944 return url
2945 parsed_url = compat_urlparse.urlparse(url)
2946 qs = compat_parse_qs(parsed_url.query)
2947 qs.update(query)
2948 return compat_urlparse.urlunparse(parsed_url._replace(
2949 query=compat_urllib_parse_urlencode(qs, True)))
2950
2951
2952 def update_Request(req, url=None, data=None, headers={}, query={}):
2953 req_headers = req.headers.copy()
2954 req_headers.update(headers)
2955 req_data = data or req.data
2956 req_url = update_url_query(url or req.get_full_url(), query)
2957 req_get_method = req.get_method()
2958 if req_get_method == 'HEAD':
2959 req_type = HEADRequest
2960 elif req_get_method == 'PUT':
2961 req_type = PUTRequest
2962 else:
2963 req_type = compat_urllib_request.Request
2964 new_req = req_type(
2965 req_url, data=req_data, headers=req_headers,
2966 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2967 if hasattr(req, 'timeout'):
2968 new_req.timeout = req.timeout
2969 return new_req
2970
2971
2972 def _multipart_encode_impl(data, boundary):
2973 content_type = 'multipart/form-data; boundary=%s' % boundary
2974
2975 out = b''
2976 for k, v in data.items():
2977 out += b'--' + boundary.encode('ascii') + b'\r\n'
2978 if isinstance(k, compat_str):
2979 k = k.encode('utf-8')
2980 if isinstance(v, compat_str):
2981 v = v.encode('utf-8')
2982 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2983 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
2984 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
2985 if boundary.encode('ascii') in content:
2986 raise ValueError('Boundary overlaps with data')
2987 out += content
2988
2989 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2990
2991 return out, content_type
2992
2993
2994 def multipart_encode(data, boundary=None):
2995 '''
2996 Encode a dict to RFC 7578-compliant form-data
2997
2998 data:
2999 A dict where keys and values can be either Unicode or bytes-like
3000 objects.
3001 boundary:
3002 If specified, it must be a Unicode object to be used as the boundary.
3003 Otherwise a random boundary is generated.
3004
3005 Reference: https://tools.ietf.org/html/rfc7578
3006 '''
3007 has_specified_boundary = boundary is not None
3008
3009 while True:
3010 if boundary is None:
3011 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3012
3013 try:
3014 out, content_type = _multipart_encode_impl(data, boundary)
3015 break
3016 except ValueError:
3017 if has_specified_boundary:
3018 raise
3019 boundary = None
3020
3021 return out, content_type
3022
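# Illustrative usage (not executed; the boundary 'xxx' is chosen for
# readability, a random one is generated by default):
#   >>> out, ct = multipart_encode({'field': 'value'}, boundary='xxx')
#   >>> ct
#   'multipart/form-data; boundary=xxx'
#   >>> out
#   b'--xxx\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--xxx--\r\n'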
3023
3024 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3025 if isinstance(key_or_keys, (list, tuple)):
3026 for key in key_or_keys:
3027 if key not in d or d[key] is None or skip_false_values and not d[key]:
3028 continue
3029 return d[key]
3030 return default
3031 return d.get(key_or_keys, default)
3032
3033
3034 def try_get(src, getter, expected_type=None):
3035 for get in variadic(getter):
3036 try:
3037 v = get(src)
3038 except (AttributeError, KeyError, TypeError, IndexError):
3039 pass
3040 else:
3041 if expected_type is None or isinstance(v, expected_type):
3042 return v
3043
3044
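# Illustrative usage (not executed):
#   >>> dict_get({'a': None, 'b': 2}, ('a', 'b'))
#   2
#   >>> try_get({'a': [{'b': 42}]}, lambda x: x['a'][0]['b'], int)
#   42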
3045 def merge_dicts(*dicts):
3046 merged = {}
3047 for a_dict in dicts:
3048 for k, v in a_dict.items():
3049 if v is None:
3050 continue
3051 if (k not in merged
3052 or (isinstance(v, compat_str) and v
3053 and isinstance(merged[k], compat_str)
3054 and not merged[k])):
3055 merged[k] = v
3056 return merged
3057
3058
3059 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3060 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3061
3062
3063 US_RATINGS = {
3064 'G': 0,
3065 'PG': 10,
3066 'PG-13': 13,
3067 'R': 16,
3068 'NC': 18,
3069 }
3070
3071
3072 TV_PARENTAL_GUIDELINES = {
3073 'TV-Y': 0,
3074 'TV-Y7': 7,
3075 'TV-G': 0,
3076 'TV-PG': 0,
3077 'TV-14': 14,
3078 'TV-MA': 17,
3079 }
3080
3081
3082 def parse_age_limit(s):
3083 if type(s) == int:
3084 return s if 0 <= s <= 21 else None
3085 if not isinstance(s, compat_basestring):
3086 return None
3087 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3088 if m:
3089 return int(m.group('age'))
3090 s = s.upper()
3091 if s in US_RATINGS:
3092 return US_RATINGS[s]
3093 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3094 if m:
3095 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3096 return None
3097
3098
3099 def strip_jsonp(code):
3100 return re.sub(
3101 r'''(?sx)^
3102 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3103 (?:\s*&&\s*(?P=func_name))?
3104 \s*\(\s*(?P<callback_data>.*)\);?
3105 \s*?(?://[^\n]*)*$''',
3106 r'\g<callback_data>', code)
3107
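# Illustrative usage (not executed):
#   >>> strip_jsonp('callback({"id": 42});')
#   '{"id": 42}'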
3108
3109 def js_to_json(code, vars={}):
3110 # vars is a dict of var, val pairs to substitute
3111 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3112 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
3113 INTEGER_TABLE = (
3114 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
3115 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
3116 )
3117
3118 def fix_kv(m):
3119 v = m.group(0)
3120 if v in ('true', 'false', 'null'):
3121 return v
3122 elif v in ('undefined', 'void 0'):
3123 return 'null'
3124 elif v.startswith(('/*', '//', '!')) or v == ',':
3125 return ""
3126
3127 if v[0] in ("'", '"'):
3128 v = re.sub(r'(?s)\\.|"', lambda m: {
3129 '"': '\\"',
3130 "\\'": "'",
3131 '\\\n': '',
3132 '\\x': '\\u00',
3133 }.get(m.group(0), m.group(0)), v[1:-1])
3134 else:
3135 for regex, base in INTEGER_TABLE:
3136 im = re.match(regex, v)
3137 if im:
3138 i = int(im.group(1), base)
3139 return '"%d":' % i if v.endswith(':') else '%d' % i
3140
3141 if v in vars:
3142 return vars[v]
3143
3144 return '"%s"' % v
3145
3146 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3147
3148 return re.sub(r'''(?sx)
3149 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3150 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3151 {comment}|,(?={skip}[\]}}])|
3152 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3153 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3154 [0-9]+(?={skip}:)|
3155 !+
3156 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3157
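# Illustrative usage (not executed): unquoted keys, single quotes and
# hex/octal integers are converted to valid JSON, e.g.
#   >>> js_to_json("{abc: 'def', ghi: 0x1A}")
#   '{"abc": "def", "ghi": 26}'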
3158
3159 def qualities(quality_ids):
3160 """ Get a numeric quality value out of a list of possible values """
3161 def q(qid):
3162 try:
3163 return quality_ids.index(qid)
3164 except ValueError:
3165 return -1
3166 return q
3167
3168
3169 POSTPROCESS_WHEN = {'pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
3170
3171
3172 DEFAULT_OUTTMPL = {
3173 'default': '%(title)s [%(id)s].%(ext)s',
3174 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3175 }
3176 OUTTMPL_TYPES = {
3177 'chapter': None,
3178 'subtitle': None,
3179 'thumbnail': None,
3180 'description': 'description',
3181 'annotation': 'annotations.xml',
3182 'infojson': 'info.json',
3183 'link': None,
3184 'pl_video': None,
3185 'pl_thumbnail': None,
3186 'pl_description': 'description',
3187 'pl_infojson': 'info.json',
3188 }
3189
3190 # As of [1], the format syntax is:
3191 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3192 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3193 STR_FORMAT_RE_TMPL = r'''(?x)
3194 (?<!%)(?P<prefix>(?:%%)*)
3195 %
3196 (?P<has_key>\((?P<key>{0})\))?
3197 (?P<format>
3198 (?P<conversion>[#0\-+ ]+)?
3199 (?P<min_width>\d+)?
3200 (?P<precision>\.\d+)?
3201 (?P<len_mod>[hlL])? # unused in python
3202 {1} # conversion type
3203 )
3204 '''
3205
3206
3207 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3208
3209
3210 def limit_length(s, length):
3211 """ Add ellipses to overly long strings """
3212 if s is None:
3213 return None
3214 ELLIPSES = '...'
3215 if len(s) > length:
3216 return s[:length - len(ELLIPSES)] + ELLIPSES
3217 return s
3218
3219
3220 def version_tuple(v):
3221 return tuple(int(e) for e in re.split(r'[-.]', v))
3222
3223
3224 def is_outdated_version(version, limit, assume_new=True):
3225 if not version:
3226 return not assume_new
3227 try:
3228 return version_tuple(version) < version_tuple(limit)
3229 except ValueError:
3230 return not assume_new
3231
3232
3233 def ytdl_is_updateable():
3234 """ Returns if yt-dlp can be updated with -U """
3235
3236 from .update import is_non_updateable
3237
3238 return not is_non_updateable()
3239
3240
3241 def args_to_str(args):
3242 # Get a short string representation for a subprocess command
3243 return ' '.join(compat_shlex_quote(a) for a in args)
3244
3245
3246 def error_to_compat_str(err):
3247 err_str = str(err)
3248 # On python 2 error byte string must be decoded with proper
3249 # encoding rather than ascii
3250 if sys.version_info[0] < 3:
3251 err_str = err_str.decode(preferredencoding())
3252 return err_str
3253
3254
3255 def mimetype2ext(mt):
3256 if mt is None:
3257 return None
3258
3259 mt, _, params = mt.partition(';')
3260 mt = mt.strip()
3261
3262 FULL_MAP = {
3263 'audio/mp4': 'm4a',
3264 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3 here as
3265 # it's the most popular one
3266 'audio/mpeg': 'mp3',
3267 'audio/x-wav': 'wav',
3268 'audio/wav': 'wav',
3269 'audio/wave': 'wav',
3270 }
3271
3272 ext = FULL_MAP.get(mt)
3273 if ext is not None:
3274 return ext
3275
3276 SUBTYPE_MAP = {
3277 '3gpp': '3gp',
3278 'smptett+xml': 'tt',
3279 'ttaf+xml': 'dfxp',
3280 'ttml+xml': 'ttml',
3281 'x-flv': 'flv',
3282 'x-mp4-fragmented': 'mp4',
3283 'x-ms-sami': 'sami',
3284 'x-ms-wmv': 'wmv',
3285 'mpegurl': 'm3u8',
3286 'x-mpegurl': 'm3u8',
3287 'vnd.apple.mpegurl': 'm3u8',
3288 'dash+xml': 'mpd',
3289 'f4m+xml': 'f4m',
3290 'hds+xml': 'f4m',
3291 'vnd.ms-sstr+xml': 'ism',
3292 'quicktime': 'mov',
3293 'mp2t': 'ts',
3294 'x-wav': 'wav',
3295 'filmstrip+json': 'fs',
3296 'svg+xml': 'svg',
3297 }
3298
3299 _, _, subtype = mt.rpartition('/')
3300 ext = SUBTYPE_MAP.get(subtype.lower())
3301 if ext is not None:
3302 return ext
3303
3304 SUFFIX_MAP = {
3305 'json': 'json',
3306 'xml': 'xml',
3307 'zip': 'zip',
3308 'gzip': 'gz',
3309 }
3310
3311 _, _, suffix = subtype.partition('+')
3312 ext = SUFFIX_MAP.get(suffix)
3313 if ext is not None:
3314 return ext
3315
3316 return subtype.replace('+', '.')
3317
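# Illustrative usage (not executed): parameters are stripped and the maps
# above are consulted in order, falling back to the bare subtype, e.g.
#   >>> mimetype2ext('audio/mp4')
#   'm4a'
#   >>> mimetype2ext('application/dash+xml')
#   'mpd'
#   >>> mimetype2ext('video/mp4; codecs="avc1.64001f"')
#   'mp4'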
3318
3319 def ext2mimetype(ext_or_url):
3320 if not ext_or_url:
3321 return None
3322 if '.' not in ext_or_url:
3323 ext_or_url = f'file.{ext_or_url}'
3324 return mimetypes.guess_type(ext_or_url)[0]
3325
3326
3327 def parse_codecs(codecs_str):
3328 # http://tools.ietf.org/html/rfc6381
3329 if not codecs_str:
3330 return {}
3331 split_codecs = list(filter(None, map(
3332 str.strip, codecs_str.strip().strip(',').split(','))))
3333 vcodec, acodec, tcodec, hdr = None, None, None, None
3334 for full_codec in split_codecs:
3335 parts = full_codec.split('.')
3336 codec = parts[0].replace('0', '')
3337 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3338 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3339 if not vcodec:
3340 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3341 if codec in ('dvh1', 'dvhe'):
3342 hdr = 'DV'
3343 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3344 hdr = 'HDR10'
3345 elif full_codec.replace('0', '').startswith('vp9.2'):
3346 hdr = 'HDR10'
3347 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3348 if not acodec:
3349 acodec = full_codec
3350 elif codec in ('stpp', 'wvtt',):
3351 if not tcodec:
3352 tcodec = full_codec
3353 else:
3354 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
3355 if vcodec or acodec or tcodec:
3356 return {
3357 'vcodec': vcodec or 'none',
3358 'acodec': acodec or 'none',
3359 'dynamic_range': hdr,
3360 **({'tcodec': tcodec} if tcodec is not None else {}),
3361 }
3362 elif len(split_codecs) == 2:
3363 return {
3364 'vcodec': split_codecs[0],
3365 'acodec': split_codecs[1],
3366 }
3367 return {}
3368
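# Illustrative usage (not executed):
#   >>> parse_codecs('avc1.64001f, mp4a.40.2')
#   {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}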
3369
3370 def urlhandle_detect_ext(url_handle):
3371 getheader = url_handle.headers.get
3372
3373 cd = getheader('Content-Disposition')
3374 if cd:
3375 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3376 if m:
3377 e = determine_ext(m.group('filename'), default_ext=None)
3378 if e:
3379 return e
3380
3381 return mimetype2ext(getheader('Content-Type'))
3382
3383
3384 def encode_data_uri(data, mime_type):
3385 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3386
3387
3388 def age_restricted(content_limit, age_limit):
3389 """ Returns True iff the content should be blocked """
3390
3391 if age_limit is None: # No limit set
3392 return False
3393 if content_limit is None:
3394 return False # Content available for everyone
3395 return age_limit < content_limit
3396
3397
3398 def is_html(first_bytes):
3399 """ Detect whether a file contains HTML by examining its first bytes. """
3400
3401 BOMS = [
3402 (b'\xef\xbb\xbf', 'utf-8'),
3403 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3404 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3405 (b'\xff\xfe', 'utf-16-le'),
3406 (b'\xfe\xff', 'utf-16-be'),
3407 ]
3408 for bom, enc in BOMS:
3409 if first_bytes.startswith(bom):
3410 s = first_bytes[len(bom):].decode(enc, 'replace')
3411 break
3412 else:
3413 s = first_bytes.decode('utf-8', 'replace')
3414
3415 return re.match(r'^\s*<', s)
3416
3417
3418 def determine_protocol(info_dict):
3419 protocol = info_dict.get('protocol')
3420 if protocol is not None:
3421 return protocol
3422
3423 url = sanitize_url(info_dict['url'])
3424 if url.startswith('rtmp'):
3425 return 'rtmp'
3426 elif url.startswith('mms'):
3427 return 'mms'
3428 elif url.startswith('rtsp'):
3429 return 'rtsp'
3430
3431 ext = determine_ext(url)
3432 if ext == 'm3u8':
3433 return 'm3u8'
3434 elif ext == 'f4m':
3435 return 'f4m'
3436
3437 return compat_urllib_parse_urlparse(url).scheme
3438
3439
3440 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3441 """ Render a list of rows, each as a list of values.
3442 Text after a \t will be right aligned """
3443 def width(string):
3444 return len(remove_terminal_sequences(string).replace('\t', ''))
3445
3446 def get_max_lens(table):
3447 return [max(width(str(v)) for v in col) for col in zip(*table)]
3448
3449 def filter_using_list(row, filterArray):
3450 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3451
3452 max_lens = get_max_lens(data) if hide_empty else []
3453 header_row = filter_using_list(header_row, max_lens)
3454 data = [filter_using_list(row, max_lens) for row in data]
3455
3456 table = [header_row] + data
3457 max_lens = get_max_lens(table)
3458 extra_gap += 1
3459 if delim:
3460 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3461 table[1][-1] = table[1][-1][:-extra_gap] # Remove extra_gap from end of delimiter
3462 for row in table:
3463 for pos, text in enumerate(map(str, row)):
3464 if '\t' in text:
3465 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3466 else:
3467 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3468 ret = '\n'.join(''.join(row).rstrip() for row in table)
3469 return ret
3470
3471
3472 def _match_one(filter_part, dct, incomplete):
3473 # TODO: Generalize code with YoutubeDL._build_format_filter
3474 STRING_OPERATORS = {
3475 '*=': operator.contains,
3476 '^=': lambda attr, value: attr.startswith(value),
3477 '$=': lambda attr, value: attr.endswith(value),
3478 '~=': lambda attr, value: re.search(value, attr),
3479 }
3480 COMPARISON_OPERATORS = {
3481 **STRING_OPERATORS,
3482 '<=': operator.le, # "<=" must be defined above "<"
3483 '<': operator.lt,
3484 '>=': operator.ge,
3485 '>': operator.gt,
3486 '=': operator.eq,
3487 }
3488
3489 operator_rex = re.compile(r'''(?x)\s*
3490 (?P<key>[a-z_]+)
3491 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3492 (?:
3493 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3494 (?P<strval>.+?)
3495 )
3496 \s*$
3497 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3498 m = operator_rex.search(filter_part)
3499 if m:
3500 m = m.groupdict()
3501 unnegated_op = COMPARISON_OPERATORS[m['op']]
3502 if m['negation']:
3503 op = lambda attr, value: not unnegated_op(attr, value)
3504 else:
3505 op = unnegated_op
3506 comparison_value = m['quotedstrval'] or m['strval']  # the regex above defines no 'intval' group
3507 if m['quote']:
3508 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3509 actual_value = dct.get(m['key'])
3510 numeric_comparison = None
3511 if isinstance(actual_value, compat_numeric_types):
3512 # If the original field is a string and the matching comparison value is
3513 # a number, we should respect the origin of the original field
3514 # and process the comparison value as a string (see
3515 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3516 try:
3517 numeric_comparison = int(comparison_value)
3518 except ValueError:
3519 numeric_comparison = parse_filesize(comparison_value)
3520 if numeric_comparison is None:
3521 numeric_comparison = parse_filesize(f'{comparison_value}B')
3522 if numeric_comparison is None:
3523 numeric_comparison = parse_duration(comparison_value)
3524 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3525 raise ValueError('Operator %s only supports string values!' % m['op'])
3526 if actual_value is None:
3527 return incomplete or m['none_inclusive']
3528 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3529
3530 UNARY_OPERATORS = {
3531 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3532 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3533 }
3534 operator_rex = re.compile(r'''(?x)\s*
3535 (?P<op>%s)\s*(?P<key>[a-z_]+)
3536 \s*$
3537 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3538 m = operator_rex.search(filter_part)
3539 if m:
3540 op = UNARY_OPERATORS[m.group('op')]
3541 actual_value = dct.get(m.group('key'))
3542 if incomplete and actual_value is None:
3543 return True
3544 return op(actual_value)
3545
3546 raise ValueError('Invalid filter part %r' % filter_part)
3547
3548
3549 def match_str(filter_str, dct, incomplete=False):
3550 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
3551 When incomplete, all conditions passes on missing fields
3552 """
3553 return all(
3554 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3555 for filter_part in re.split(r'(?<!\\)&', filter_str))
3556
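# Illustrative usage (not executed): '&' separates conditions and '?' makes a
# condition pass when the field is missing, e.g.
#   >>> match_str('duration > 30 & like_count >? 100', {'duration': 60})
#   True
#   >>> match_str('duration > 30 & like_count >? 100', {'duration': 10})
#   False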
3557
3558 def match_filter_func(filter_str):
3559 def _match_func(info_dict, *args, **kwargs):
3560 if match_str(filter_str, info_dict, *args, **kwargs):
3561 return None
3562 else:
3563 video_title = info_dict.get('title', info_dict.get('id', 'video'))
3564 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
3565 return _match_func
3566
3567
3568 def parse_dfxp_time_expr(time_expr):
3569 if not time_expr:
3570 return
3571
3572 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3573 if mobj:
3574 return float(mobj.group('time_offset'))
3575
3576 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3577 if mobj:
3578 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3579
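# Illustrative usage (not executed):
#   >>> parse_dfxp_time_expr('5.2s')
#   5.2
#   >>> parse_dfxp_time_expr('00:01:30.5')
#   90.5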
3580
3581 def srt_subtitles_timecode(seconds):
3582 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3583
3584
3585 def ass_subtitles_timecode(seconds):
3586 time = timetuple_from_msec(seconds * 1000)
3587 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3588
3589
3590 def dfxp2srt(dfxp_data):
3591 '''
3592 @param dfxp_data A bytes-like object containing DFXP data
3593 @returns A unicode object containing converted SRT data
3594 '''
3595 LEGACY_NAMESPACES = (
3596 (b'http://www.w3.org/ns/ttml', [
3597 b'http://www.w3.org/2004/11/ttaf1',
3598 b'http://www.w3.org/2006/04/ttaf1',
3599 b'http://www.w3.org/2006/10/ttaf1',
3600 ]),
3601 (b'http://www.w3.org/ns/ttml#styling', [
3602 b'http://www.w3.org/ns/ttml#style',
3603 ]),
3604 )
3605
3606 SUPPORTED_STYLING = [
3607 'color',
3608 'fontFamily',
3609 'fontSize',
3610 'fontStyle',
3611 'fontWeight',
3612 'textDecoration'
3613 ]
3614
3615 _x = functools.partial(xpath_with_ns, ns_map={
3616 'xml': 'http://www.w3.org/XML/1998/namespace',
3617 'ttml': 'http://www.w3.org/ns/ttml',
3618 'tts': 'http://www.w3.org/ns/ttml#styling',
3619 })
3620
3621 styles = {}
3622 default_style = {}
3623
3624 class TTMLPElementParser(object):
3625 _out = ''
3626 _unclosed_elements = []
3627 _applied_styles = []
3628
3629 def start(self, tag, attrib):
3630 if tag in (_x('ttml:br'), 'br'):
3631 self._out += '\n'
3632 else:
3633 unclosed_elements = []
3634 style = {}
3635 element_style_id = attrib.get('style')
3636 if default_style:
3637 style.update(default_style)
3638 if element_style_id:
3639 style.update(styles.get(element_style_id, {}))
3640 for prop in SUPPORTED_STYLING:
3641 prop_val = attrib.get(_x('tts:' + prop))
3642 if prop_val:
3643 style[prop] = prop_val
3644 if style:
3645 font = ''
3646 for k, v in sorted(style.items()):
3647 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3648 continue
3649 if k == 'color':
3650 font += ' color="%s"' % v
3651 elif k == 'fontSize':
3652 font += ' size="%s"' % v
3653 elif k == 'fontFamily':
3654 font += ' face="%s"' % v
3655 elif k == 'fontWeight' and v == 'bold':
3656 self._out += '<b>'
3657 unclosed_elements.append('b')
3658 elif k == 'fontStyle' and v == 'italic':
3659 self._out += '<i>'
3660 unclosed_elements.append('i')
3661 elif k == 'textDecoration' and v == 'underline':
3662 self._out += '<u>'
3663 unclosed_elements.append('u')
3664 if font:
3665 self._out += '<font' + font + '>'
3666 unclosed_elements.append('font')
3667 applied_style = {}
3668 if self._applied_styles:
3669 applied_style.update(self._applied_styles[-1])
3670 applied_style.update(style)
3671 self._applied_styles.append(applied_style)
3672 self._unclosed_elements.append(unclosed_elements)
3673
3674 def end(self, tag):
3675 if tag not in (_x('ttml:br'), 'br'):
3676 unclosed_elements = self._unclosed_elements.pop()
3677 for element in reversed(unclosed_elements):
3678 self._out += '</%s>' % element
3679 if unclosed_elements and self._applied_styles:
3680 self._applied_styles.pop()
3681
3682 def data(self, data):
3683 self._out += data
3684
3685 def close(self):
3686 return self._out.strip()
3687
3688 def parse_node(node):
3689 target = TTMLPElementParser()
3690 parser = xml.etree.ElementTree.XMLParser(target=target)
3691 parser.feed(xml.etree.ElementTree.tostring(node))
3692 return parser.close()
3693
3694 for k, v in LEGACY_NAMESPACES:
3695 for ns in v:
3696 dfxp_data = dfxp_data.replace(ns, k)
3697
3698 dfxp = compat_etree_fromstring(dfxp_data)
3699 out = []
3700 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3701
3702 if not paras:
3703 raise ValueError('Invalid dfxp/TTML subtitle')
3704
3705 repeat = False
3706 while True:
3707 for style in dfxp.findall(_x('.//ttml:style')):
3708 style_id = style.get('id') or style.get(_x('xml:id'))
3709 if not style_id:
3710 continue
3711 parent_style_id = style.get('style')
3712 if parent_style_id:
3713 if parent_style_id not in styles:
3714 repeat = True
3715 continue
3716 styles[style_id] = styles[parent_style_id].copy()
3717 for prop in SUPPORTED_STYLING:
3718 prop_val = style.get(_x('tts:' + prop))
3719 if prop_val:
3720 styles.setdefault(style_id, {})[prop] = prop_val
3721 if repeat:
3722 repeat = False
3723 else:
3724 break
3725
3726 for p in ('body', 'div'):
3727 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3728 if ele is None:
3729 continue
3730 style = styles.get(ele.get('style'))
3731 if not style:
3732 continue
3733 default_style.update(style)
3734
3735 for para, index in zip(paras, itertools.count(1)):
3736 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3737 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3738 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3739 if begin_time is None:
3740 continue
3741 if not end_time:
3742 if not dur:
3743 continue
3744 end_time = begin_time + dur
3745 out.append('%d\n%s --> %s\n%s\n\n' % (
3746 index,
3747 srt_subtitles_timecode(begin_time),
3748 srt_subtitles_timecode(end_time),
3749 parse_node(para)))
3750
3751 return ''.join(out)
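# Minimal illustrative conversion (assumed single-cue TTML input):
#   dfxp = (b'<tt xmlns="http://www.w3.org/ns/ttml"><body><div>'
#           b'<p begin="0.0" end="1.0">Hello</p></div></body></tt>')
#   dfxp2srt(dfxp)  # -> '1\n00:00:00,000 --> 00:00:01,000\nHello\n\n'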
3752
3753
3754 def cli_option(params, command_option, param):
3755 param = params.get(param)
3756 if param:
3757 param = compat_str(param)
3758 return [command_option, param] if param is not None else []
3759
3760
3761 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3762 param = params.get(param)
3763 if param is None:
3764 return []
3765 assert isinstance(param, bool)
3766 if separator:
3767 return [command_option + separator + (true_value if param else false_value)]
3768 return [command_option, true_value if param else false_value]
3769
3770
3771 def cli_valueless_option(params, command_option, param, expected_value=True):
3772 param = params.get(param)
3773 return [command_option] if param == expected_value else []
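# Sketches of the cli_* helpers (option names assumed for illustration):
#   cli_option({'proxy': 'socks5://127.0.0.1'}, '--proxy', 'proxy')
#       -> ['--proxy', 'socks5://127.0.0.1']
#   cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#       -> ['--no-check-certificate', 'true']
#   cli_valueless_option({'quiet': True}, '--quiet', 'quiet')
#       -> ['--quiet']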
3774
3775
3776 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3777 if isinstance(argdict, (list, tuple)): # for backward compatibility
3778 if use_compat:
3779 return argdict
3780 else:
3781 argdict = None
3782 if argdict is None:
3783 return default
3784 assert isinstance(argdict, dict)
3785
3786 assert isinstance(keys, (list, tuple))
3787 for key_list in keys:
3788 arg_list = list(filter(
3789 lambda x: x is not None,
3790 [argdict.get(key.lower()) for key in variadic(key_list)]))
3791 if arg_list:
3792 return [arg for args in arg_list for arg in args]
3793 return default
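# Sketch (assumed argdict): the first key group that has a value wins:
#   cli_configuration_args({'ffmpeg': ['-threads', '2']}, ['ffmpeg', 'default'])
#       -> ['-threads', '2']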
3794
3795
3796 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3797 main_key, exe = main_key.lower(), exe.lower()
3798 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3799 keys = [f'{root_key}{k}' for k in (keys or [''])]
3800 if root_key in keys:
3801 if main_key != exe:
3802 keys.append((main_key, exe))
3803 keys.append('default')
3804 else:
3805 use_compat = False
3806 return cli_configuration_args(argdict, keys, default, use_compat)
3807
3808
3809 class ISO639Utils(object):
3810 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3811 _lang_map = {
3812 'aa': 'aar',
3813 'ab': 'abk',
3814 'ae': 'ave',
3815 'af': 'afr',
3816 'ak': 'aka',
3817 'am': 'amh',
3818 'an': 'arg',
3819 'ar': 'ara',
3820 'as': 'asm',
3821 'av': 'ava',
3822 'ay': 'aym',
3823 'az': 'aze',
3824 'ba': 'bak',
3825 'be': 'bel',
3826 'bg': 'bul',
3827 'bh': 'bih',
3828 'bi': 'bis',
3829 'bm': 'bam',
3830 'bn': 'ben',
3831 'bo': 'bod',
3832 'br': 'bre',
3833 'bs': 'bos',
3834 'ca': 'cat',
3835 'ce': 'che',
3836 'ch': 'cha',
3837 'co': 'cos',
3838 'cr': 'cre',
3839 'cs': 'ces',
3840 'cu': 'chu',
3841 'cv': 'chv',
3842 'cy': 'cym',
3843 'da': 'dan',
3844 'de': 'deu',
3845 'dv': 'div',
3846 'dz': 'dzo',
3847 'ee': 'ewe',
3848 'el': 'ell',
3849 'en': 'eng',
3850 'eo': 'epo',
3851 'es': 'spa',
3852 'et': 'est',
3853 'eu': 'eus',
3854 'fa': 'fas',
3855 'ff': 'ful',
3856 'fi': 'fin',
3857 'fj': 'fij',
3858 'fo': 'fao',
3859 'fr': 'fra',
3860 'fy': 'fry',
3861 'ga': 'gle',
3862 'gd': 'gla',
3863 'gl': 'glg',
3864 'gn': 'grn',
3865 'gu': 'guj',
3866 'gv': 'glv',
3867 'ha': 'hau',
3868 'he': 'heb',
3869 'iw': 'heb', # Replaced by he in 1989 revision
3870 'hi': 'hin',
3871 'ho': 'hmo',
3872 'hr': 'hrv',
3873 'ht': 'hat',
3874 'hu': 'hun',
3875 'hy': 'hye',
3876 'hz': 'her',
3877 'ia': 'ina',
3878 'id': 'ind',
3879 'in': 'ind', # Replaced by id in 1989 revision
3880 'ie': 'ile',
3881 'ig': 'ibo',
3882 'ii': 'iii',
3883 'ik': 'ipk',
3884 'io': 'ido',
3885 'is': 'isl',
3886 'it': 'ita',
3887 'iu': 'iku',
3888 'ja': 'jpn',
3889 'jv': 'jav',
3890 'ka': 'kat',
3891 'kg': 'kon',
3892 'ki': 'kik',
3893 'kj': 'kua',
3894 'kk': 'kaz',
3895 'kl': 'kal',
3896 'km': 'khm',
3897 'kn': 'kan',
3898 'ko': 'kor',
3899 'kr': 'kau',
3900 'ks': 'kas',
3901 'ku': 'kur',
3902 'kv': 'kom',
3903 'kw': 'cor',
3904 'ky': 'kir',
3905 'la': 'lat',
3906 'lb': 'ltz',
3907 'lg': 'lug',
3908 'li': 'lim',
3909 'ln': 'lin',
3910 'lo': 'lao',
3911 'lt': 'lit',
3912 'lu': 'lub',
3913 'lv': 'lav',
3914 'mg': 'mlg',
3915 'mh': 'mah',
3916 'mi': 'mri',
3917 'mk': 'mkd',
3918 'ml': 'mal',
3919 'mn': 'mon',
3920 'mr': 'mar',
3921 'ms': 'msa',
3922 'mt': 'mlt',
3923 'my': 'mya',
3924 'na': 'nau',
3925 'nb': 'nob',
3926 'nd': 'nde',
3927 'ne': 'nep',
3928 'ng': 'ndo',
3929 'nl': 'nld',
3930 'nn': 'nno',
3931 'no': 'nor',
3932 'nr': 'nbl',
3933 'nv': 'nav',
3934 'ny': 'nya',
3935 'oc': 'oci',
3936 'oj': 'oji',
3937 'om': 'orm',
3938 'or': 'ori',
3939 'os': 'oss',
3940 'pa': 'pan',
3941 'pi': 'pli',
3942 'pl': 'pol',
3943 'ps': 'pus',
3944 'pt': 'por',
3945 'qu': 'que',
3946 'rm': 'roh',
3947 'rn': 'run',
3948 'ro': 'ron',
3949 'ru': 'rus',
3950 'rw': 'kin',
3951 'sa': 'san',
3952 'sc': 'srd',
3953 'sd': 'snd',
3954 'se': 'sme',
3955 'sg': 'sag',
3956 'si': 'sin',
3957 'sk': 'slk',
3958 'sl': 'slv',
3959 'sm': 'smo',
3960 'sn': 'sna',
3961 'so': 'som',
3962 'sq': 'sqi',
3963 'sr': 'srp',
3964 'ss': 'ssw',
3965 'st': 'sot',
3966 'su': 'sun',
3967 'sv': 'swe',
3968 'sw': 'swa',
3969 'ta': 'tam',
3970 'te': 'tel',
3971 'tg': 'tgk',
3972 'th': 'tha',
3973 'ti': 'tir',
3974 'tk': 'tuk',
3975 'tl': 'tgl',
3976 'tn': 'tsn',
3977 'to': 'ton',
3978 'tr': 'tur',
3979 'ts': 'tso',
3980 'tt': 'tat',
3981 'tw': 'twi',
3982 'ty': 'tah',
3983 'ug': 'uig',
3984 'uk': 'ukr',
3985 'ur': 'urd',
3986 'uz': 'uzb',
3987 've': 'ven',
3988 'vi': 'vie',
3989 'vo': 'vol',
3990 'wa': 'wln',
3991 'wo': 'wol',
3992 'xh': 'xho',
3993 'yi': 'yid',
3994 'ji': 'yid', # Replaced by yi in 1989 revision
3995 'yo': 'yor',
3996 'za': 'zha',
3997 'zh': 'zho',
3998 'zu': 'zul',
3999 }
4000
4001 @classmethod
4002 def short2long(cls, code):
4003 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4004 return cls._lang_map.get(code[:2])
4005
4006 @classmethod
4007 def long2short(cls, code):
4008 """Convert language code from ISO 639-2/T to ISO 639-1"""
4009 for short_name, long_name in cls._lang_map.items():
4010 if long_name == code:
4011 return short_name
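# Illustrative lookups:
#   ISO639Utils.short2long('en')   # -> 'eng'
#   ISO639Utils.long2short('fra')  # -> 'fr'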
4012
4013
4014 class ISO3166Utils(object):
4015 # From http://data.okfn.org/data/core/country-list
4016 _country_map = {
4017 'AF': 'Afghanistan',
4018 'AX': 'Åland Islands',
4019 'AL': 'Albania',
4020 'DZ': 'Algeria',
4021 'AS': 'American Samoa',
4022 'AD': 'Andorra',
4023 'AO': 'Angola',
4024 'AI': 'Anguilla',
4025 'AQ': 'Antarctica',
4026 'AG': 'Antigua and Barbuda',
4027 'AR': 'Argentina',
4028 'AM': 'Armenia',
4029 'AW': 'Aruba',
4030 'AU': 'Australia',
4031 'AT': 'Austria',
4032 'AZ': 'Azerbaijan',
4033 'BS': 'Bahamas',
4034 'BH': 'Bahrain',
4035 'BD': 'Bangladesh',
4036 'BB': 'Barbados',
4037 'BY': 'Belarus',
4038 'BE': 'Belgium',
4039 'BZ': 'Belize',
4040 'BJ': 'Benin',
4041 'BM': 'Bermuda',
4042 'BT': 'Bhutan',
4043 'BO': 'Bolivia, Plurinational State of',
4044 'BQ': 'Bonaire, Sint Eustatius and Saba',
4045 'BA': 'Bosnia and Herzegovina',
4046 'BW': 'Botswana',
4047 'BV': 'Bouvet Island',
4048 'BR': 'Brazil',
4049 'IO': 'British Indian Ocean Territory',
4050 'BN': 'Brunei Darussalam',
4051 'BG': 'Bulgaria',
4052 'BF': 'Burkina Faso',
4053 'BI': 'Burundi',
4054 'KH': 'Cambodia',
4055 'CM': 'Cameroon',
4056 'CA': 'Canada',
4057 'CV': 'Cape Verde',
4058 'KY': 'Cayman Islands',
4059 'CF': 'Central African Republic',
4060 'TD': 'Chad',
4061 'CL': 'Chile',
4062 'CN': 'China',
4063 'CX': 'Christmas Island',
4064 'CC': 'Cocos (Keeling) Islands',
4065 'CO': 'Colombia',
4066 'KM': 'Comoros',
4067 'CG': 'Congo',
4068 'CD': 'Congo, the Democratic Republic of the',
4069 'CK': 'Cook Islands',
4070 'CR': 'Costa Rica',
4071 'CI': 'Côte d\'Ivoire',
4072 'HR': 'Croatia',
4073 'CU': 'Cuba',
4074 'CW': 'Curaçao',
4075 'CY': 'Cyprus',
4076 'CZ': 'Czech Republic',
4077 'DK': 'Denmark',
4078 'DJ': 'Djibouti',
4079 'DM': 'Dominica',
4080 'DO': 'Dominican Republic',
4081 'EC': 'Ecuador',
4082 'EG': 'Egypt',
4083 'SV': 'El Salvador',
4084 'GQ': 'Equatorial Guinea',
4085 'ER': 'Eritrea',
4086 'EE': 'Estonia',
4087 'ET': 'Ethiopia',
4088 'FK': 'Falkland Islands (Malvinas)',
4089 'FO': 'Faroe Islands',
4090 'FJ': 'Fiji',
4091 'FI': 'Finland',
4092 'FR': 'France',
4093 'GF': 'French Guiana',
4094 'PF': 'French Polynesia',
4095 'TF': 'French Southern Territories',
4096 'GA': 'Gabon',
4097 'GM': 'Gambia',
4098 'GE': 'Georgia',
4099 'DE': 'Germany',
4100 'GH': 'Ghana',
4101 'GI': 'Gibraltar',
4102 'GR': 'Greece',
4103 'GL': 'Greenland',
4104 'GD': 'Grenada',
4105 'GP': 'Guadeloupe',
4106 'GU': 'Guam',
4107 'GT': 'Guatemala',
4108 'GG': 'Guernsey',
4109 'GN': 'Guinea',
4110 'GW': 'Guinea-Bissau',
4111 'GY': 'Guyana',
4112 'HT': 'Haiti',
4113 'HM': 'Heard Island and McDonald Islands',
4114 'VA': 'Holy See (Vatican City State)',
4115 'HN': 'Honduras',
4116 'HK': 'Hong Kong',
4117 'HU': 'Hungary',
4118 'IS': 'Iceland',
4119 'IN': 'India',
4120 'ID': 'Indonesia',
4121 'IR': 'Iran, Islamic Republic of',
4122 'IQ': 'Iraq',
4123 'IE': 'Ireland',
4124 'IM': 'Isle of Man',
4125 'IL': 'Israel',
4126 'IT': 'Italy',
4127 'JM': 'Jamaica',
4128 'JP': 'Japan',
4129 'JE': 'Jersey',
4130 'JO': 'Jordan',
4131 'KZ': 'Kazakhstan',
4132 'KE': 'Kenya',
4133 'KI': 'Kiribati',
4134 'KP': 'Korea, Democratic People\'s Republic of',
4135 'KR': 'Korea, Republic of',
4136 'KW': 'Kuwait',
4137 'KG': 'Kyrgyzstan',
4138 'LA': 'Lao People\'s Democratic Republic',
4139 'LV': 'Latvia',
4140 'LB': 'Lebanon',
4141 'LS': 'Lesotho',
4142 'LR': 'Liberia',
4143 'LY': 'Libya',
4144 'LI': 'Liechtenstein',
4145 'LT': 'Lithuania',
4146 'LU': 'Luxembourg',
4147 'MO': 'Macao',
4148 'MK': 'Macedonia, the Former Yugoslav Republic of',
4149 'MG': 'Madagascar',
4150 'MW': 'Malawi',
4151 'MY': 'Malaysia',
4152 'MV': 'Maldives',
4153 'ML': 'Mali',
4154 'MT': 'Malta',
4155 'MH': 'Marshall Islands',
4156 'MQ': 'Martinique',
4157 'MR': 'Mauritania',
4158 'MU': 'Mauritius',
4159 'YT': 'Mayotte',
4160 'MX': 'Mexico',
4161 'FM': 'Micronesia, Federated States of',
4162 'MD': 'Moldova, Republic of',
4163 'MC': 'Monaco',
4164 'MN': 'Mongolia',
4165 'ME': 'Montenegro',
4166 'MS': 'Montserrat',
4167 'MA': 'Morocco',
4168 'MZ': 'Mozambique',
4169 'MM': 'Myanmar',
4170 'NA': 'Namibia',
4171 'NR': 'Nauru',
4172 'NP': 'Nepal',
4173 'NL': 'Netherlands',
4174 'NC': 'New Caledonia',
4175 'NZ': 'New Zealand',
4176 'NI': 'Nicaragua',
4177 'NE': 'Niger',
4178 'NG': 'Nigeria',
4179 'NU': 'Niue',
4180 'NF': 'Norfolk Island',
4181 'MP': 'Northern Mariana Islands',
4182 'NO': 'Norway',
4183 'OM': 'Oman',
4184 'PK': 'Pakistan',
4185 'PW': 'Palau',
4186 'PS': 'Palestine, State of',
4187 'PA': 'Panama',
4188 'PG': 'Papua New Guinea',
4189 'PY': 'Paraguay',
4190 'PE': 'Peru',
4191 'PH': 'Philippines',
4192 'PN': 'Pitcairn',
4193 'PL': 'Poland',
4194 'PT': 'Portugal',
4195 'PR': 'Puerto Rico',
4196 'QA': 'Qatar',
4197 'RE': 'Réunion',
4198 'RO': 'Romania',
4199 'RU': 'Russian Federation',
4200 'RW': 'Rwanda',
4201 'BL': 'Saint Barthélemy',
4202 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4203 'KN': 'Saint Kitts and Nevis',
4204 'LC': 'Saint Lucia',
4205 'MF': 'Saint Martin (French part)',
4206 'PM': 'Saint Pierre and Miquelon',
4207 'VC': 'Saint Vincent and the Grenadines',
4208 'WS': 'Samoa',
4209 'SM': 'San Marino',
4210 'ST': 'Sao Tome and Principe',
4211 'SA': 'Saudi Arabia',
4212 'SN': 'Senegal',
4213 'RS': 'Serbia',
4214 'SC': 'Seychelles',
4215 'SL': 'Sierra Leone',
4216 'SG': 'Singapore',
4217 'SX': 'Sint Maarten (Dutch part)',
4218 'SK': 'Slovakia',
4219 'SI': 'Slovenia',
4220 'SB': 'Solomon Islands',
4221 'SO': 'Somalia',
4222 'ZA': 'South Africa',
4223 'GS': 'South Georgia and the South Sandwich Islands',
4224 'SS': 'South Sudan',
4225 'ES': 'Spain',
4226 'LK': 'Sri Lanka',
4227 'SD': 'Sudan',
4228 'SR': 'Suriname',
4229 'SJ': 'Svalbard and Jan Mayen',
4230 'SZ': 'Swaziland',
4231 'SE': 'Sweden',
4232 'CH': 'Switzerland',
4233 'SY': 'Syrian Arab Republic',
4234 'TW': 'Taiwan, Province of China',
4235 'TJ': 'Tajikistan',
4236 'TZ': 'Tanzania, United Republic of',
4237 'TH': 'Thailand',
4238 'TL': 'Timor-Leste',
4239 'TG': 'Togo',
4240 'TK': 'Tokelau',
4241 'TO': 'Tonga',
4242 'TT': 'Trinidad and Tobago',
4243 'TN': 'Tunisia',
4244 'TR': 'Turkey',
4245 'TM': 'Turkmenistan',
4246 'TC': 'Turks and Caicos Islands',
4247 'TV': 'Tuvalu',
4248 'UG': 'Uganda',
4249 'UA': 'Ukraine',
4250 'AE': 'United Arab Emirates',
4251 'GB': 'United Kingdom',
4252 'US': 'United States',
4253 'UM': 'United States Minor Outlying Islands',
4254 'UY': 'Uruguay',
4255 'UZ': 'Uzbekistan',
4256 'VU': 'Vanuatu',
4257 'VE': 'Venezuela, Bolivarian Republic of',
4258 'VN': 'Viet Nam',
4259 'VG': 'Virgin Islands, British',
4260 'VI': 'Virgin Islands, U.S.',
4261 'WF': 'Wallis and Futuna',
4262 'EH': 'Western Sahara',
4263 'YE': 'Yemen',
4264 'ZM': 'Zambia',
4265 'ZW': 'Zimbabwe',
4266 }
4267
4268 @classmethod
4269 def short2full(cls, code):
4270 """Convert an ISO 3166-2 country code to the corresponding full name"""
4271 return cls._country_map.get(code.upper())
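# Illustrative lookup (input is upper-cased first, so 'de' also works):
#   ISO3166Utils.short2full('DE')  # -> 'Germany'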
4272
4273
4274 class GeoUtils(object):
4275 # Major IPv4 address blocks per country
4276 _country_ip_map = {
4277 'AD': '46.172.224.0/19',
4278 'AE': '94.200.0.0/13',
4279 'AF': '149.54.0.0/17',
4280 'AG': '209.59.64.0/18',
4281 'AI': '204.14.248.0/21',
4282 'AL': '46.99.0.0/16',
4283 'AM': '46.70.0.0/15',
4284 'AO': '105.168.0.0/13',
4285 'AP': '182.50.184.0/21',
4286 'AQ': '23.154.160.0/24',
4287 'AR': '181.0.0.0/12',
4288 'AS': '202.70.112.0/20',
4289 'AT': '77.116.0.0/14',
4290 'AU': '1.128.0.0/11',
4291 'AW': '181.41.0.0/18',
4292 'AX': '185.217.4.0/22',
4293 'AZ': '5.197.0.0/16',
4294 'BA': '31.176.128.0/17',
4295 'BB': '65.48.128.0/17',
4296 'BD': '114.130.0.0/16',
4297 'BE': '57.0.0.0/8',
4298 'BF': '102.178.0.0/15',
4299 'BG': '95.42.0.0/15',
4300 'BH': '37.131.0.0/17',
4301 'BI': '154.117.192.0/18',
4302 'BJ': '137.255.0.0/16',
4303 'BL': '185.212.72.0/23',
4304 'BM': '196.12.64.0/18',
4305 'BN': '156.31.0.0/16',
4306 'BO': '161.56.0.0/16',
4307 'BQ': '161.0.80.0/20',
4308 'BR': '191.128.0.0/12',
4309 'BS': '24.51.64.0/18',
4310 'BT': '119.2.96.0/19',
4311 'BW': '168.167.0.0/16',
4312 'BY': '178.120.0.0/13',
4313 'BZ': '179.42.192.0/18',
4314 'CA': '99.224.0.0/11',
4315 'CD': '41.243.0.0/16',
4316 'CF': '197.242.176.0/21',
4317 'CG': '160.113.0.0/16',
4318 'CH': '85.0.0.0/13',
4319 'CI': '102.136.0.0/14',
4320 'CK': '202.65.32.0/19',
4321 'CL': '152.172.0.0/14',
4322 'CM': '102.244.0.0/14',
4323 'CN': '36.128.0.0/10',
4324 'CO': '181.240.0.0/12',
4325 'CR': '201.192.0.0/12',
4326 'CU': '152.206.0.0/15',
4327 'CV': '165.90.96.0/19',
4328 'CW': '190.88.128.0/17',
4329 'CY': '31.153.0.0/16',
4330 'CZ': '88.100.0.0/14',
4331 'DE': '53.0.0.0/8',
4332 'DJ': '197.241.0.0/17',
4333 'DK': '87.48.0.0/12',
4334 'DM': '192.243.48.0/20',
4335 'DO': '152.166.0.0/15',
4336 'DZ': '41.96.0.0/12',
4337 'EC': '186.68.0.0/15',
4338 'EE': '90.190.0.0/15',
4339 'EG': '156.160.0.0/11',
4340 'ER': '196.200.96.0/20',
4341 'ES': '88.0.0.0/11',
4342 'ET': '196.188.0.0/14',
4343 'EU': '2.16.0.0/13',
4344 'FI': '91.152.0.0/13',
4345 'FJ': '144.120.0.0/16',
4346 'FK': '80.73.208.0/21',
4347 'FM': '119.252.112.0/20',
4348 'FO': '88.85.32.0/19',
4349 'FR': '90.0.0.0/9',
4350 'GA': '41.158.0.0/15',
4351 'GB': '25.0.0.0/8',
4352 'GD': '74.122.88.0/21',
4353 'GE': '31.146.0.0/16',
4354 'GF': '161.22.64.0/18',
4355 'GG': '62.68.160.0/19',
4356 'GH': '154.160.0.0/12',
4357 'GI': '95.164.0.0/16',
4358 'GL': '88.83.0.0/19',
4359 'GM': '160.182.0.0/15',
4360 'GN': '197.149.192.0/18',
4361 'GP': '104.250.0.0/19',
4362 'GQ': '105.235.224.0/20',
4363 'GR': '94.64.0.0/13',
4364 'GT': '168.234.0.0/16',
4365 'GU': '168.123.0.0/16',
4366 'GW': '197.214.80.0/20',
4367 'GY': '181.41.64.0/18',
4368 'HK': '113.252.0.0/14',
4369 'HN': '181.210.0.0/16',
4370 'HR': '93.136.0.0/13',
4371 'HT': '148.102.128.0/17',
4372 'HU': '84.0.0.0/14',
4373 'ID': '39.192.0.0/10',
4374 'IE': '87.32.0.0/12',
4375 'IL': '79.176.0.0/13',
4376 'IM': '5.62.80.0/20',
4377 'IN': '117.192.0.0/10',
4378 'IO': '203.83.48.0/21',
4379 'IQ': '37.236.0.0/14',
4380 'IR': '2.176.0.0/12',
4381 'IS': '82.221.0.0/16',
4382 'IT': '79.0.0.0/10',
4383 'JE': '87.244.64.0/18',
4384 'JM': '72.27.0.0/17',
4385 'JO': '176.29.0.0/16',
4386 'JP': '133.0.0.0/8',
4387 'KE': '105.48.0.0/12',
4388 'KG': '158.181.128.0/17',
4389 'KH': '36.37.128.0/17',
4390 'KI': '103.25.140.0/22',
4391 'KM': '197.255.224.0/20',
4392 'KN': '198.167.192.0/19',
4393 'KP': '175.45.176.0/22',
4394 'KR': '175.192.0.0/10',
4395 'KW': '37.36.0.0/14',
4396 'KY': '64.96.0.0/15',
4397 'KZ': '2.72.0.0/13',
4398 'LA': '115.84.64.0/18',
4399 'LB': '178.135.0.0/16',
4400 'LC': '24.92.144.0/20',
4401 'LI': '82.117.0.0/19',
4402 'LK': '112.134.0.0/15',
4403 'LR': '102.183.0.0/16',
4404 'LS': '129.232.0.0/17',
4405 'LT': '78.56.0.0/13',
4406 'LU': '188.42.0.0/16',
4407 'LV': '46.109.0.0/16',
4408 'LY': '41.252.0.0/14',
4409 'MA': '105.128.0.0/11',
4410 'MC': '88.209.64.0/18',
4411 'MD': '37.246.0.0/16',
4412 'ME': '178.175.0.0/17',
4413 'MF': '74.112.232.0/21',
4414 'MG': '154.126.0.0/17',
4415 'MH': '117.103.88.0/21',
4416 'MK': '77.28.0.0/15',
4417 'ML': '154.118.128.0/18',
4418 'MM': '37.111.0.0/17',
4419 'MN': '49.0.128.0/17',
4420 'MO': '60.246.0.0/16',
4421 'MP': '202.88.64.0/20',
4422 'MQ': '109.203.224.0/19',
4423 'MR': '41.188.64.0/18',
4424 'MS': '208.90.112.0/22',
4425 'MT': '46.11.0.0/16',
4426 'MU': '105.16.0.0/12',
4427 'MV': '27.114.128.0/18',
4428 'MW': '102.70.0.0/15',
4429 'MX': '187.192.0.0/11',
4430 'MY': '175.136.0.0/13',
4431 'MZ': '197.218.0.0/15',
4432 'NA': '41.182.0.0/16',
4433 'NC': '101.101.0.0/18',
4434 'NE': '197.214.0.0/18',
4435 'NF': '203.17.240.0/22',
4436 'NG': '105.112.0.0/12',
4437 'NI': '186.76.0.0/15',
4438 'NL': '145.96.0.0/11',
4439 'NO': '84.208.0.0/13',
4440 'NP': '36.252.0.0/15',
4441 'NR': '203.98.224.0/19',
4442 'NU': '49.156.48.0/22',
4443 'NZ': '49.224.0.0/14',
4444 'OM': '5.36.0.0/15',
4445 'PA': '186.72.0.0/15',
4446 'PE': '186.160.0.0/14',
4447 'PF': '123.50.64.0/18',
4448 'PG': '124.240.192.0/19',
4449 'PH': '49.144.0.0/13',
4450 'PK': '39.32.0.0/11',
4451 'PL': '83.0.0.0/11',
4452 'PM': '70.36.0.0/20',
4453 'PR': '66.50.0.0/16',
4454 'PS': '188.161.0.0/16',
4455 'PT': '85.240.0.0/13',
4456 'PW': '202.124.224.0/20',
4457 'PY': '181.120.0.0/14',
4458 'QA': '37.210.0.0/15',
4459 'RE': '102.35.0.0/16',
4460 'RO': '79.112.0.0/13',
4461 'RS': '93.86.0.0/15',
4462 'RU': '5.136.0.0/13',
4463 'RW': '41.186.0.0/16',
4464 'SA': '188.48.0.0/13',
4465 'SB': '202.1.160.0/19',
4466 'SC': '154.192.0.0/11',
4467 'SD': '102.120.0.0/13',
4468 'SE': '78.64.0.0/12',
4469 'SG': '8.128.0.0/10',
4470 'SI': '188.196.0.0/14',
4471 'SK': '78.98.0.0/15',
4472 'SL': '102.143.0.0/17',
4473 'SM': '89.186.32.0/19',
4474 'SN': '41.82.0.0/15',
4475 'SO': '154.115.192.0/18',
4476 'SR': '186.179.128.0/17',
4477 'SS': '105.235.208.0/21',
4478 'ST': '197.159.160.0/19',
4479 'SV': '168.243.0.0/16',
4480 'SX': '190.102.0.0/20',
4481 'SY': '5.0.0.0/16',
4482 'SZ': '41.84.224.0/19',
4483 'TC': '65.255.48.0/20',
4484 'TD': '154.68.128.0/19',
4485 'TG': '196.168.0.0/14',
4486 'TH': '171.96.0.0/13',
4487 'TJ': '85.9.128.0/18',
4488 'TK': '27.96.24.0/21',
4489 'TL': '180.189.160.0/20',
4490 'TM': '95.85.96.0/19',
4491 'TN': '197.0.0.0/11',
4492 'TO': '175.176.144.0/21',
4493 'TR': '78.160.0.0/11',
4494 'TT': '186.44.0.0/15',
4495 'TV': '202.2.96.0/19',
4496 'TW': '120.96.0.0/11',
4497 'TZ': '156.156.0.0/14',
4498 'UA': '37.52.0.0/14',
4499 'UG': '102.80.0.0/13',
4500 'US': '6.0.0.0/8',
4501 'UY': '167.56.0.0/13',
4502 'UZ': '84.54.64.0/18',
4503 'VA': '212.77.0.0/19',
4504 'VC': '207.191.240.0/21',
4505 'VE': '186.88.0.0/13',
4506 'VG': '66.81.192.0/20',
4507 'VI': '146.226.0.0/16',
4508 'VN': '14.160.0.0/11',
4509 'VU': '202.80.32.0/20',
4510 'WF': '117.20.32.0/21',
4511 'WS': '202.4.32.0/19',
4512 'YE': '134.35.0.0/16',
4513 'YT': '41.242.116.0/22',
4514 'ZA': '41.0.0.0/11',
4515 'ZM': '102.144.0.0/13',
4516 'ZW': '102.177.192.0/18',
4517 }
4518
4519 @classmethod
4520 def random_ipv4(cls, code_or_block):
4521 if len(code_or_block) == 2:
4522 block = cls._country_ip_map.get(code_or_block.upper())
4523 if not block:
4524 return None
4525 else:
4526 block = code_or_block
4527 addr, preflen = block.split('/')
4528 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4529 addr_max = addr_min | (0xffffffff >> int(preflen))
4530 return compat_str(socket.inet_ntoa(
4531 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
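# Sketch: accepts either a two-letter country code or an explicit CIDR block:
#   GeoUtils.random_ipv4('DE')              # -> random address within 53.0.0.0/8
#   GeoUtils.random_ipv4('203.0.113.0/24')  # -> e.g. '203.0.113.57'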
4532
4533
4534 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4535 def __init__(self, proxies=None):
4536 # Set default handlers
4537 for type in ('http', 'https'):
4538 setattr(self, '%s_open' % type,
4539 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4540 meth(r, proxy, type))
4541 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4542
4543 def proxy_open(self, req, proxy, type):
4544 req_proxy = req.headers.get('Ytdl-request-proxy')
4545 if req_proxy is not None:
4546 proxy = req_proxy
4547 del req.headers['Ytdl-request-proxy']
4548
4549 if proxy == '__noproxy__':
4550 return None # No Proxy
4551 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4552 req.add_header('Ytdl-socks-proxy', proxy)
4553 # The SOCKS wrapping of the socket is done by yt-dlp's http/https handlers
4554 return None
4555 return compat_urllib_request.ProxyHandler.proxy_open(
4556 self, req, proxy, type)
4557
4558
4559 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4560 # released into Public Domain
4561 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4562
4563 def long_to_bytes(n, blocksize=0):
4564 """long_to_bytes(n:long, blocksize:int) : string
4565 Convert a long integer to a byte string.
4566
4567 If optional blocksize is given and greater than zero, pad the front of the
4568 byte string with binary zeros so that the length is a multiple of
4569 blocksize.
4570 """
4571 # after much testing, this algorithm was deemed to be the fastest
4572 s = b''
4573 n = int(n)
4574 while n > 0:
4575 s = compat_struct_pack('>I', n & 0xffffffff) + s
4576 n = n >> 32
4577 # strip off leading zeros
4578 for i in range(len(s)):
4579 if s[i] != b'\000'[0]:
4580 break
4581 else:
4582 # only happens when n == 0
4583 s = b'\000'
4584 i = 0
4585 s = s[i:]
4586 # add back some pad bytes. this could be done more efficiently w.r.t. the
4587 # de-padding being done above, but sigh...
4588 if blocksize > 0 and len(s) % blocksize:
4589 s = (blocksize - len(s) % blocksize) * b'\000' + s
4590 return s
4591
4592
4593 def bytes_to_long(s):
4594 """bytes_to_long(string) : long
4595 Convert a byte string to a long integer.
4596
4597 This is (essentially) the inverse of long_to_bytes().
4598 """
4599 acc = 0
4600 length = len(s)
4601 if length % 4:
4602 extra = (4 - length % 4)
4603 s = b'\000' * extra + s
4604 length = length + extra
4605 for i in range(0, length, 4):
4606 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4607 return acc
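# Round-trip sketch:
#   long_to_bytes(256)          # -> b'\x01\x00'
#   long_to_bytes(1, 4)         # -> b'\x00\x00\x00\x01' (front-padded to blocksize)
#   bytes_to_long(b'\x01\x00')  # -> 256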
4608
4609
4610 def ohdave_rsa_encrypt(data, exponent, modulus):
4611 '''
4612 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4613
4614 Input:
4615 data: data to encrypt, bytes-like object
4616 exponent, modulus: parameter e and N of RSA algorithm, both integer
4617 Output: hex string of encrypted data
4618
4619 Limitation: supports one block encryption only
4620 '''
4621
4622 payload = int(binascii.hexlify(data[::-1]), 16)
4623 encrypted = pow(payload, exponent, modulus)
4624 return '%x' % encrypted
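# Toy-parameter sketch (real callers pass the site-provided e and N):
#   ohdave_rsa_encrypt(b'\x02', 3, 101)  # -> '8', i.e. '%x' % pow(2, 3, 101)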
4625
4626
4627 def pkcs1pad(data, length):
4628 """
4629 Padding input data with PKCS#1 scheme
4630
4631 @param {int[]} data input data
4632 @param {int} length target length
4633 @returns {int[]} padded data
4634 """
4635 if len(data) > length - 11:
4636 raise ValueError('Input data too long for PKCS#1 padding')
4637
4638 pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)]
4639 return [0, 2] + pseudo_random + [0] + data
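# Sketch: the result is [0, 2, <random padding>, 0, *data] of exactly `length` ints:
#   padded = pkcs1pad([1, 2, 3], 16)
#   padded[:2], padded[-4:], len(padded)  # -> ([0, 2], [0, 1, 2, 3], 16)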
4640
4641
4642 def encode_base_n(num, n, table=None):
4643 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4644 if not table:
4645 table = FULL_TABLE[:n]
4646
4647 if n > len(table):
4648 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4649
4650 if num == 0:
4651 return table[0]
4652
4653 ret = ''
4654 while num:
4655 ret = table[num % n] + ret
4656 num = num // n
4657 return ret
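# Illustrative examples with the default table:
#   encode_base_n(255, 16)  # -> 'ff'
#   encode_base_n(0, 2)     # -> '0'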
4658
4659
4660 def decode_packed_codes(code):
4661 mobj = re.search(PACKED_CODES_RE, code)
4662 obfuscated_code, base, count, symbols = mobj.groups()
4663 base = int(base)
4664 count = int(count)
4665 symbols = symbols.split('|')
4666 symbol_table = {}
4667
4668 while count:
4669 count -= 1
4670 base_n_count = encode_base_n(count, base)
4671 symbol_table[base_n_count] = symbols[count] or base_n_count
4672
4673 return re.sub(
4674 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4675 obfuscated_code)
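# Sketch, assuming the P.A.C.K.E.R.-style input matched by PACKED_CODES_RE
# (defined earlier in this module):
#   decode_packed_codes("}('0 1',62,2,'hello|world'.split('|')")  # -> 'hello world'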
4676
4677
4678 def caesar(s, alphabet, shift):
4679 if shift == 0:
4680 return s
4681 l = len(alphabet)
4682 return ''.join(
4683 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4684 for c in s)
4685
4686
4687 def rot47(s):
4688 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
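# Illustrative examples; rot47 is its own inverse (two shifts of 47 cover all 94 characters):
#   caesar('abc', 'abcdefghijklmnopqrstuvwxyz', 1)  # -> 'bcd'
#   rot47(rot47('yt-dlp'))                          # -> 'yt-dlp'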
4689
4690
4691 def parse_m3u8_attributes(attrib):
4692 info = {}
4693 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4694 if val.startswith('"'):
4695 val = val[1:-1]
4696 info[key] = val
4697 return info
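# Illustrative parse; quoted values keep embedded commas:
#   parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.4d401e,mp4a.40.2"')
#       -> {'BANDWIDTH': '1280000', 'CODECS': 'avc1.4d401e,mp4a.40.2'}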
4698
4699
4700 def urshift(val, n):
4701 return val >> n if val >= 0 else (val + 0x100000000) >> n
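# Sketch: emulates JavaScript's unsigned right shift (>>>) for 32-bit values:
#   urshift(-1, 28)  # -> 15, whereas -1 >> 28 == -1 in Python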
4702
4703
4704 # Based on png2str() written by @gdkchan and improved by @yokrysty
4705 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4706 def decode_png(png_data):
4707 # Reference: https://www.w3.org/TR/PNG/
4708 header = png_data[8:]
4709
4710 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4711 raise IOError('Not a valid PNG file.')
4712
4713 int_map = {1: '>B', 2: '>H', 4: '>I'}
4714 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4715
4716 chunks = []
4717
4718 while header:
4719 length = unpack_integer(header[:4])
4720 header = header[4:]
4721
4722 chunk_type = header[:4]
4723 header = header[4:]
4724
4725 chunk_data = header[:length]
4726 header = header[length:]
4727
4728 header = header[4:] # Skip CRC
4729
4730 chunks.append({
4731 'type': chunk_type,
4732 'length': length,
4733 'data': chunk_data
4734 })
4735
4736 ihdr = chunks[0]['data']
4737
4738 width = unpack_integer(ihdr[:4])
4739 height = unpack_integer(ihdr[4:8])
4740
4741 idat = b''
4742
4743 for chunk in chunks:
4744 if chunk['type'] == b'IDAT':
4745 idat += chunk['data']
4746
4747 if not idat:
4748 raise IOError('Unable to read PNG data.')
4749
4750 decompressed_data = bytearray(zlib.decompress(idat))
4751
4752 stride = width * 3
4753 pixels = []
4754
4755 def _get_pixel(idx):
4756 x = idx % stride
4757 y = idx // stride
4758 return pixels[y][x]
4759
4760 for y in range(height):
4761 basePos = y * (1 + stride)
4762 filter_type = decompressed_data[basePos]
4763
4764 current_row = []
4765
4766 pixels.append(current_row)
4767
4768 for x in range(stride):
4769 color = decompressed_data[1 + basePos + x]
4770 basex = y * stride + x
4771 left = 0
4772 up = 0
4773
4774 if x > 2:
4775 left = _get_pixel(basex - 3)
4776 if y > 0:
4777 up = _get_pixel(basex - stride)
4778
4779 if filter_type == 1: # Sub
4780 color = (color + left) & 0xff
4781 elif filter_type == 2: # Up
4782 color = (color + up) & 0xff
4783 elif filter_type == 3: # Average
4784 color = (color + ((left + up) >> 1)) & 0xff
4785 elif filter_type == 4: # Paeth
4786 a = left
4787 b = up
4788 c = 0
4789
4790 if x > 2 and y > 0:
4791 c = _get_pixel(basex - stride - 3)
4792
4793 p = a + b - c
4794
4795 pa = abs(p - a)
4796 pb = abs(p - b)
4797 pc = abs(p - c)
4798
4799 if pa <= pb and pa <= pc:
4800 color = (color + a) & 0xff
4801 elif pb <= pc:
4802 color = (color + b) & 0xff
4803 else:
4804 color = (color + c) & 0xff
4805
4806 current_row.append(color)
4807
4808 return width, height, pixels
4809
4810
4811 def write_xattr(path, key, value):
4812 # This mess below finds the best xattr tool for the job
4813 try:
4814 # try the pyxattr module...
4815 import xattr
4816
4817 if hasattr(xattr, 'set'): # pyxattr
4818 # Unicode arguments are not supported in python-pyxattr until
4819 # version 0.5.0
4820 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4821 pyxattr_required_version = '0.5.0'
4822 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
4823 # TODO: fallback to CLI tools
4824 raise XAttrUnavailableError(
4825 'python-pyxattr is detected but is too old. '
4826 'yt-dlp requires %s or above while your version is %s. '
4827 'Falling back to other xattr implementations' % (
4828 pyxattr_required_version, xattr.__version__))
4829
4830 setxattr = xattr.set
4831 else: # xattr
4832 setxattr = xattr.setxattr
4833
4834 try:
4835 setxattr(path, key, value)
4836 except EnvironmentError as e:
4837 raise XAttrMetadataError(e.errno, e.strerror)
4838
4839 except ImportError:
4840 if compat_os_name == 'nt':
4841 # Write xattrs to NTFS Alternate Data Streams:
4842 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4843 assert ':' not in key
4844 assert os.path.exists(path)
4845
4846 ads_fn = path + ':' + key
4847 try:
4848 with open(ads_fn, 'wb') as f:
4849 f.write(value)
4850 except EnvironmentError as e:
4851 raise XAttrMetadataError(e.errno, e.strerror)
4852 else:
4853 user_has_setfattr = check_executable('setfattr', ['--version'])
4854 user_has_xattr = check_executable('xattr', ['-h'])
4855
4856 if user_has_setfattr or user_has_xattr:
4857
4858 value = value.decode('utf-8')
4859 if user_has_setfattr:
4860 executable = 'setfattr'
4861 opts = ['-n', key, '-v', value]
4862 elif user_has_xattr:
4863 executable = 'xattr'
4864 opts = ['-w', key, value]
4865
4866 cmd = ([encodeFilename(executable, True)]
4867 + [encodeArgument(o) for o in opts]
4868 + [encodeFilename(path, True)])
4869
4870 try:
4871 p = Popen(
4872 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4873 except EnvironmentError as e:
4874 raise XAttrMetadataError(e.errno, e.strerror)
4875 stdout, stderr = p.communicate_or_kill()
4876 stderr = stderr.decode('utf-8', 'replace')
4877 if p.returncode != 0:
4878 raise XAttrMetadataError(p.returncode, stderr)
4879
4880 else:
4881 # On Unix, but we can't find pyxattr, setfattr, or xattr.
4882 if sys.platform.startswith('linux'):
4883 raise XAttrUnavailableError(
4884 "Couldn't find a tool to set the xattrs. "
4885 "Install either the python 'pyxattr' or 'xattr' "
4886 "modules, or the GNU 'attr' package "
4887 "(which contains the 'setfattr' tool).")
4888 else:
4889 raise XAttrUnavailableError(
4890 "Couldn't find a tool to set the xattrs. "
4891 "Install either the python 'xattr' module, "
4892 "or the 'xattr' binary.")
4893
4894
4895 def random_birthday(year_field, month_field, day_field):
4896 start_date = datetime.date(1950, 1, 1)
4897 end_date = datetime.date(1995, 12, 31)
4898 offset = random.randint(0, (end_date - start_date).days)
4899 random_date = start_date + datetime.timedelta(offset)
4900 return {
4901 year_field: str(random_date.year),
4902 month_field: str(random_date.month),
4903 day_field: str(random_date.day),
4904 }
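# Sketch (field names assumed); values are randomized on each call:
#   random_birthday('birth_year', 'birth_month', 'birth_day')
#       -> e.g. {'birth_year': '1987', 'birth_month': '6', 'birth_day': '14'}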
4905
4906
4907 # Templates for internet shortcut files, which are plain text files.
4908 DOT_URL_LINK_TEMPLATE = '''
4909 [InternetShortcut]
4910 URL=%(url)s
4911 '''.lstrip()
4912
4913 DOT_WEBLOC_LINK_TEMPLATE = '''
4914 <?xml version="1.0" encoding="UTF-8"?>
4915 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4916 <plist version="1.0">
4917 <dict>
4918 \t<key>URL</key>
4919 \t<string>%(url)s</string>
4920 </dict>
4921 </plist>
4922 '''.lstrip()
4923
4924 DOT_DESKTOP_LINK_TEMPLATE = '''
4925 [Desktop Entry]
4926 Encoding=UTF-8
4927 Name=%(filename)s
4928 Type=Link
4929 URL=%(url)s
4930 Icon=text-html
4931 '''.lstrip()
4932
4933 LINK_TEMPLATES = {
4934 'url': DOT_URL_LINK_TEMPLATE,
4935 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4936 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4937 }
4938
4939
4940 def iri_to_uri(iri):
4941 """
4942 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4943
4944 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4945 """
4946
4947 iri_parts = compat_urllib_parse_urlparse(iri)
4948
4949 if '[' in iri_parts.netloc:
4950 raise ValueError('IPv6 URIs are not yet supported.')
4951 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4952
4953 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4954
4955 net_location = ''
4956 if iri_parts.username:
4957 net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
4958 if iri_parts.password is not None:
4959 net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
4960 net_location += '@'
4961
4962 net_location += iri_parts.hostname.encode('idna').decode('utf-8') # Punycode for Unicode hostnames.
4963 # The 'idna' encoding produces ASCII text.
4964 if iri_parts.port is not None and iri_parts.port != 80:
4965 net_location += ':' + str(iri_parts.port)
4966
4967 return compat_urllib_parse_urlunparse(
4968 (iri_parts.scheme,
4969 net_location,
4970
4971 compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
4972
4973 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
4974 compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
4975
4976 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
4977 compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
4978
4979 compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
4980
4981 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
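# Illustrative conversion (ASCII host, Unicode path):
#   iri_to_uri('https://example.com/é')  # -> 'https://example.com/%C3%A9'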
4982
4983
4984 def to_high_limit_path(path):
4985 if sys.platform in ['win32', 'cygwin']:
4986 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
4987 return r'\\?\ '.rstrip() + os.path.abspath(path)
4988
4989 return path
4990
4991
4992 def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
4993 val = traverse_obj(obj, *variadic(field))
4994 if val in ignore:
4995 return default
4996 return template % (func(val) if func else val)
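# Illustrative calls (info fields assumed):
#   format_field({'width': 1280}, 'width', '%dpx')              # -> '1280px'
#   format_field({}, 'width', '%dpx', default='unknown width')  # -> 'unknown width'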
4997
4998
4999 def clean_podcast_url(url):
5000 return re.sub(r'''(?x)
5001 (?:
5002 (?:
5003 chtbl\.com/track|
5004 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5005 play\.podtrac\.com
5006 )/[^/]+|
5007 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5008 flex\.acast\.com|
5009 pd(?:
5010 cn\.co| # https://podcorn.com/analytics-prefix/
5011 st\.fm # https://podsights.com/docs/
5012 )/e
5013 )/''', '', url)
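# Illustrative strip of a tracking prefix (feed URL assumed):
#   clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/feeds.example.com/ep.mp3')
#       -> 'https://feeds.example.com/ep.mp3'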
5014
5015
5016 _HEX_TABLE = '0123456789abcdef'
5017
5018
5019 def random_uuidv4():
5020 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5021
5022
5023 def make_dir(path, to_screen=None):
5024 try:
5025 dn = os.path.dirname(path)
5026 if dn and not os.path.exists(dn):
5027 os.makedirs(dn)
5028 return True
5029 except (OSError, IOError) as err:
5030 if callable(to_screen):
5031 to_screen('unable to create directory ' + error_to_compat_str(err))
5032 return False
5033
5034
5035 def get_executable_path():
5036 from zipimport import zipimporter
5037 if hasattr(sys, 'frozen'): # Running from PyInstaller
5038 path = os.path.dirname(sys.executable)
5039 elif isinstance(globals().get('__loader__'), zipimporter): # Running from ZIP
5040 path = os.path.join(os.path.dirname(__file__), '../..')
5041 else:
5042 path = os.path.join(os.path.dirname(__file__), '..')
5043 return os.path.abspath(path)
5044
5045
5046 def load_plugins(name, suffix, namespace):
5047 classes = {}
5048 try:
5049 plugins_spec = importlib.util.spec_from_file_location(
5050 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5051 plugins = importlib.util.module_from_spec(plugins_spec)
5052 sys.modules[plugins_spec.name] = plugins
5053 plugins_spec.loader.exec_module(plugins)
5054 for name in dir(plugins):
5055 if name in namespace:
5056 continue
5057 if not name.endswith(suffix):
5058 continue
5059 klass = getattr(plugins, name)
5060 classes[name] = namespace[name] = klass
5061 except FileNotFoundError:
5062 pass
5063 return classes
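# Usage sketch: a caller would typically inject plugin classes into its own namespace:
#   classes = load_plugins('extractor', 'IE', globals())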
5064
5065
5066 def traverse_obj(
5067 obj, *path_list, default=None, expected_type=None, get_all=True,
5068 casesense=True, is_user_input=False, traverse_string=False):
5069 ''' Traverse nested list/dict/tuple
5070 @param path_list A list of paths which are checked one by one.
5071 Each path is a list of keys where each key is a string,
5072 a function, a tuple of strings/None or "...".
5073 When a function is given, it takes the key as argument and
5074 returns whether the key matches or not. When a tuple is given,
5075 all the keys given in the tuple are traversed, and
5076 "..." traverses all the keys in the object
5077 "None" returns the object without traversal
5078 @param default Default value to return
5079 @param expected_type Only accept final value of this type (Can also be any callable)
5080 @param get_all Return all the values obtained from a path or only the first one
5081 @param casesense Whether to consider dictionary keys as case sensitive
5082 @param is_user_input Whether the keys are generated from user input. If True,
5083 strings are converted to int/slice if necessary
5084 @param traverse_string Whether to traverse inside strings. If True, any
5085 non-compatible object will also be converted into a string
5086 # TODO: Write tests
5087 '''
5088 if not casesense:
5089 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5090 path_list = (map(_lower, variadic(path)) for path in path_list)
5091
5092 def _traverse_obj(obj, path, _current_depth=0):
5093 nonlocal depth
5094 path = tuple(variadic(path))
5095 for i, key in enumerate(path):
5096 if None in (key, obj):
5097 return obj
5098 if isinstance(key, (list, tuple)):
5099 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5100 key = ...
5101 if key is ...:
5102 obj = (obj.values() if isinstance(obj, dict)
5103 else obj if isinstance(obj, (list, tuple, LazyList))
5104 else str(obj) if traverse_string else [])
5105 _current_depth += 1
5106 depth = max(depth, _current_depth)
5107 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5108 elif callable(key):
5109 if isinstance(obj, (list, tuple, LazyList)):
5110 obj = enumerate(obj)
5111 elif isinstance(obj, dict):
5112 obj = obj.items()
5113 else:
5114 if not traverse_string:
5115 return None
5116 obj = str(obj)
5117 _current_depth += 1
5118 depth = max(depth, _current_depth)
5119 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
5120 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5121 obj = (obj.get(key) if casesense or (key in obj)
5122 else next((v for k, v in obj.items() if _lower(k) == key), None))
5123 else:
5124 if is_user_input:
5125 key = (int_or_none(key) if ':' not in key
5126 else slice(*map(int_or_none, key.split(':'))))
5127 if key == slice(None):
5128 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5129 if not isinstance(key, (int, slice)):
5130 return None
5131 if not isinstance(obj, (list, tuple, LazyList)):
5132 if not traverse_string:
5133 return None
5134 obj = str(obj)
5135 try:
5136 obj = obj[key]
5137 except IndexError:
5138 return None
5139 return obj
5140
5141 if isinstance(expected_type, type):
5142 type_test = lambda val: val if isinstance(val, expected_type) else None
5143 elif expected_type is not None:
5144 type_test = expected_type
5145 else:
5146 type_test = lambda val: val
5147
5148 for path in path_list:
5149 depth = 0
5150 val = _traverse_obj(obj, path)
5151 if val is not None:
5152 if depth:
5153 for _ in range(depth - 1):
5154 val = itertools.chain.from_iterable(v for v in val if v is not None)
5155 val = [v for v in map(type_test, val) if v is not None]
5156 if val:
5157 return val if get_all else val[0]
5158 else:
5159 val = type_test(val)
5160 if val is not None:
5161 return val
5162 return default
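# Illustrative traversals ("..." branches into every key; see the docstring above):
#   traverse_obj({'a': [{'b': 1}, {'b': 2}]}, ('a', ..., 'b'))           # -> [1, 2]
#   traverse_obj({'a': {'b': None}}, ('a', 'b'), ('a', 'c'), default=0)  # -> 0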
5163
5164
5165 def traverse_dict(dictn, keys, casesense=True):
5166 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5167 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5168 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5169
5170
5171 def variadic(x, allowed_types=(str, bytes, dict)):
5172 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5173
5174
5175 # create a JSON Web Signature (jws) with HS256 algorithm
5176 # the resulting format is in JWS Compact Serialization
5177 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5178 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5179 def jwt_encode_hs256(payload_data, key, headers={}):
5180 header_data = {
5181 'alg': 'HS256',
5182 'typ': 'JWT',
5183 }
5184 if headers:
5185 header_data.update(headers)
5186 header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
5187 payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
5188 h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
5189 signature_b64 = base64.b64encode(h.digest())
5190 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5191 return token
5192
5193
5194 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5195 def jwt_decode_hs256(jwt):
5196 header_b64, payload_b64, signature_b64 = jwt.split('.')
5197 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5198 return payload_data
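# Round-trip sketch (the encoder returns bytes; demo key assumed):
#   token = jwt_encode_hs256({'uid': 1}, 'secret-key')
#   jwt_decode_hs256(token.decode('utf-8'))  # -> {'uid': 1}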
5199
5200
5201 def supports_terminal_sequences(stream):
5202 if compat_os_name == 'nt':
5203 from .compat import WINDOWS_VT_MODE # Must be imported locally
5204 if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
5205 return False
5206 elif not os.getenv('TERM'):
5207 return False
5208 try:
5209 return stream.isatty()
5210 except BaseException:
5211 return False
5212
5213
5214 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5215
5216
5217 def remove_terminal_sequences(string):
5218 return _terminal_sequences_re.sub('', string)
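# Sketch: strips ANSI SGR color/style codes:
#   remove_terminal_sequences('\033[0;32mOK\033[0m')  # -> 'OK'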
5219
5220
5221 def number_of_digits(number):
5222 return len('%d' % number)
5223
5224
5225 def join_nonempty(*values, delim='-', from_dict=None):
5226 if from_dict is not None:
5227 values = map(from_dict.get, values)
5228 return delim.join(map(str, filter(None, values)))
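# Illustrative joins; falsy values (None, '', 0) are dropped:
#   join_nonempty('mp4', None, 'dash')      # -> 'mp4-dash'
#   join_nonempty('id', 'f140', delim='.')  # -> 'id.f140'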
5229
5230
5231 class Config:
5232 own_args = None
5233 filename = None
5234 __initialized = False
5235
5236 def __init__(self, parser, label=None):
5237 self._parser, self.label = parser, label
5238 self._loaded_paths, self.configs = set(), []
5239
5240 def init(self, args=None, filename=None):
5241 assert not self.__initialized
5242 directory = ''
5243 if filename:
5244 location = os.path.realpath(filename)
5245 directory = os.path.dirname(location)
5246 if location in self._loaded_paths:
5247 return False
5248 self._loaded_paths.add(location)
5249
5250 self.__initialized = True
5251 self.own_args, self.filename = args, filename
5252 for location in self._parser.parse_args(args)[0].config_locations or []:
5253 location = os.path.join(directory, expand_path(location))
5254 if os.path.isdir(location):
5255 location = os.path.join(location, 'yt-dlp.conf')
5256 if not os.path.exists(location):
5257 self._parser.error(f'config location {location} does not exist')
5258 self.append_config(self.read_file(location), location)
5259 return True
5260
5261 def __str__(self):
5262 label = join_nonempty(
5263 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5264 delim=' ')
5265 return join_nonempty(
5266 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5267 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5268 delim='\n')
5269
5270 @staticmethod
5271 def read_file(filename, default=[]):
5272 try:
5273 optionf = open(filename)
5274 except IOError:
5275 return default # silently skip if file is not present
5276 try:
5277 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5278 contents = optionf.read()
5279 if sys.version_info < (3,):
5280 contents = contents.decode(preferredencoding())
5281 res = compat_shlex_split(contents, comments=True)
5282 finally:
5283 optionf.close()
5284 return res
5285
5286 @staticmethod
5287 def hide_login_info(opts):
5288 PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
5289 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5290
5291 def _scrub_eq(o):
5292 m = eqre.match(o)
5293 if m:
5294 return m.group('key') + '=PRIVATE'
5295 else:
5296 return o
5297
5298 opts = list(map(_scrub_eq, opts))
5299 for idx, opt in enumerate(opts):
5300 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5301 opts[idx + 1] = 'PRIVATE'
5302 return opts
5303
5304 def append_config(self, *args, label=None):
5305 config = type(self)(self._parser, label)
5306 config._loaded_paths = self._loaded_paths
5307 if config.init(*args):
5308 self.configs.append(config)
5309
5310 @property
5311 def all_args(self):
5312 for config in reversed(self.configs):
5313 yield from config.all_args
5314 yield from self.own_args or []
5315
5316 def parse_args(self):
5317 return self._parser.parse_args(list(self.all_args))
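# Sketch of the credential scrubbing done by Config.hide_login_info:
#   Config.hide_login_info(['-u', 'me', '--password=hunter2', '-v'])
#       -> ['-u', 'PRIVATE', '--password=PRIVATE', '-v']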
5318
5319
5320 class WebSocketsWrapper():
5321 """Wraps websockets module to use in non-async scopes"""
5322
5323 def __init__(self, url, headers=None):
5324 self.loop = asyncio.events.new_event_loop()
5325 self.conn = compat_websockets.connect(
5326 url, extra_headers=headers, ping_interval=None,
5327 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5328
5329 def __enter__(self):
5330 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5331 return self
5332
5333 def send(self, *args):
5334 self.run_with_loop(self.pool.send(*args), self.loop)
5335
5336 def recv(self, *args):
5337 return self.run_with_loop(self.pool.recv(*args), self.loop)
5338
5339 def __exit__(self, type, value, traceback):
5340 try:
5341 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5342 finally:
5343 self.loop.close()
5344 self._cancel_all_tasks(self.loop)
5345
5346 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5347 # for contributors: if any new asyncio-based library needs to be run in a non-async context, move these functions out of this class
5348 @staticmethod
5349 def run_with_loop(main, loop):
5350 if not asyncio.coroutines.iscoroutine(main):
5351 raise ValueError(f'a coroutine was expected, got {main!r}')
5352
5353 try:
5354 return loop.run_until_complete(main)
5355 finally:
5356 loop.run_until_complete(loop.shutdown_asyncgens())
5357 if hasattr(loop, 'shutdown_default_executor'):
5358 loop.run_until_complete(loop.shutdown_default_executor())
5359
5360 @staticmethod
5361 def _cancel_all_tasks(loop):
5362 to_cancel = asyncio.tasks.all_tasks(loop)
5363
5364 if not to_cancel:
5365 return
5366
5367 for task in to_cancel:
5368 task.cancel()
5369
5370 loop.run_until_complete(
5371 asyncio.tasks.gather(*to_cancel, loop=loop, return_exceptions=True))
5372
5373 for task in to_cancel:
5374 if task.cancelled():
5375 continue
5376 if task.exception() is not None:
5377 loop.call_exception_handler({
5378 'message': 'unhandled exception during asyncio.run() shutdown',
5379 'exception': task.exception(),
5380 'task': task,
5381 })
5382
5383
5384 has_websockets = bool(compat_websockets)