#!/usr/bin/env python3
# coding: utf-8

from __future__ import unicode_literals

import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.utils
import email.header
import errno
import functools
import gzip
import hashlib
import hmac
import importlib.util
import io
import itertools
import json
import locale
import math
import operator
import os
import platform
import random
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import traceback
import xml.etree.ElementTree
import zlib
import mimetypes

from .compat import (
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_HTTPError,
    compat_basestring,
    compat_chr,
    compat_cookiejar,
    compat_ctypes_WINFUNCTYPE,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_integer_types,
    compat_numeric_types,
    compat_kwargs,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_split,
    compat_shlex_quote,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_urlunparse,
    compat_urllib_parse_quote,
    compat_urllib_parse_quote_plus,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_websockets,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)


std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        try:
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        except OSError:
            pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
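
# Illustrative example: xpath_with_ns() expands namespace prefixes into the
# Clark notation that ElementTree expects (namespace URL assumed here):
#   xpath_with_ns('media:song/url', {'media': 'http://example.com/ns'})
#   == '{http://example.com/ns}song/url'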


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]
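
# A minimal sketch of the xpath helpers above (document built inline purely
# for illustration):
#   doc = compat_etree_fromstring('<root><media url="http://x"/></root>')
#   xpath_element(doc, 'media')             # -> the <media> Element
#   xpath_text(doc, 'media', default=None)  # -> None (element has no text)
#   xpath_attr(doc, 'media', 'url')         # -> 'http://x'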


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_html_by_id(id, html):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of all tags with the specified attribute in the passed HTML document, as a list"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of all tags with the specified attribute in the passed HTML document, as a list"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = r'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
        ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )
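
# Illustrative example of the class-based lookups built on the regex above:
#   get_element_by_class('foo', '<div class="foo bar">text</div>')
#   == 'text'
#   get_element_html_by_class('foo', '<div class="foo bar">text</div>')
#   == '<div class="foo bar">text</div>'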


class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')
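
# Illustrative example - nested tags with the same name are balanced correctly:
#   get_element_text_and_html_by_tag('div', '<div>a<div>b</div>c</div>')
#   == ('a<div>b</div>c', '<div>a<div>b</div>c</div>')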


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
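
# Illustrative example - <br> becomes a newline, tags and entities are dropped:
#   clean_html('<p>foo<br/>bar &amp; baz</p>') == 'foo\nbar & baz'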


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = locked_file(filename, open_mode, block=False).open()
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = locked_file(filename, open_mode, block=False).open()
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp
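
# Illustrative example (classic RFC 2822 date; result in epoch seconds):
#   timeconvert('Sun, 06 Nov 1994 08:49:37 GMT') == 784111777
#   timeconvert('not a date') is None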


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return ' '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    if s == '':
        return ''
    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
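
# Illustrative examples:
#   sanitize_filename('New: Part 1/3')                  == 'New - Part 1_3'
#   sanitize_filename('New: Part 1/3', restricted=True) == 'New_-_Part_1_3'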


def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
        if sys.version_info < (2, 7) and not drive_or_unc:
            drive_or_unc, _ = os.path.splitunc(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url
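
# Illustrative examples:
#   sanitize_url('//example.com/video')   == 'http://example.com/video'
#   sanitize_url('httpss://example.com')  == 'https://example.com'
#   sanitize_url('rmtp://example.com')    == 'rtmp://example.com'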


def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
    return url, 'Basic ' + auth_payload.decode('utf-8')
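
# Illustrative example - credentials are stripped from the URL and returned
# as a ready-made Authorization header value:
#   extract_basic_auth('http://user:pass@example.com/x')
#   == ('http://example.com/x', 'Basic dXNlcjpwYXNz')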


def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res
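
# Illustrative example - order of first occurrence is preserved:
#   orderedSet([1, 2, 1, 3, 2]) == [1, 2, 3]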


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
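
# Illustrative examples:
#   unescapeHTML('&eacute;ric &amp; co') == 'éric & co'
#   unescapeHTML('&#38;') == '&'
#   unescapeHTML('&bogus;') == '&bogus;'  # unknown entities are left literal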


def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        return process_communicate_or_kill(self, *args, **kwargs)
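
# Illustrative usage (assumes a POSIX `echo` binary; the child process is
# killed if communicate() is interrupted, e.g. by KeyboardInterrupt):
#   out, err = Popen(['echo', 'hi'], stdout=subprocess.PIPE).communicate_or_kill()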


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)


def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret
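
# Worked examples (illustrative):
#   timetuple_from_msec(3661500) == Time(hours=1, minutes=1, seconds=1, milliseconds=500)
#   formatSeconds(3661.5, msec=True) == '1:01:01.500'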


def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        try:
            ssl_context.load_verify_locations(cadata=cert)
        except ssl.SSLError:
            pass


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                # Create a new context to discard any certificates that were already loaded
                context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
                context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp , '
           'filling out the "Broken site" issue template properly. '
           'Confirm you are on the latest version using -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super(ExtractorError, self).__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(self.cause)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super(GeoRestrictedError, self).__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers
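
# Illustrative example - the marker header disables compression and is itself removed:
#   handle_youtubedl_headers({'Youtubedl-no-compression': '1', 'Accept-Encoding': 'gzip'}) == {}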


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters; however, this is
        # not always respected by websites - some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure:
                    secure = 'TRUE'
                else:
                    secure = 'FALSE'
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                else:
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                else:
                    expires = ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # On python 2 urlh.geturl() may sometimes return redirect URL
        # as byte string instead of unicode. This workaround allows
        # to force it always return unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True)


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                          # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                        # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|               # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                         # optional space
                (?P<sign>\+|-)                               # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})   # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str
1733
1734
1735 def parse_iso8601(date_str, delimiter='T', timezone=None):
1736 """ Return a UNIX timestamp from the given date """
1737
1738 if date_str is None:
1739 return None
1740
1741 date_str = re.sub(r'\.[0-9]+', '', date_str)
1742
1743 if timezone is None:
1744 timezone, date_str = extract_timezone(date_str)
1745
1746 try:
1747 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1748 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1749 return calendar.timegm(dt.timetuple())
1750 except ValueError:
1751 pass
1752
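# Illustrative examples (added for clarity, not part of the original source).
# The timezone offset is extracted first and any fractional-second part is
# dropped before parsing:
#   >>> parse_iso8601('2021-12-31T12:00:00+01:00')
#   1640948400
#   >>> parse_iso8601('2021-12-31 12:00:00.123', delimiter=' ', timezone=datetime.timedelta(0))
#   1640952000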
1753
1754 def date_formats(day_first=True):
1755 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1756
1757
1758 def unified_strdate(date_str, day_first=True):
1759 """Return a string with the date in the format YYYYMMDD"""
1760
1761 if date_str is None:
1762 return None
1763 upload_date = None
1764 # Replace commas
1765 date_str = date_str.replace(',', ' ')
1766 # Remove AM/PM + timezone
1767 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1768 _, date_str = extract_timezone(date_str)
1769
1770 for expression in date_formats(day_first):
1771 try:
1772 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1773 except ValueError:
1774 pass
1775 if upload_date is None:
1776 timetuple = email.utils.parsedate_tz(date_str)
1777 if timetuple:
1778 try:
1779 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1780 except ValueError:
1781 pass
1782 if upload_date is not None:
1783 return compat_str(upload_date)
1784
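# Illustrative examples (added for clarity, not part of the original source).
# The accepted patterns come from the DATE_FORMATS* tables defined earlier in
# this module; assuming those include '%B %d %Y', '%d/%m/%Y' and '%m/%d/%Y':
#   >>> unified_strdate('December 31, 2021')
#   '20211231'
#   >>> unified_strdate('02/03/2021')                  # day first by default
#   '20210302'
#   >>> unified_strdate('02/03/2021', day_first=False)
#   '20210203'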
1785
1786 def unified_timestamp(date_str, day_first=True):
1787 if date_str is None:
1788 return None
1789
1790 date_str = re.sub(r'[,|]', '', date_str)
1791
1792 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1793 timezone, date_str = extract_timezone(date_str)
1794
1795 # Remove AM/PM + timezone
1796 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1797
1798 # Remove unrecognized timezones from ISO 8601 alike timestamps
1799 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1800 if m:
1801 date_str = date_str[:-len(m.group('tz'))]
1802
1803 # Python only supports microseconds, so remove nanoseconds
1804 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1805 if m:
1806 date_str = m.group(1)
1807
1808 for expression in date_formats(day_first):
1809 try:
1810 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1811 return calendar.timegm(dt.timetuple())
1812 except ValueError:
1813 pass
1814 timetuple = email.utils.parsedate_tz(date_str)
1815 if timetuple:
1816 return calendar.timegm(timetuple) + pm_delta * 3600
1817
1818
1819 def determine_ext(url, default_ext='unknown_video'):
1820 if url is None or '.' not in url:
1821 return default_ext
1822 guess = url.partition('?')[0].rpartition('.')[2]
1823 if re.match(r'^[A-Za-z0-9]+$', guess):
1824 return guess
1825 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1826 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1827 return guess.rstrip('/')
1828 else:
1829 return default_ext
1830
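# Illustrative examples (added for clarity, not part of the original source;
# KNOWN_EXTENSIONS is defined elsewhere in this module and is assumed to
# contain 'mp4'):
#   >>> determine_ext('http://example.com/video.mp4?download=1')
#   'mp4'
#   >>> determine_ext('http://example.com/video.mp4/?download')   # trailing slash
#   'mp4'
#   >>> determine_ext('http://example.com/stream')
#   'unknown_video'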
1831
1832 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1833 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1834
1835
1836 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1837 """
1838 Return a datetime object from a string in the format YYYYMMDD or
1839 (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1840
1841 format: string date format used to parse date_str into a datetime object
1842 precision: round the time portion of a datetime object.
1843 auto|microsecond|second|minute|hour|day.
1844 auto: round to the unit provided in date_str (if applicable).
1845 """
1846 auto_precision = False
1847 if precision == 'auto':
1848 auto_precision = True
1849 precision = 'microsecond'
1850 today = datetime_round(datetime.datetime.utcnow(), precision)
1851 if date_str in ('now', 'today'):
1852 return today
1853 if date_str == 'yesterday':
1854 return today - datetime.timedelta(days=1)
1855 match = re.match(
1856 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
1857 date_str)
1858 if match is not None:
1859 start_time = datetime_from_str(match.group('start'), precision, format)
1860 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1861 unit = match.group('unit')
1862 if unit == 'month' or unit == 'year':
1863 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1864 unit = 'day'
1865 else:
1866 if unit == 'week':
1867 unit = 'day'
1868 time *= 7
1869 delta = datetime.timedelta(**{unit + 's': time})
1870 new_date = start_time + delta
1871 if auto_precision:
1872 return datetime_round(new_date, unit)
1873 return new_date
1874
1875 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
1876
1877
1878 def date_from_str(date_str, format='%Y%m%d', strict=False):
1879 """
1880 Return a datetime object from a string in the format YYYYMMDD or
1881 (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1882
1883 If "strict", only (now|today)[+-][0-9](day|week|month|year)(s)? is allowed
1884
1885 format: string date format used to parse date_str into a datetime object
1886 """
1887 if strict and not re.fullmatch(r'\d{8}|(now|today)[+-]\d+(day|week|month|year)(s)?', date_str):
1888 raise ValueError(f'Invalid date format {date_str}')
1889 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1890
1891
1892 def datetime_add_months(dt, months):
1893 """Increment/Decrement a datetime object by months."""
1894 month = dt.month + months - 1
1895 year = dt.year + month // 12
1896 month = month % 12 + 1
1897 day = min(dt.day, calendar.monthrange(year, month)[1])
1898 return dt.replace(year, month, day)
1899
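# Illustrative examples (added for clarity, not part of the original source).
# The day of the month is clamped, so adding one month to Jan 31 yields the
# last day of February instead of overflowing:
#   >>> datetime_add_months(datetime.datetime(2021, 1, 31), 1)
#   datetime.datetime(2021, 2, 28, 0, 0)
#   >>> datetime_add_months(datetime.datetime(2021, 1, 31), 12)
#   datetime.datetime(2022, 1, 31, 0, 0)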
1900
1901 def datetime_round(dt, precision='day'):
1902 """
1903 Round a datetime object's time to a specific precision
1904 """
1905 if precision == 'microsecond':
1906 return dt
1907
1908 unit_seconds = {
1909 'day': 86400,
1910 'hour': 3600,
1911 'minute': 60,
1912 'second': 1,
1913 }
1914 roundto = lambda x, n: ((x + n / 2) // n) * n
1915 timestamp = calendar.timegm(dt.timetuple())
1916 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1917
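# Illustrative examples (added for clarity, not part of the original source).
# Rounding is to the nearest unit (half up), computed on the UTC timestamp:
#   >>> datetime_round(datetime.datetime(2021, 5, 1, 14, 40), 'hour')
#   datetime.datetime(2021, 5, 1, 15, 0)
#   >>> datetime_round(datetime.datetime(2021, 5, 1, 14, 40), 'day')
#   datetime.datetime(2021, 5, 2, 0, 0)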
1918
1919 def hyphenate_date(date_str):
1920 """
1921 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1922 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1923 if match is not None:
1924 return '-'.join(match.groups())
1925 else:
1926 return date_str
1927
1928
1929 class DateRange(object):
1930 """Represents a time interval between two dates"""
1931
1932 def __init__(self, start=None, end=None):
1933 """start and end must be strings in the format accepted by date"""
1934 if start is not None:
1935 self.start = date_from_str(start, strict=True)
1936 else:
1937 self.start = datetime.datetime.min.date()
1938 if end is not None:
1939 self.end = date_from_str(end, strict=True)
1940 else:
1941 self.end = datetime.datetime.max.date()
1942 if self.start > self.end:
1943 raise ValueError('Date range: "%s": the start date must be before the end date' % self)
1944
1945 @classmethod
1946 def day(cls, day):
1947 """Returns a range that only contains the given day"""
1948 return cls(day, day)
1949
1950 def __contains__(self, date):
1951 """Check if the date is in the range"""
1952 if not isinstance(date, datetime.date):
1953 date = date_from_str(date)
1954 return self.start <= date <= self.end
1955
1956 def __str__(self):
1957 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1958
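# Illustrative usage (added for clarity, not part of the original source):
#   >>> '20210615' in DateRange('20210101', '20211231')
#   True
#   >>> '20220101' in DateRange('20210101', '20211231')
#   False
# DateRange.day('20210101') builds a range containing only that single day.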
1959
1960 def platform_name():
1961 """ Returns the platform name as a compat_str """
1962 res = platform.platform()
1963 if isinstance(res, bytes):
1964 res = res.decode(preferredencoding())
1965
1966 assert isinstance(res, compat_str)
1967 return res
1968
1969
1970 def get_windows_version():
1971 ''' Get Windows version. None if it's not running on Windows '''
1972 if compat_os_name == 'nt':
1973 return version_tuple(platform.win32_ver()[1])
1974 else:
1975 return None
1976
1977
1978 def _windows_write_string(s, out):
1979 """ Returns True if the string was written using special methods,
1980 False if it has yet to be written out."""
1981 # Adapted from http://stackoverflow.com/a/3259271/35070
1982
1983 import ctypes.wintypes
1984
1985 WIN_OUTPUT_IDS = {
1986 1: -11,
1987 2: -12,
1988 }
1989
1990 try:
1991 fileno = out.fileno()
1992 except AttributeError:
1993 # If the output stream doesn't have a fileno, it's virtual
1994 return False
1995 except io.UnsupportedOperation:
1996 # Some strange Windows pseudo files?
1997 return False
1998 if fileno not in WIN_OUTPUT_IDS:
1999 return False
2000
2001 GetStdHandle = compat_ctypes_WINFUNCTYPE(
2002 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
2003 ('GetStdHandle', ctypes.windll.kernel32))
2004 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
2005
2006 WriteConsoleW = compat_ctypes_WINFUNCTYPE(
2007 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
2008 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
2009 ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
2010 written = ctypes.wintypes.DWORD(0)
2011
2012 GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
2013 FILE_TYPE_CHAR = 0x0002
2014 FILE_TYPE_REMOTE = 0x8000
2015 GetConsoleMode = compat_ctypes_WINFUNCTYPE(
2016 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
2017 ctypes.POINTER(ctypes.wintypes.DWORD))(
2018 ('GetConsoleMode', ctypes.windll.kernel32))
2019 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
2020
2021 def not_a_console(handle):
2022 if handle == INVALID_HANDLE_VALUE or handle is None:
2023 return True
2024 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
2025 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
2026
2027 if not_a_console(h):
2028 return False
2029
2030 def next_nonbmp_pos(s):
2031 try:
2032 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
2033 except StopIteration:
2034 return len(s)
2035
2036 while s:
2037 count = min(next_nonbmp_pos(s), 1024)
2038
2039 ret = WriteConsoleW(
2040 h, s, count if count else 2, ctypes.byref(written), None)
2041 if ret == 0:
2042 raise OSError('Failed to write string')
2043 if not count: # We just wrote a non-BMP character
2044 assert written.value == 2
2045 s = s[1:]
2046 else:
2047 assert written.value > 0
2048 s = s[written.value:]
2049 return True
2050
2051
2052 def write_string(s, out=None, encoding=None):
2053 if out is None:
2054 out = sys.stderr
2055 assert type(s) == compat_str
2056
2057 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
2058 if _windows_write_string(s, out):
2059 return
2060
2061 if ('b' in getattr(out, 'mode', '')
2062 or sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
2063 byt = s.encode(encoding or preferredencoding(), 'ignore')
2064 out.write(byt)
2065 elif hasattr(out, 'buffer'):
2066 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2067 byt = s.encode(enc, 'ignore')
2068 out.buffer.write(byt)
2069 else:
2070 out.write(s)
2071 out.flush()
2072
2073
2074 def bytes_to_intlist(bs):
2075 if not bs:
2076 return []
2077 if isinstance(bs[0], int): # Python 3
2078 return list(bs)
2079 else:
2080 return [ord(c) for c in bs]
2081
2082
2083 def intlist_to_bytes(xs):
2084 if not xs:
2085 return b''
2086 return compat_struct_pack('%dB' % len(xs), *xs)
2087
2088
2089 # Cross-platform file locking
2090 if sys.platform == 'win32':
2091 import ctypes.wintypes
2092 import msvcrt
2093
2094 class OVERLAPPED(ctypes.Structure):
2095 _fields_ = [
2096 ('Internal', ctypes.wintypes.LPVOID),
2097 ('InternalHigh', ctypes.wintypes.LPVOID),
2098 ('Offset', ctypes.wintypes.DWORD),
2099 ('OffsetHigh', ctypes.wintypes.DWORD),
2100 ('hEvent', ctypes.wintypes.HANDLE),
2101 ]
2102
2103 kernel32 = ctypes.windll.kernel32
2104 LockFileEx = kernel32.LockFileEx
2105 LockFileEx.argtypes = [
2106 ctypes.wintypes.HANDLE, # hFile
2107 ctypes.wintypes.DWORD, # dwFlags
2108 ctypes.wintypes.DWORD, # dwReserved
2109 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2110 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2111 ctypes.POINTER(OVERLAPPED) # Overlapped
2112 ]
2113 LockFileEx.restype = ctypes.wintypes.BOOL
2114 UnlockFileEx = kernel32.UnlockFileEx
2115 UnlockFileEx.argtypes = [
2116 ctypes.wintypes.HANDLE, # hFile
2117 ctypes.wintypes.DWORD, # dwReserved
2118 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2119 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2120 ctypes.POINTER(OVERLAPPED) # Overlapped
2121 ]
2122 UnlockFileEx.restype = ctypes.wintypes.BOOL
2123 whole_low = 0xffffffff
2124 whole_high = 0x7fffffff
2125
2126 def _lock_file(f, exclusive, block):
2127 overlapped = OVERLAPPED()
2128 overlapped.Offset = 0
2129 overlapped.OffsetHigh = 0
2130 overlapped.hEvent = 0
2131 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2132
2133 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2134 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2135 0, whole_low, whole_high, f._lock_file_overlapped_p):
2136 raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())
2137
2138 def _unlock_file(f):
2139 assert f._lock_file_overlapped_p
2140 handle = msvcrt.get_osfhandle(f.fileno())
2141 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2142 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2143
2144 else:
2145 try:
2146 import fcntl
2147
2148 def _lock_file(f, exclusive, block):
2149 try:
2150 fcntl.flock(f,
2151 fcntl.LOCK_SH if not exclusive
2152 else fcntl.LOCK_EX if block
2153 else fcntl.LOCK_EX | fcntl.LOCK_NB)
2154 except BlockingIOError:
2155 raise
2156 except OSError: # AOSP does not have flock()
2157 fcntl.lockf(f,
2158 fcntl.LOCK_SH if not exclusive
2159 else fcntl.LOCK_EX if block
2160 else fcntl.LOCK_EX | fcntl.LOCK_NB)
2161
2162 def _unlock_file(f):
2163 try:
2164 fcntl.flock(f, fcntl.LOCK_UN)
2165 except OSError:
2166 fcntl.lockf(f, fcntl.LOCK_UN)
2167
2168 except ImportError:
2169 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
2170
2171 def _lock_file(f, exclusive, block):
2172 raise IOError(UNSUPPORTED_MSG)
2173
2174 def _unlock_file(f):
2175 raise IOError(UNSUPPORTED_MSG)
2176
2177
2178 class locked_file(object):
2179 _closed = False
2180
2181 def __init__(self, filename, mode, block=True, encoding=None):
2182 assert mode in ['r', 'rb', 'a', 'ab', 'w', 'wb']
2183 self.f = io.open(filename, mode, encoding=encoding)
2184 self.mode = mode
2185 self.block = block
2186
2187 def __enter__(self):
2188 exclusive = 'r' not in self.mode
2189 try:
2190 _lock_file(self.f, exclusive, self.block)
2191 except IOError:
2192 self.f.close()
2193 raise
2194 return self
2195
2196 def __exit__(self, etype, value, traceback):
2197 try:
2198 if not self._closed:
2199 _unlock_file(self.f)
2200 finally:
2201 self.f.close()
2202 self._closed = True
2203
2204 def __iter__(self):
2205 return iter(self.f)
2206
2207 def write(self, *args):
2208 return self.f.write(*args)
2209
2210 def read(self, *args):
2211 return self.f.read(*args)
2212
2213 def flush(self):
2214 self.f.flush()
2215
2216 def open(self):
2217 return self.__enter__()
2218
2219 def close(self, *args):
2220 self.__exit__(None, None, None)  # unlock and close; no exception is propagated
2221
2222
2223 def get_filesystem_encoding():
2224 encoding = sys.getfilesystemencoding()
2225 return encoding if encoding is not None else 'utf-8'
2226
2227
2228 def shell_quote(args):
2229 quoted_args = []
2230 encoding = get_filesystem_encoding()
2231 for a in args:
2232 if isinstance(a, bytes):
2233 # We may get a filename encoded with 'encodeFilename'
2234 a = a.decode(encoding)
2235 quoted_args.append(compat_shlex_quote(a))
2236 return ' '.join(quoted_args)
2237
2238
2239 def smuggle_url(url, data):
2240 """ Pass additional data in a URL for internal use. """
2241
2242 url, idata = unsmuggle_url(url, {})
2243 data.update(idata)
2244 sdata = compat_urllib_parse_urlencode(
2245 {'__youtubedl_smuggle': json.dumps(data)})
2246 return url + '#' + sdata
2247
2248
2249 def unsmuggle_url(smug_url, default=None):
2250 if '#__youtubedl_smuggle' not in smug_url:
2251 return smug_url, default
2252 url, _, sdata = smug_url.rpartition('#')
2253 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2254 data = json.loads(jsond)
2255 return url, data
2256
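# Illustrative round trip (added for clarity, not part of the original
# source; the URL and data are made up). The extra data travels in the URL
# fragment, so it is never sent to the server:
#   >>> url = smuggle_url('https://example.com/v/123', {'referer': 'https://example.com/'})
#   >>> unsmuggle_url(url)
#   ('https://example.com/v/123', {'referer': 'https://example.com/'})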
2257
2258 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2259 """ Formats numbers with decimal sufixes like K, M, etc """
2260 num, factor = float_or_none(num), float(factor)
2261 if num is None or num < 0:
2262 return None
2263 exponent = 0 if num == 0 else int(math.log(num, factor))
2264 suffix = ['', *'kMGTPEZY'][exponent]
2265 if factor == 1024:
2266 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2267 converted = num / (factor ** exponent)
2268 return fmt % (converted, suffix)
2269
2270
2271 def format_bytes(bytes):
2272 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
2273
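# Illustrative examples (added for clarity, not part of the original source):
#   >>> format_decimal_suffix(1234567, '%.1f%s')
#   '1.2M'
#   >>> format_bytes(1536)    # factor=1024 switches to KiB/MiB/... suffixes
#   '1.50KiB'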
2274
2275 def lookup_unit_table(unit_table, s):
2276 units_re = '|'.join(re.escape(u) for u in unit_table)
2277 m = re.match(
2278 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2279 if not m:
2280 return None
2281 num_str = m.group('num').replace(',', '.')
2282 mult = unit_table[m.group('unit')]
2283 return int(float(num_str) * mult)
2284
2285
2286 def parse_filesize(s):
2287 if s is None:
2288 return None
2289
2290 # The lower-case forms are of course incorrect and unofficial,
2291 # but we support those too
2292 _UNIT_TABLE = {
2293 'B': 1,
2294 'b': 1,
2295 'bytes': 1,
2296 'KiB': 1024,
2297 'KB': 1000,
2298 'kB': 1024,
2299 'Kb': 1000,
2300 'kb': 1000,
2301 'kilobytes': 1000,
2302 'kibibytes': 1024,
2303 'MiB': 1024 ** 2,
2304 'MB': 1000 ** 2,
2305 'mB': 1024 ** 2,
2306 'Mb': 1000 ** 2,
2307 'mb': 1000 ** 2,
2308 'megabytes': 1000 ** 2,
2309 'mebibytes': 1024 ** 2,
2310 'GiB': 1024 ** 3,
2311 'GB': 1000 ** 3,
2312 'gB': 1024 ** 3,
2313 'Gb': 1000 ** 3,
2314 'gb': 1000 ** 3,
2315 'gigabytes': 1000 ** 3,
2316 'gibibytes': 1024 ** 3,
2317 'TiB': 1024 ** 4,
2318 'TB': 1000 ** 4,
2319 'tB': 1024 ** 4,
2320 'Tb': 1000 ** 4,
2321 'tb': 1000 ** 4,
2322 'terabytes': 1000 ** 4,
2323 'tebibytes': 1024 ** 4,
2324 'PiB': 1024 ** 5,
2325 'PB': 1000 ** 5,
2326 'pB': 1024 ** 5,
2327 'Pb': 1000 ** 5,
2328 'pb': 1000 ** 5,
2329 'petabytes': 1000 ** 5,
2330 'pebibytes': 1024 ** 5,
2331 'EiB': 1024 ** 6,
2332 'EB': 1000 ** 6,
2333 'eB': 1024 ** 6,
2334 'Eb': 1000 ** 6,
2335 'eb': 1000 ** 6,
2336 'exabytes': 1000 ** 6,
2337 'exbibytes': 1024 ** 6,
2338 'ZiB': 1024 ** 7,
2339 'ZB': 1000 ** 7,
2340 'zB': 1024 ** 7,
2341 'Zb': 1000 ** 7,
2342 'zb': 1000 ** 7,
2343 'zettabytes': 1000 ** 7,
2344 'zebibytes': 1024 ** 7,
2345 'YiB': 1024 ** 8,
2346 'YB': 1000 ** 8,
2347 'yB': 1024 ** 8,
2348 'Yb': 1000 ** 8,
2349 'yb': 1000 ** 8,
2350 'yottabytes': 1000 ** 8,
2351 'yobibytes': 1024 ** 8,
2352 }
2353
2354 return lookup_unit_table(_UNIT_TABLE, s)
2355
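# Illustrative examples (added for clarity, not part of the original source).
# Decimal commas are accepted as decimal points, and SI vs binary prefixes
# are resolved through the table above:
#   >>> parse_filesize('1.5 GiB')
#   1610612736
#   >>> parse_filesize('10 MB')
#   10000000
#   >>> parse_filesize('1,5GB')
#   1500000000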
2356
2357 def parse_count(s):
2358 if s is None:
2359 return None
2360
2361 s = re.sub(r'^[^\d]+\s', '', s).strip()
2362
2363 if re.match(r'^[\d,.]+$', s):
2364 return str_to_int(s)
2365
2366 _UNIT_TABLE = {
2367 'k': 1000,
2368 'K': 1000,
2369 'm': 1000 ** 2,
2370 'M': 1000 ** 2,
2371 'kk': 1000 ** 2,
2372 'KK': 1000 ** 2,
2373 'b': 1000 ** 3,
2374 'B': 1000 ** 3,
2375 }
2376
2377 ret = lookup_unit_table(_UNIT_TABLE, s)
2378 if ret is not None:
2379 return ret
2380
2381 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2382 if mobj:
2383 return str_to_int(mobj.group(1))
2384
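# Illustrative examples (added for clarity, not part of the original source):
#   >>> parse_count('1.2M')
#   1200000
#   >>> parse_count('1,234 views')
#   1234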
2385
2386 def parse_resolution(s):
2387 if s is None:
2388 return {}
2389
2390 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2391 if mobj:
2392 return {
2393 'width': int(mobj.group('w')),
2394 'height': int(mobj.group('h')),
2395 }
2396
2397 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2398 if mobj:
2399 return {'height': int(mobj.group(1))}
2400
2401 mobj = re.search(r'\b([48])[kK]\b', s)
2402 if mobj:
2403 return {'height': int(mobj.group(1)) * 540}
2404
2405 return {}
2406
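# Illustrative examples (added for clarity, not part of the original source):
#   >>> parse_resolution('1920x1080')
#   {'width': 1920, 'height': 1080}
#   >>> parse_resolution('720p')
#   {'height': 720}
#   >>> parse_resolution('4K')    # 4K/8K are mapped via height = n * 540
#   {'height': 2160}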
2407
2408 def parse_bitrate(s):
2409 if not isinstance(s, compat_str):
2410 return
2411 mobj = re.search(r'\b(\d+)\s*kbps', s)
2412 if mobj:
2413 return int(mobj.group(1))
2414
2415
2416 def month_by_name(name, lang='en'):
2417 """ Return the number of a month by (locale-independently) English name """
2418
2419 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2420
2421 try:
2422 return month_names.index(name) + 1
2423 except ValueError:
2424 return None
2425
2426
2427 def month_by_abbreviation(abbrev):
2428 """ Return the number of a month by (locale-independently) English
2429 abbreviations """
2430
2431 try:
2432 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2433 except ValueError:
2434 return None
2435
2436
2437 def fix_xml_ampersands(xml_str):
2438 """Replace all the '&' by '&amp;' in XML"""
2439 return re.sub(
2440 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2441 '&amp;',
2442 xml_str)
2443
2444
2445 def setproctitle(title):
2446 assert isinstance(title, compat_str)
2447
2448 # ctypes in Jython is not complete
2449 # http://bugs.jython.org/issue2148
2450 if sys.platform.startswith('java'):
2451 return
2452
2453 try:
2454 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2455 except OSError:
2456 return
2457 except TypeError:
2458 # LoadLibrary in Windows Python 2.7.13 only expects
2459 # a bytestring, but since unicode_literals turns
2460 # every string into a unicode string, it fails.
2461 return
2462 title_bytes = title.encode('utf-8')
2463 buf = ctypes.create_string_buffer(len(title_bytes))
2464 buf.value = title_bytes
2465 try:
2466 libc.prctl(15, buf, 0, 0, 0)
2467 except AttributeError:
2468 return # Strange libc, just skip this
2469
2470
2471 def remove_start(s, start):
2472 return s[len(start):] if s is not None and s.startswith(start) else s
2473
2474
2475 def remove_end(s, end):
2476 return s[:-len(end)] if s is not None and s.endswith(end) else s
2477
2478
2479 def remove_quotes(s):
2480 if s is None or len(s) < 2:
2481 return s
2482 for quote in ('"', "'", ):
2483 if s[0] == quote and s[-1] == quote:
2484 return s[1:-1]
2485 return s
2486
2487
2488 def get_domain(url):
2489 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2490 return domain.group('domain') if domain else None
2491
2492
2493 def url_basename(url):
2494 path = compat_urlparse.urlparse(url).path
2495 return path.strip('/').split('/')[-1]
2496
2497
2498 def base_url(url):
2499 return re.match(r'https?://[^?#&]+/', url).group()
2500
2501
2502 def urljoin(base, path):
2503 if isinstance(path, bytes):
2504 path = path.decode('utf-8')
2505 if not isinstance(path, compat_str) or not path:
2506 return None
2507 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2508 return path
2509 if isinstance(base, bytes):
2510 base = base.decode('utf-8')
2511 if not isinstance(base, compat_str) or not re.match(
2512 r'^(?:https?:)?//', base):
2513 return None
2514 return compat_urlparse.urljoin(base, path)
2515
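# Illustrative examples (added for clarity, not part of the original source;
# URLs are made up):
#   >>> urljoin('https://example.com/a/b', 'c/d.mp4')
#   'https://example.com/a/c/d.mp4'
#   >>> urljoin('https://example.com/', '//cdn.example.com/f.mp4')   # already absolute
#   '//cdn.example.com/f.mp4'
#   >>> urljoin('ftp://example.com/', '/f.mp4') is None              # non-http(s) base
#   True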
2516
2517 class HEADRequest(compat_urllib_request.Request):
2518 def get_method(self):
2519 return 'HEAD'
2520
2521
2522 class PUTRequest(compat_urllib_request.Request):
2523 def get_method(self):
2524 return 'PUT'
2525
2526
2527 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2528 if get_attr and v is not None:
2529 v = getattr(v, get_attr, None)
2530 try:
2531 return int(v) * invscale // scale
2532 except (ValueError, TypeError, OverflowError):
2533 return default
2534
2535
2536 def str_or_none(v, default=None):
2537 return default if v is None else compat_str(v)
2538
2539
2540 def str_to_int(int_str):
2541 """ A more relaxed version of int_or_none """
2542 if isinstance(int_str, compat_integer_types):
2543 return int_str
2544 elif isinstance(int_str, compat_str):
2545 int_str = re.sub(r'[,\.\+]', '', int_str)
2546 return int_or_none(int_str)
2547
2548
2549 def float_or_none(v, scale=1, invscale=1, default=None):
2550 if v is None:
2551 return default
2552 try:
2553 return float(v) * invscale / scale
2554 except (ValueError, TypeError):
2555 return default
2556
2557
2558 def bool_or_none(v, default=None):
2559 return v if isinstance(v, bool) else default
2560
2561
2562 def strip_or_none(v, default=None):
2563 return v.strip() if isinstance(v, compat_str) else default
2564
2565
2566 def url_or_none(url):
2567 if not url or not isinstance(url, compat_str):
2568 return None
2569 url = url.strip()
2570 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2571
2572
2573 def request_to_url(req):
2574 if isinstance(req, compat_urllib_request.Request):
2575 return req.get_full_url()
2576 else:
2577 return req
2578
2579
2580 def strftime_or_none(timestamp, date_format, default=None):
2581 datetime_object = None
2582 try:
2583 if isinstance(timestamp, compat_numeric_types): # unix timestamp
2584 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2585 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2586 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2587 return datetime_object.strftime(date_format)
2588 except (ValueError, TypeError, AttributeError):
2589 return default
2590
2591
2592 def parse_duration(s):
2593 if not isinstance(s, compat_basestring):
2594 return None
2595 s = s.strip()
2596 if not s:
2597 return None
2598
2599 days, hours, mins, secs, ms = [None] * 5
2600 m = re.match(r'''(?x)
2601 (?P<before_secs>
2602 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2603 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2604 (?P<ms>[.:][0-9]+)?Z?$
2605 ''', s)
2606 if m:
2607 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2608 else:
2609 m = re.match(
2610 r'''(?ix)(?:P?
2611 (?:
2612 [0-9]+\s*y(?:ears?)?\s*
2613 )?
2614 (?:
2615 [0-9]+\s*m(?:onths?)?\s*
2616 )?
2617 (?:
2618 [0-9]+\s*w(?:eeks?)?\s*
2619 )?
2620 (?:
2621 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
2622 )?
2623 T)?
2624 (?:
2625 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
2626 )?
2627 (?:
2628 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
2629 )?
2630 (?:
2631 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2632 )?Z?$''', s)
2633 if m:
2634 days, hours, mins, secs, ms = m.groups()
2635 else:
2636 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2637 if m:
2638 hours, mins = m.groups()
2639 else:
2640 return None
2641
2642 duration = 0
2643 if secs:
2644 duration += float(secs)
2645 if mins:
2646 duration += float(mins) * 60
2647 if hours:
2648 duration += float(hours) * 60 * 60
2649 if days:
2650 duration += float(days) * 24 * 60 * 60
2651 if ms:
2652 duration += float(ms.replace(':', '.'))
2653 return duration
2654
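# Illustrative examples (added for clarity, not part of the original source):
#   >>> parse_duration('1:02:03')
#   3723.0
#   >>> parse_duration('2h 30m')
#   9000.0
#   >>> parse_duration('PT1M30S')    # ISO 8601 style durations also work
#   90.0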
2655
2656 def prepend_extension(filename, ext, expected_real_ext=None):
2657 name, real_ext = os.path.splitext(filename)
2658 return (
2659 '{0}.{1}{2}'.format(name, ext, real_ext)
2660 if not expected_real_ext or real_ext[1:] == expected_real_ext
2661 else '{0}.{1}'.format(filename, ext))
2662
2663
2664 def replace_extension(filename, ext, expected_real_ext=None):
2665 name, real_ext = os.path.splitext(filename)
2666 return '{0}.{1}'.format(
2667 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2668 ext)
2669
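# Illustrative examples (added for clarity, not part of the original source).
# When expected_real_ext does not match, the old extension is kept intact:
#   >>> prepend_extension('video.mp4', 'temp')
#   'video.temp.mp4'
#   >>> replace_extension('video.mp4', 'webm')
#   'video.webm'
#   >>> replace_extension('video.f4v', 'mp4', expected_real_ext='flv')
#   'video.f4v.mp4'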
2670
2671 def check_executable(exe, args=[]):
2672 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2673 args can be a list of arguments for a short output (like -version) """
2674 try:
2675 Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
2676 except OSError:
2677 return False
2678 return exe
2679
2680
2681 def _get_exe_version_output(exe, args):
2682 try:
2683 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2684 # SIGTTOU if yt-dlp is run in the background.
2685 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2686 out, _ = Popen(
2687 [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2688 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
2689 except OSError:
2690 return False
2691 if isinstance(out, bytes): # Python 2.x
2692 out = out.decode('ascii', 'ignore')
2693 return out
2694
2695
2696 def detect_exe_version(output, version_re=None, unrecognized='present'):
2697 assert isinstance(output, compat_str)
2698 if version_re is None:
2699 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2700 m = re.search(version_re, output)
2701 if m:
2702 return m.group(1)
2703 else:
2704 return unrecognized
2705
2706
2707 def get_exe_version(exe, args=['--version'],
2708 version_re=None, unrecognized='present'):
2709 """ Returns the version of the specified executable,
2710 or False if the executable is not present """
2711 out = _get_exe_version_output(exe, args)
2712 return detect_exe_version(out, version_re, unrecognized) if out else False
2713
2714
2715 class LazyList(collections.abc.Sequence):
2716 ''' Lazy immutable list from an iterable
2717 Note that slices of a LazyList are lists and not LazyList'''
2718
2719 class IndexError(IndexError):
2720 pass
2721
2722 def __init__(self, iterable, *, reverse=False, _cache=None):
2723 self.__iterable = iter(iterable)
2724 self.__cache = [] if _cache is None else _cache
2725 self.__reversed = reverse
2726
2727 def __iter__(self):
2728 if self.__reversed:
2729 # We need to consume the entire iterable to iterate in reverse
2730 yield from self.exhaust()
2731 return
2732 yield from self.__cache
2733 for item in self.__iterable:
2734 self.__cache.append(item)
2735 yield item
2736
2737 def __exhaust(self):
2738 self.__cache.extend(self.__iterable)
2739 # Discard the emptied iterable to make it pickle-able
2740 self.__iterable = []
2741 return self.__cache
2742
2743 def exhaust(self):
2744 ''' Evaluate the entire iterable '''
2745 return self.__exhaust()[::-1 if self.__reversed else 1]
2746
2747 @staticmethod
2748 def __reverse_index(x):
2749 return None if x is None else -(x + 1)
2750
2751 def __getitem__(self, idx):
2752 if isinstance(idx, slice):
2753 if self.__reversed:
2754 idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
2755 start, stop, step = idx.start, idx.stop, idx.step or 1
2756 elif isinstance(idx, int):
2757 if self.__reversed:
2758 idx = self.__reverse_index(idx)
2759 start, stop, step = idx, idx, 0
2760 else:
2761 raise TypeError('indices must be integers or slices')
2762 if ((start or 0) < 0 or (stop or 0) < 0
2763 or (start is None and step < 0)
2764 or (stop is None and step > 0)):
2765 # We need to consume the entire iterable to be able to slice from the end
2766 # Obviously, never use this with infinite iterables
2767 self.__exhaust()
2768 try:
2769 return self.__cache[idx]
2770 except IndexError as e:
2771 raise self.IndexError(e) from e
2772 n = max(start or 0, stop or 0) - len(self.__cache) + 1
2773 if n > 0:
2774 self.__cache.extend(itertools.islice(self.__iterable, n))
2775 try:
2776 return self.__cache[idx]
2777 except IndexError as e:
2778 raise self.IndexError(e) from e
2779
2780 def __bool__(self):
2781 try:
2782 self[-1] if self.__reversed else self[0]
2783 except self.IndexError:
2784 return False
2785 return True
2786
2787 def __len__(self):
2788 self.__exhaust()
2789 return len(self.__cache)
2790
2791 def __reversed__(self):
2792 return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)
2793
2794 def __copy__(self):
2795 return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)
2796
2797 def __repr__(self):
2798 # repr and str should mimic a list. So we exhaust the iterable
2799 return repr(self.exhaust())
2800
2801 def __str__(self):
2802 return repr(self.exhaust())
2803
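# Illustrative usage (added for clarity, not part of the original source).
# A LazyList can wrap an infinite iterator as long as only bounded,
# non-negative slices/indices are requested; items are cached as consumed:
#   >>> l = LazyList(itertools.count())
#   >>> l[:5]
#   [0, 1, 2, 3, 4]
#   >>> l[3]
#   3
# len(l), reversed(l) or a negative index would try to exhaust the iterable
# and therefore never return in this example.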
2804
2805 class PagedList:
2806
2807 class IndexError(IndexError):
2808 pass
2809
2810 def __len__(self):
2811 # This is only useful for tests
2812 return len(self.getslice())
2813
2814 def __init__(self, pagefunc, pagesize, use_cache=True):
2815 self._pagefunc = pagefunc
2816 self._pagesize = pagesize
2817 self._pagecount = float('inf')
2818 self._use_cache = use_cache
2819 self._cache = {}
2820
2821 def getpage(self, pagenum):
2822 page_results = self._cache.get(pagenum)
2823 if page_results is None:
2824 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2825 if self._use_cache:
2826 self._cache[pagenum] = page_results
2827 return page_results
2828
2829 def getslice(self, start=0, end=None):
2830 return list(self._getslice(start, end))
2831
2832 def _getslice(self, start, end):
2833 raise NotImplementedError('This method must be implemented by subclasses')
2834
2835 def __getitem__(self, idx):
2836 assert self._use_cache, 'Indexing PagedList requires cache'
2837 if not isinstance(idx, int) or idx < 0:
2838 raise TypeError('indices must be non-negative integers')
2839 entries = self.getslice(idx, idx + 1)
2840 if not entries:
2841 raise self.IndexError()
2842 return entries[0]
2843
2844
2845 class OnDemandPagedList(PagedList):
2846 def _getslice(self, start, end):
2847 for pagenum in itertools.count(start // self._pagesize):
2848 firstid = pagenum * self._pagesize
2849 nextfirstid = pagenum * self._pagesize + self._pagesize
2850 if start >= nextfirstid:
2851 continue
2852
2853 startv = (
2854 start % self._pagesize
2855 if firstid <= start < nextfirstid
2856 else 0)
2857 endv = (
2858 ((end - 1) % self._pagesize) + 1
2859 if (end is not None and firstid <= end <= nextfirstid)
2860 else None)
2861
2862 try:
2863 page_results = self.getpage(pagenum)
2864 except Exception:
2865 self._pagecount = pagenum - 1
2866 raise
2867 if startv != 0 or endv is not None:
2868 page_results = page_results[startv:endv]
2869 yield from page_results
2870
2871 # A little optimization - if the current page is not "full", i.e. does
2872 # not contain page_size videos, then we can assume that this page
2873 # is the last one - there are no more ids on further pages -
2874 # i.e. no need to query again.
2875 if len(page_results) + startv < self._pagesize:
2876 break
2877
2878 # If we got the whole page, but the next page is not interesting,
2879 # break out early as well
2880 if end == nextfirstid:
2881 break
2882
2883
2884 class InAdvancePagedList(PagedList):
2885 def __init__(self, pagefunc, pagecount, pagesize):
2886 PagedList.__init__(self, pagefunc, pagesize, True)
2887 self._pagecount = pagecount
2888
2889 def _getslice(self, start, end):
2890 start_page = start // self._pagesize
2891 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2892 skip_elems = start - start_page * self._pagesize
2893 only_more = None if end is None else end - start
2894 for pagenum in range(start_page, end_page):
2895 page_results = self.getpage(pagenum)
2896 if skip_elems:
2897 page_results = page_results[skip_elems:]
2898 skip_elems = None
2899 if only_more is not None:
2900 if len(page_results) < only_more:
2901 only_more -= len(page_results)
2902 else:
2903 yield from page_results[:only_more]
2904 break
2905 yield from page_results
2906
2907
2908 def uppercase_escape(s):
2909 unicode_escape = codecs.getdecoder('unicode_escape')
2910 return re.sub(
2911 r'\\U[0-9a-fA-F]{8}',
2912 lambda m: unicode_escape(m.group(0))[0],
2913 s)
2914
2915
2916 def lowercase_escape(s):
2917 unicode_escape = codecs.getdecoder('unicode_escape')
2918 return re.sub(
2919 r'\\u[0-9a-fA-F]{4}',
2920 lambda m: unicode_escape(m.group(0))[0],
2921 s)
2922
2923
2924 def escape_rfc3986(s):
2925 """Escape non-ASCII characters as suggested by RFC 3986"""
2926 if sys.version_info < (3, 0) and isinstance(s, compat_str):
2927 s = s.encode('utf-8')
2928 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2929
2930
2931 def escape_url(url):
2932 """Escape URL as suggested by RFC 3986"""
2933 url_parsed = compat_urllib_parse_urlparse(url)
2934 return url_parsed._replace(
2935 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2936 path=escape_rfc3986(url_parsed.path),
2937 params=escape_rfc3986(url_parsed.params),
2938 query=escape_rfc3986(url_parsed.query),
2939 fragment=escape_rfc3986(url_parsed.fragment)
2940 ).geturl()
2941
2942
2943 def parse_qs(url):
2944 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2945
2946
2947 def read_batch_urls(batch_fd):
2948 def fixup(url):
2949 if not isinstance(url, compat_str):
2950 url = url.decode('utf-8', 'replace')
2951 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2952 for bom in BOM_UTF8:
2953 if url.startswith(bom):
2954 url = url[len(bom):]
2955 url = url.lstrip()
2956 if not url or url.startswith(('#', ';', ']')):
2957 return False
2958 # "#" cannot be stripped out since it is part of the URI
2959 # However, it can be safely stripped out if it follows a whitespace
2960 return re.split(r'\s#', url, 1)[0].rstrip()
2961
2962 with contextlib.closing(batch_fd) as fd:
2963 return [url for url in map(fixup, fd) if url]
2964
2965
2966 def urlencode_postdata(*args, **kargs):
2967 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2968
2969
2970 def update_url_query(url, query):
2971 if not query:
2972 return url
2973 parsed_url = compat_urlparse.urlparse(url)
2974 qs = compat_parse_qs(parsed_url.query)
2975 qs.update(query)
2976 return compat_urlparse.urlunparse(parsed_url._replace(
2977 query=compat_urllib_parse_urlencode(qs, True)))
2978
2979
2980 def update_Request(req, url=None, data=None, headers={}, query={}):
2981 req_headers = req.headers.copy()
2982 req_headers.update(headers)
2983 req_data = data or req.data
2984 req_url = update_url_query(url or req.get_full_url(), query)
2985 req_get_method = req.get_method()
2986 if req_get_method == 'HEAD':
2987 req_type = HEADRequest
2988 elif req_get_method == 'PUT':
2989 req_type = PUTRequest
2990 else:
2991 req_type = compat_urllib_request.Request
2992 new_req = req_type(
2993 req_url, data=req_data, headers=req_headers,
2994 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2995 if hasattr(req, 'timeout'):
2996 new_req.timeout = req.timeout
2997 return new_req
2998
2999
3000 def _multipart_encode_impl(data, boundary):
3001 content_type = 'multipart/form-data; boundary=%s' % boundary
3002
3003 out = b''
3004 for k, v in data.items():
3005 out += b'--' + boundary.encode('ascii') + b'\r\n'
3006 if isinstance(k, compat_str):
3007 k = k.encode('utf-8')
3008 if isinstance(v, compat_str):
3009 v = v.encode('utf-8')
3010 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3011 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3012 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3013 if boundary.encode('ascii') in content:
3014 raise ValueError('Boundary overlaps with data')
3015 out += content
3016
3017 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3018
3019 return out, content_type
3020
3021
3022 def multipart_encode(data, boundary=None):
3023 '''
3024 Encode a dict to RFC 7578-compliant form-data
3025
3026 data:
3027 A dict where keys and values can be either Unicode or bytes-like
3028 objects.
3029 boundary:
3030 If specified, it must be a Unicode object and is used as the boundary. Otherwise
3031 a random boundary is generated.
3032
3033 Reference: https://tools.ietf.org/html/rfc7578
3034 '''
3035 has_specified_boundary = boundary is not None
3036
3037 while True:
3038 if boundary is None:
3039 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3040
3041 try:
3042 out, content_type = _multipart_encode_impl(data, boundary)
3043 break
3044 except ValueError:
3045 if has_specified_boundary:
3046 raise
3047 boundary = None
3048
3049 return out, content_type
3050
3051
3052 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3053 if isinstance(key_or_keys, (list, tuple)):
3054 for key in key_or_keys:
3055 if key not in d or d[key] is None or skip_false_values and not d[key]:
3056 continue
3057 return d[key]
3058 return default
3059 return d.get(key_or_keys, default)
3060
3061
3062 def try_get(src, getter, expected_type=None):
3063 for get in variadic(getter):
3064 try:
3065 v = get(src)
3066 except (AttributeError, KeyError, TypeError, IndexError):
3067 pass
3068 else:
3069 if expected_type is None or isinstance(v, expected_type):
3070 return v
3071
3072
3073 def merge_dicts(*dicts):
3074 merged = {}
3075 for a_dict in dicts:
3076 for k, v in a_dict.items():
3077 if v is None:
3078 continue
3079 if (k not in merged
3080 or (isinstance(v, compat_str) and v
3081 and isinstance(merged[k], compat_str)
3082 and not merged[k])):
3083 merged[k] = v
3084 return merged
3085
3086
3087 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3088 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3089
3090
3091 US_RATINGS = {
3092 'G': 0,
3093 'PG': 10,
3094 'PG-13': 13,
3095 'R': 16,
3096 'NC': 18,
3097 }
3098
3099
3100 TV_PARENTAL_GUIDELINES = {
3101 'TV-Y': 0,
3102 'TV-Y7': 7,
3103 'TV-G': 0,
3104 'TV-PG': 0,
3105 'TV-14': 14,
3106 'TV-MA': 17,
3107 }
3108
3109
3110 def parse_age_limit(s):
3111 if type(s) == int:
3112 return s if 0 <= s <= 21 else None
3113 if not isinstance(s, compat_basestring):
3114 return None
3115 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3116 if m:
3117 return int(m.group('age'))
3118 s = s.upper()
3119 if s in US_RATINGS:
3120 return US_RATINGS[s]
3121 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3122 if m:
3123 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3124 return None
3125
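# Illustrative examples (added for clarity, not part of the original source):
#   >>> parse_age_limit('PG-13')
#   13
#   >>> parse_age_limit('TV-MA')
#   17
#   >>> parse_age_limit('18+')
#   18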
3126
3127 def strip_jsonp(code):
3128 return re.sub(
3129 r'''(?sx)^
3130 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3131 (?:\s*&&\s*(?P=func_name))?
3132 \s*\(\s*(?P<callback_data>.*)\);?
3133 \s*?(?://[^\n]*)*$''',
3134 r'\g<callback_data>', code)
3135
3136
3137 def js_to_json(code, vars={}):
3138 # vars is a dict of var, val pairs to substitute
3139 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3140 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
3141 INTEGER_TABLE = (
3142 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
3143 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
3144 )
3145
3146 def fix_kv(m):
3147 v = m.group(0)
3148 if v in ('true', 'false', 'null'):
3149 return v
3150 elif v in ('undefined', 'void 0'):
3151 return 'null'
3152 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3153 return ""
3154
3155 if v[0] in ("'", '"'):
3156 v = re.sub(r'(?s)\\.|"', lambda m: {
3157 '"': '\\"',
3158 "\\'": "'",
3159 '\\\n': '',
3160 '\\x': '\\u00',
3161 }.get(m.group(0), m.group(0)), v[1:-1])
3162 else:
3163 for regex, base in INTEGER_TABLE:
3164 im = re.match(regex, v)
3165 if im:
3166 i = int(im.group(1), base)
3167 return '"%d":' % i if v.endswith(':') else '%d' % i
3168
3169 if v in vars:
3170 return vars[v]
3171
3172 return '"%s"' % v
3173
3174 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3175
3176 return re.sub(r'''(?sx)
3177 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3178 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3179 {comment}|,(?={skip}[\]}}])|
3180 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3181 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3182 [0-9]+(?={skip}:)|
3183 !+
3184 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
3185
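# Illustrative examples (added for clarity, not part of the original source).
# Bare keys are quoted, hex/octal literals are converted to decimal and
# 'undefined' becomes null:
#   >>> js_to_json('{abc: 0x10, key: "val"}')
#   '{"abc": 16, "key": "val"}'
#   >>> js_to_json('{a: undefined}')
#   '{"a": null}'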
3186
3187 def qualities(quality_ids):
3188 """ Get a numeric quality value out of a list of possible values """
3189 def q(qid):
3190 try:
3191 return quality_ids.index(qid)
3192 except ValueError:
3193 return -1
3194 return q
3195
3196
3197 POSTPROCESS_WHEN = {'pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
3198
3199
3200 DEFAULT_OUTTMPL = {
3201 'default': '%(title)s [%(id)s].%(ext)s',
3202 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3203 }
3204 OUTTMPL_TYPES = {
3205 'chapter': None,
3206 'subtitle': None,
3207 'thumbnail': None,
3208 'description': 'description',
3209 'annotation': 'annotations.xml',
3210 'infojson': 'info.json',
3211 'link': None,
3212 'pl_video': None,
3213 'pl_thumbnail': None,
3214 'pl_description': 'description',
3215 'pl_infojson': 'info.json',
3216 }
3217
3218 # As of [1] format syntax is:
3219 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3220 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3221 STR_FORMAT_RE_TMPL = r'''(?x)
3222 (?<!%)(?P<prefix>(?:%%)*)
3223 %
3224 (?P<has_key>\((?P<key>{0})\))?
3225 (?P<format>
3226 (?P<conversion>[#0\-+ ]+)?
3227 (?P<min_width>\d+)?
3228 (?P<precision>\.\d+)?
3229 (?P<len_mod>[hlL])? # unused in python
3230 {1} # conversion type
3231 )
3232 '''
3233
3234
3235 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3236
3237
3238 def limit_length(s, length):
3239 """ Add ellipses to overly long strings """
3240 if s is None:
3241 return None
3242 ELLIPSES = '...'
3243 if len(s) > length:
3244 return s[:length - len(ELLIPSES)] + ELLIPSES
3245 return s
3246
3247
3248 def version_tuple(v):
3249 return tuple(int(e) for e in re.split(r'[-.]', v))
3250
3251
3252 def is_outdated_version(version, limit, assume_new=True):
3253 if not version:
3254 return not assume_new
3255 try:
3256 return version_tuple(version) < version_tuple(limit)
3257 except ValueError:
3258 return not assume_new
3259
3260
3261 def ytdl_is_updateable():
3262 """ Returns if yt-dlp can be updated with -U """
3263
3264 from .update import is_non_updateable
3265
3266 return not is_non_updateable()
3267
3268
3269 def args_to_str(args):
3270 # Get a short string representation for a subprocess command
3271 return ' '.join(compat_shlex_quote(a) for a in args)
3272
3273
3274 def error_to_compat_str(err):
3275 err_str = str(err)
3276 # On python 2 error byte string must be decoded with proper
3277 # encoding rather than ascii
3278 if sys.version_info[0] < 3:
3279 err_str = err_str.decode(preferredencoding())
3280 return err_str
3281
3282
3283 def mimetype2ext(mt):
3284 if mt is None:
3285 return None
3286
3287 mt, _, params = mt.partition(';')
3288 mt = mt.strip()
3289
3290 FULL_MAP = {
3291 'audio/mp4': 'm4a',
3292 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here we use .mp3 as
3293 # it's the most popular one
3294 'audio/mpeg': 'mp3',
3295 'audio/x-wav': 'wav',
3296 'audio/wav': 'wav',
3297 'audio/wave': 'wav',
3298 }
3299
3300 ext = FULL_MAP.get(mt)
3301 if ext is not None:
3302 return ext
3303
3304 SUBTYPE_MAP = {
3305 '3gpp': '3gp',
3306 'smptett+xml': 'tt',
3307 'ttaf+xml': 'dfxp',
3308 'ttml+xml': 'ttml',
3309 'x-flv': 'flv',
3310 'x-mp4-fragmented': 'mp4',
3311 'x-ms-sami': 'sami',
3312 'x-ms-wmv': 'wmv',
3313 'mpegurl': 'm3u8',
3314 'x-mpegurl': 'm3u8',
3315 'vnd.apple.mpegurl': 'm3u8',
3316 'dash+xml': 'mpd',
3317 'f4m+xml': 'f4m',
3318 'hds+xml': 'f4m',
3319 'vnd.ms-sstr+xml': 'ism',
3320 'quicktime': 'mov',
3321 'mp2t': 'ts',
3322 'x-wav': 'wav',
3323 'filmstrip+json': 'fs',
3324 'svg+xml': 'svg',
3325 }
3326
3327 _, _, subtype = mt.rpartition('/')
3328 ext = SUBTYPE_MAP.get(subtype.lower())
3329 if ext is not None:
3330 return ext
3331
3332 SUFFIX_MAP = {
3333 'json': 'json',
3334 'xml': 'xml',
3335 'zip': 'zip',
3336 'gzip': 'gz',
3337 }
3338
3339 _, _, suffix = subtype.partition('+')
3340 ext = SUFFIX_MAP.get(suffix)
3341 if ext is not None:
3342 return ext
3343
3344 return subtype.replace('+', '.')
3345
3346
3347 def ext2mimetype(ext_or_url):
3348 if not ext_or_url:
3349 return None
3350 if '.' not in ext_or_url:
3351 ext_or_url = f'file.{ext_or_url}'
3352 return mimetypes.guess_type(ext_or_url)[0]
3353
3354
3355 def parse_codecs(codecs_str):
3356 # http://tools.ietf.org/html/rfc6381
3357 if not codecs_str:
3358 return {}
3359 split_codecs = list(filter(None, map(
3360 str.strip, codecs_str.strip().strip(',').split(','))))
3361 vcodec, acodec, tcodec, hdr = None, None, None, None
3362 for full_codec in split_codecs:
3363 parts = full_codec.split('.')
3364 codec = parts[0].replace('0', '')
3365 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3366 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3367 if not vcodec:
3368 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3369 if codec in ('dvh1', 'dvhe'):
3370 hdr = 'DV'
3371 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3372 hdr = 'HDR10'
3373 elif full_codec.replace('0', '').startswith('vp9.2'):
3374 hdr = 'HDR10'
3375 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3376 if not acodec:
3377 acodec = full_codec
3378 elif codec in ('stpp', 'wvtt',):
3379 if not tcodec:
3380 tcodec = full_codec
3381 else:
3382 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
3383 if vcodec or acodec or tcodec:
3384 return {
3385 'vcodec': vcodec or 'none',
3386 'acodec': acodec or 'none',
3387 'dynamic_range': hdr,
3388 **({'tcodec': tcodec} if tcodec is not None else {}),
3389 }
3390 elif len(split_codecs) == 2:
3391 return {
3392 'vcodec': split_codecs[0],
3393 'acodec': split_codecs[1],
3394 }
3395 return {}
3396
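# Illustrative example (added for clarity, not part of the original source):
#   >>> parse_codecs('avc1.640028,mp4a.40.2')
#   {'vcodec': 'avc1.640028', 'acodec': 'mp4a.40.2', 'dynamic_range': None}
# Dolby Vision (dvh1/dvhe) and 10-bit AV1/VP9.2 streams additionally set
# 'dynamic_range' to 'DV' or 'HDR10'.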
3397
3398 def urlhandle_detect_ext(url_handle):
3399 getheader = url_handle.headers.get
3400
3401 cd = getheader('Content-Disposition')
3402 if cd:
3403 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3404 if m:
3405 e = determine_ext(m.group('filename'), default_ext=None)
3406 if e:
3407 return e
3408
3409 return mimetype2ext(getheader('Content-Type'))
3410
3411
3412 def encode_data_uri(data, mime_type):
3413 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3414
3415
3416 def age_restricted(content_limit, age_limit):
3417 """ Returns True iff the content should be blocked """
3418
3419 if age_limit is None: # No limit set
3420 return False
3421 if content_limit is None:
3422 return False # Content available for everyone
3423 return age_limit < content_limit
3424
3425
3426 def is_html(first_bytes):
3427 """ Detect whether a file contains HTML by examining its first bytes. """
3428
3429 BOMS = [
3430 (b'\xef\xbb\xbf', 'utf-8'),
3431 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3432 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3433 (b'\xff\xfe', 'utf-16-le'),
3434 (b'\xfe\xff', 'utf-16-be'),
3435 ]
3436 for bom, enc in BOMS:
3437 if first_bytes.startswith(bom):
3438 s = first_bytes[len(bom):].decode(enc, 'replace')
3439 break
3440 else:
3441 s = first_bytes.decode('utf-8', 'replace')
3442
3443 return re.match(r'^\s*<', s)
3444
3445
3446 def determine_protocol(info_dict):
3447 protocol = info_dict.get('protocol')
3448 if protocol is not None:
3449 return protocol
3450
3451 url = sanitize_url(info_dict['url'])
3452 if url.startswith('rtmp'):
3453 return 'rtmp'
3454 elif url.startswith('mms'):
3455 return 'mms'
3456 elif url.startswith('rtsp'):
3457 return 'rtsp'
3458
3459 ext = determine_ext(url)
3460 if ext == 'm3u8':
3461 return 'm3u8'
3462 elif ext == 'f4m':
3463 return 'f4m'
3464
3465 return compat_urllib_parse_urlparse(url).scheme
3466
3467
3468 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3469 """ Render a list of rows, each as a list of values.
3470 Text after a \t will be right aligned """
3471 def width(string):
3472 return len(remove_terminal_sequences(string).replace('\t', ''))
3473
3474 def get_max_lens(table):
3475 return [max(width(str(v)) for v in col) for col in zip(*table)]
3476
3477 def filter_using_list(row, filterArray):
3478 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3479
3480 max_lens = get_max_lens(data) if hide_empty else []
3481 header_row = filter_using_list(header_row, max_lens)
3482 data = [filter_using_list(row, max_lens) for row in data]
3483
3484 table = [header_row] + data
3485 max_lens = get_max_lens(table)
3486 extra_gap += 1
3487 if delim:
3488 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3489 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3490 for row in table:
3491 for pos, text in enumerate(map(str, row)):
3492 if '\t' in text:
3493 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3494 else:
3495 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3496 ret = '\n'.join(''.join(row).rstrip() for row in table)
3497 return ret
3498
3499
3500 def _match_one(filter_part, dct, incomplete):
3501 # TODO: Generalize code with YoutubeDL._build_format_filter
3502 STRING_OPERATORS = {
3503 '*=': operator.contains,
3504 '^=': lambda attr, value: attr.startswith(value),
3505 '$=': lambda attr, value: attr.endswith(value),
3506 '~=': lambda attr, value: re.search(value, attr),
3507 }
3508 COMPARISON_OPERATORS = {
3509 **STRING_OPERATORS,
3510 '<=': operator.le, # "<=" must be defined above "<"
3511 '<': operator.lt,
3512 '>=': operator.ge,
3513 '>': operator.gt,
3514 '=': operator.eq,
3515 }
3516
3517 operator_rex = re.compile(r'''(?x)\s*
3518 (?P<key>[a-z_]+)
3519 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3520 (?:
3521 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3522 (?P<strval>.+?)
3523 )
3524 \s*$
3525 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3526 m = operator_rex.search(filter_part)
3527 if m:
3528 m = m.groupdict()
3529 unnegated_op = COMPARISON_OPERATORS[m['op']]
3530 if m['negation']:
3531 op = lambda attr, value: not unnegated_op(attr, value)
3532 else:
3533 op = unnegated_op
3534 comparison_value = m['quotedstrval'] or m['strval']
3535 if m['quote']:
3536 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3537 actual_value = dct.get(m['key'])
3538 numeric_comparison = None
3539 if isinstance(actual_value, compat_numeric_types):
3540 # If the original field is a string and the matching comparison value is
3541 # a number, we should respect the origin of the original field
3542 # and process comparison value as a string (see
3543 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3544 try:
3545 numeric_comparison = int(comparison_value)
3546 except ValueError:
3547 numeric_comparison = parse_filesize(comparison_value)
3548 if numeric_comparison is None:
3549 numeric_comparison = parse_filesize(f'{comparison_value}B')
3550 if numeric_comparison is None:
3551 numeric_comparison = parse_duration(comparison_value)
3552 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3553 raise ValueError('Operator %s only supports string values!' % m['op'])
3554 if actual_value is None:
3555 return incomplete or m['none_inclusive']
3556 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3557
3558 UNARY_OPERATORS = {
3559 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3560 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3561 }
3562 operator_rex = re.compile(r'''(?x)\s*
3563 (?P<op>%s)\s*(?P<key>[a-z_]+)
3564 \s*$
3565 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3566 m = operator_rex.search(filter_part)
3567 if m:
3568 op = UNARY_OPERATORS[m.group('op')]
3569 actual_value = dct.get(m.group('key'))
3570 if incomplete and actual_value is None:
3571 return True
3572 return op(actual_value)
3573
3574 raise ValueError('Invalid filter part %r' % filter_part)
3575
3576
3577 def match_str(filter_str, dct, incomplete=False):
3578 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
3579 When incomplete, all conditions passes on missing fields
3580 """
3581 return all(
3582 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3583 for filter_part in re.split(r'(?<!\\)&', filter_str))
3584
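# Illustrative examples (added for clarity, not part of the original source):
#   >>> match_str('duration > 600 & !is_live', {'duration': 700, 'is_live': False})
#   True
#   >>> match_str('like_count >? 100', {})    # '?' passes when the field is missing
#   True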
3585
3586 def match_filter_func(filter_str):
3587 def _match_func(info_dict, *args, **kwargs):
3588 if match_str(filter_str, info_dict, *args, **kwargs):
3589 return None
3590 else:
3591 video_title = info_dict.get('title', info_dict.get('id', 'video'))
3592 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
3593 return _match_func
3594
3595
3596 def parse_dfxp_time_expr(time_expr):
3597 if not time_expr:
3598 return
3599
3600 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3601 if mobj:
3602 return float(mobj.group('time_offset'))
3603
3604 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3605 if mobj:
3606 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
3607
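# Illustrative values:
#   parse_dfxp_time_expr('12.3s')      -> 12.3
#   parse_dfxp_time_expr('00:01:02.5') -> 62.5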
3608
3609 def srt_subtitles_timecode(seconds):
3610 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3611
3612
3613 def ass_subtitles_timecode(seconds):
3614 time = timetuple_from_msec(seconds * 1000)
3615 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
3616
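# For example, assuming timetuple_from_msec yields (hours, minutes, seconds, milliseconds):
#   srt_subtitles_timecode(61.5) -> '00:01:01,500'
#   ass_subtitles_timecode(61.5) -> '0:01:01.50'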
3617
3618 def dfxp2srt(dfxp_data):
3619 '''
3620 @param dfxp_data A bytes-like object containing DFXP data
3621 @returns A unicode object containing converted SRT data
3622 '''
3623 LEGACY_NAMESPACES = (
3624 (b'http://www.w3.org/ns/ttml', [
3625 b'http://www.w3.org/2004/11/ttaf1',
3626 b'http://www.w3.org/2006/04/ttaf1',
3627 b'http://www.w3.org/2006/10/ttaf1',
3628 ]),
3629 (b'http://www.w3.org/ns/ttml#styling', [
3630 b'http://www.w3.org/ns/ttml#style',
3631 ]),
3632 )
3633
3634 SUPPORTED_STYLING = [
3635 'color',
3636 'fontFamily',
3637 'fontSize',
3638 'fontStyle',
3639 'fontWeight',
3640 'textDecoration'
3641 ]
3642
3643 _x = functools.partial(xpath_with_ns, ns_map={
3644 'xml': 'http://www.w3.org/XML/1998/namespace',
3645 'ttml': 'http://www.w3.org/ns/ttml',
3646 'tts': 'http://www.w3.org/ns/ttml#styling',
3647 })
3648
3649 styles = {}
3650 default_style = {}
3651
3652 class TTMLPElementParser(object):
3653 _out = ''
3654 _unclosed_elements = []
3655 _applied_styles = []
3656
3657 def start(self, tag, attrib):
3658 if tag in (_x('ttml:br'), 'br'):
3659 self._out += '\n'
3660 else:
3661 unclosed_elements = []
3662 style = {}
3663 element_style_id = attrib.get('style')
3664 if default_style:
3665 style.update(default_style)
3666 if element_style_id:
3667 style.update(styles.get(element_style_id, {}))
3668 for prop in SUPPORTED_STYLING:
3669 prop_val = attrib.get(_x('tts:' + prop))
3670 if prop_val:
3671 style[prop] = prop_val
3672 if style:
3673 font = ''
3674 for k, v in sorted(style.items()):
3675 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3676 continue
3677 if k == 'color':
3678 font += ' color="%s"' % v
3679 elif k == 'fontSize':
3680 font += ' size="%s"' % v
3681 elif k == 'fontFamily':
3682 font += ' face="%s"' % v
3683 elif k == 'fontWeight' and v == 'bold':
3684 self._out += '<b>'
3685 unclosed_elements.append('b')
3686 elif k == 'fontStyle' and v == 'italic':
3687 self._out += '<i>'
3688 unclosed_elements.append('i')
3689 elif k == 'textDecoration' and v == 'underline':
3690 self._out += '<u>'
3691 unclosed_elements.append('u')
3692 if font:
3693 self._out += '<font' + font + '>'
3694 unclosed_elements.append('font')
3695 applied_style = {}
3696 if self._applied_styles:
3697 applied_style.update(self._applied_styles[-1])
3698 applied_style.update(style)
3699 self._applied_styles.append(applied_style)
3700 self._unclosed_elements.append(unclosed_elements)
3701
3702 def end(self, tag):
3703 if tag not in (_x('ttml:br'), 'br'):
3704 unclosed_elements = self._unclosed_elements.pop()
3705 for element in reversed(unclosed_elements):
3706 self._out += '</%s>' % element
3707 if unclosed_elements and self._applied_styles:
3708 self._applied_styles.pop()
3709
3710 def data(self, data):
3711 self._out += data
3712
3713 def close(self):
3714 return self._out.strip()
3715
3716 def parse_node(node):
3717 target = TTMLPElementParser()
3718 parser = xml.etree.ElementTree.XMLParser(target=target)
3719 parser.feed(xml.etree.ElementTree.tostring(node))
3720 return parser.close()
3721
3722 for k, v in LEGACY_NAMESPACES:
3723 for ns in v:
3724 dfxp_data = dfxp_data.replace(ns, k)
3725
3726 dfxp = compat_etree_fromstring(dfxp_data)
3727 out = []
3728 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3729
3730 if not paras:
3731 raise ValueError('Invalid dfxp/TTML subtitle')
3732
3733 repeat = False
3734 while True:
3735 for style in dfxp.findall(_x('.//ttml:style')):
3736 style_id = style.get('id') or style.get(_x('xml:id'))
3737 if not style_id:
3738 continue
3739 parent_style_id = style.get('style')
3740 if parent_style_id:
3741 if parent_style_id not in styles:
3742 repeat = True
3743 continue
3744 styles[style_id] = styles[parent_style_id].copy()
3745 for prop in SUPPORTED_STYLING:
3746 prop_val = style.get(_x('tts:' + prop))
3747 if prop_val:
3748 styles.setdefault(style_id, {})[prop] = prop_val
3749 if repeat:
3750 repeat = False
3751 else:
3752 break
3753
3754 for p in ('body', 'div'):
3755 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3756 if ele is None:
3757 continue
3758 style = styles.get(ele.get('style'))
3759 if not style:
3760 continue
3761 default_style.update(style)
3762
3763 for para, index in zip(paras, itertools.count(1)):
3764 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3765 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3766 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3767 if begin_time is None:
3768 continue
3769 if not end_time:
3770 if not dur:
3771 continue
3772 end_time = begin_time + dur
3773 out.append('%d\n%s --> %s\n%s\n\n' % (
3774 index,
3775 srt_subtitles_timecode(begin_time),
3776 srt_subtitles_timecode(end_time),
3777 parse_node(para)))
3778
3779 return ''.join(out)
3780
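# Minimal illustration (sketch, not a full TTML document): a paragraph such as
#   <p begin="1.0" end="2.5">Hello</p>
# is rendered as the SRT cue
#   1
#   00:00:01,000 --> 00:00:02,500
#   Hello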
3781
3782 def cli_option(params, command_option, param):
3783 param = params.get(param)
3784 if param:
3785 param = compat_str(param)
3786 return [command_option, param] if param is not None else []
3787
3788
3789 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3790 param = params.get(param)
3791 if param is None:
3792 return []
3793 assert isinstance(param, bool)
3794 if separator:
3795 return [command_option + separator + (true_value if param else false_value)]
3796 return [command_option, true_value if param else false_value]
3797
3798
3799 def cli_valueless_option(params, command_option, param, expected_value=True):
3800 param = params.get(param)
3801 return [command_option] if param == expected_value else []
3802
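# How these assemble external-binary arguments (hypothetical params dict and keys):
#   cli_option({'proxy': 'socks5://127.0.0.1:1080'}, '--proxy', 'proxy')
#   -> ['--proxy', 'socks5://127.0.0.1:1080']
#   cli_bool_option({'check_cert': True}, '--check-certificate', 'check_cert',
#                   'true', 'false', '=') -> ['--check-certificate=true']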
3803
3804 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3805 if isinstance(argdict, (list, tuple)): # for backward compatibility
3806 if use_compat:
3807 return argdict
3808 else:
3809 argdict = None
3810 if argdict is None:
3811 return default
3812 assert isinstance(argdict, dict)
3813
3814 assert isinstance(keys, (list, tuple))
3815 for key_list in keys:
3816 arg_list = list(filter(
3817 lambda x: x is not None,
3818 [argdict.get(key.lower()) for key in variadic(key_list)]))
3819 if arg_list:
3820 return [arg for args in arg_list for arg in args]
3821 return default
3822
3823
3824 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3825 main_key, exe = main_key.lower(), exe.lower()
3826 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3827 keys = [f'{root_key}{k}' for k in (keys or [''])]
3828 if root_key in keys:
3829 if main_key != exe:
3830 keys.append((main_key, exe))
3831 keys.append('default')
3832 else:
3833 use_compat = False
3834 return cli_configuration_args(argdict, keys, default, use_compat)
3835
3836
3837 class ISO639Utils(object):
3838 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3839 _lang_map = {
3840 'aa': 'aar',
3841 'ab': 'abk',
3842 'ae': 'ave',
3843 'af': 'afr',
3844 'ak': 'aka',
3845 'am': 'amh',
3846 'an': 'arg',
3847 'ar': 'ara',
3848 'as': 'asm',
3849 'av': 'ava',
3850 'ay': 'aym',
3851 'az': 'aze',
3852 'ba': 'bak',
3853 'be': 'bel',
3854 'bg': 'bul',
3855 'bh': 'bih',
3856 'bi': 'bis',
3857 'bm': 'bam',
3858 'bn': 'ben',
3859 'bo': 'bod',
3860 'br': 'bre',
3861 'bs': 'bos',
3862 'ca': 'cat',
3863 'ce': 'che',
3864 'ch': 'cha',
3865 'co': 'cos',
3866 'cr': 'cre',
3867 'cs': 'ces',
3868 'cu': 'chu',
3869 'cv': 'chv',
3870 'cy': 'cym',
3871 'da': 'dan',
3872 'de': 'deu',
3873 'dv': 'div',
3874 'dz': 'dzo',
3875 'ee': 'ewe',
3876 'el': 'ell',
3877 'en': 'eng',
3878 'eo': 'epo',
3879 'es': 'spa',
3880 'et': 'est',
3881 'eu': 'eus',
3882 'fa': 'fas',
3883 'ff': 'ful',
3884 'fi': 'fin',
3885 'fj': 'fij',
3886 'fo': 'fao',
3887 'fr': 'fra',
3888 'fy': 'fry',
3889 'ga': 'gle',
3890 'gd': 'gla',
3891 'gl': 'glg',
3892 'gn': 'grn',
3893 'gu': 'guj',
3894 'gv': 'glv',
3895 'ha': 'hau',
3896 'he': 'heb',
3897 'iw': 'heb', # Replaced by he in 1989 revision
3898 'hi': 'hin',
3899 'ho': 'hmo',
3900 'hr': 'hrv',
3901 'ht': 'hat',
3902 'hu': 'hun',
3903 'hy': 'hye',
3904 'hz': 'her',
3905 'ia': 'ina',
3906 'id': 'ind',
3907 'in': 'ind', # Replaced by id in 1989 revision
3908 'ie': 'ile',
3909 'ig': 'ibo',
3910 'ii': 'iii',
3911 'ik': 'ipk',
3912 'io': 'ido',
3913 'is': 'isl',
3914 'it': 'ita',
3915 'iu': 'iku',
3916 'ja': 'jpn',
3917 'jv': 'jav',
3918 'ka': 'kat',
3919 'kg': 'kon',
3920 'ki': 'kik',
3921 'kj': 'kua',
3922 'kk': 'kaz',
3923 'kl': 'kal',
3924 'km': 'khm',
3925 'kn': 'kan',
3926 'ko': 'kor',
3927 'kr': 'kau',
3928 'ks': 'kas',
3929 'ku': 'kur',
3930 'kv': 'kom',
3931 'kw': 'cor',
3932 'ky': 'kir',
3933 'la': 'lat',
3934 'lb': 'ltz',
3935 'lg': 'lug',
3936 'li': 'lim',
3937 'ln': 'lin',
3938 'lo': 'lao',
3939 'lt': 'lit',
3940 'lu': 'lub',
3941 'lv': 'lav',
3942 'mg': 'mlg',
3943 'mh': 'mah',
3944 'mi': 'mri',
3945 'mk': 'mkd',
3946 'ml': 'mal',
3947 'mn': 'mon',
3948 'mr': 'mar',
3949 'ms': 'msa',
3950 'mt': 'mlt',
3951 'my': 'mya',
3952 'na': 'nau',
3953 'nb': 'nob',
3954 'nd': 'nde',
3955 'ne': 'nep',
3956 'ng': 'ndo',
3957 'nl': 'nld',
3958 'nn': 'nno',
3959 'no': 'nor',
3960 'nr': 'nbl',
3961 'nv': 'nav',
3962 'ny': 'nya',
3963 'oc': 'oci',
3964 'oj': 'oji',
3965 'om': 'orm',
3966 'or': 'ori',
3967 'os': 'oss',
3968 'pa': 'pan',
3969 'pi': 'pli',
3970 'pl': 'pol',
3971 'ps': 'pus',
3972 'pt': 'por',
3973 'qu': 'que',
3974 'rm': 'roh',
3975 'rn': 'run',
3976 'ro': 'ron',
3977 'ru': 'rus',
3978 'rw': 'kin',
3979 'sa': 'san',
3980 'sc': 'srd',
3981 'sd': 'snd',
3982 'se': 'sme',
3983 'sg': 'sag',
3984 'si': 'sin',
3985 'sk': 'slk',
3986 'sl': 'slv',
3987 'sm': 'smo',
3988 'sn': 'sna',
3989 'so': 'som',
3990 'sq': 'sqi',
3991 'sr': 'srp',
3992 'ss': 'ssw',
3993 'st': 'sot',
3994 'su': 'sun',
3995 'sv': 'swe',
3996 'sw': 'swa',
3997 'ta': 'tam',
3998 'te': 'tel',
3999 'tg': 'tgk',
4000 'th': 'tha',
4001 'ti': 'tir',
4002 'tk': 'tuk',
4003 'tl': 'tgl',
4004 'tn': 'tsn',
4005 'to': 'ton',
4006 'tr': 'tur',
4007 'ts': 'tso',
4008 'tt': 'tat',
4009 'tw': 'twi',
4010 'ty': 'tah',
4011 'ug': 'uig',
4012 'uk': 'ukr',
4013 'ur': 'urd',
4014 'uz': 'uzb',
4015 've': 'ven',
4016 'vi': 'vie',
4017 'vo': 'vol',
4018 'wa': 'wln',
4019 'wo': 'wol',
4020 'xh': 'xho',
4021 'yi': 'yid',
4022 'ji': 'yid', # Replaced by yi in 1989 revision
4023 'yo': 'yor',
4024 'za': 'zha',
4025 'zh': 'zho',
4026 'zu': 'zul',
4027 }
4028
4029 @classmethod
4030 def short2long(cls, code):
4031 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4032 return cls._lang_map.get(code[:2])
4033
4034 @classmethod
4035 def long2short(cls, code):
4036 """Convert language code from ISO 639-2/T to ISO 639-1"""
4037 for short_name, long_name in cls._lang_map.items():
4038 if long_name == code:
4039 return short_name
4040
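# e.g. ISO639Utils.short2long('en') -> 'eng'; ISO639Utils.long2short('fra') -> 'fr'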
4041
4042 class ISO3166Utils(object):
4043 # From http://data.okfn.org/data/core/country-list
4044 _country_map = {
4045 'AF': 'Afghanistan',
4046 'AX': 'Åland Islands',
4047 'AL': 'Albania',
4048 'DZ': 'Algeria',
4049 'AS': 'American Samoa',
4050 'AD': 'Andorra',
4051 'AO': 'Angola',
4052 'AI': 'Anguilla',
4053 'AQ': 'Antarctica',
4054 'AG': 'Antigua and Barbuda',
4055 'AR': 'Argentina',
4056 'AM': 'Armenia',
4057 'AW': 'Aruba',
4058 'AU': 'Australia',
4059 'AT': 'Austria',
4060 'AZ': 'Azerbaijan',
4061 'BS': 'Bahamas',
4062 'BH': 'Bahrain',
4063 'BD': 'Bangladesh',
4064 'BB': 'Barbados',
4065 'BY': 'Belarus',
4066 'BE': 'Belgium',
4067 'BZ': 'Belize',
4068 'BJ': 'Benin',
4069 'BM': 'Bermuda',
4070 'BT': 'Bhutan',
4071 'BO': 'Bolivia, Plurinational State of',
4072 'BQ': 'Bonaire, Sint Eustatius and Saba',
4073 'BA': 'Bosnia and Herzegovina',
4074 'BW': 'Botswana',
4075 'BV': 'Bouvet Island',
4076 'BR': 'Brazil',
4077 'IO': 'British Indian Ocean Territory',
4078 'BN': 'Brunei Darussalam',
4079 'BG': 'Bulgaria',
4080 'BF': 'Burkina Faso',
4081 'BI': 'Burundi',
4082 'KH': 'Cambodia',
4083 'CM': 'Cameroon',
4084 'CA': 'Canada',
4085 'CV': 'Cape Verde',
4086 'KY': 'Cayman Islands',
4087 'CF': 'Central African Republic',
4088 'TD': 'Chad',
4089 'CL': 'Chile',
4090 'CN': 'China',
4091 'CX': 'Christmas Island',
4092 'CC': 'Cocos (Keeling) Islands',
4093 'CO': 'Colombia',
4094 'KM': 'Comoros',
4095 'CG': 'Congo',
4096 'CD': 'Congo, the Democratic Republic of the',
4097 'CK': 'Cook Islands',
4098 'CR': 'Costa Rica',
4099 'CI': 'Côte d\'Ivoire',
4100 'HR': 'Croatia',
4101 'CU': 'Cuba',
4102 'CW': 'Curaçao',
4103 'CY': 'Cyprus',
4104 'CZ': 'Czech Republic',
4105 'DK': 'Denmark',
4106 'DJ': 'Djibouti',
4107 'DM': 'Dominica',
4108 'DO': 'Dominican Republic',
4109 'EC': 'Ecuador',
4110 'EG': 'Egypt',
4111 'SV': 'El Salvador',
4112 'GQ': 'Equatorial Guinea',
4113 'ER': 'Eritrea',
4114 'EE': 'Estonia',
4115 'ET': 'Ethiopia',
4116 'FK': 'Falkland Islands (Malvinas)',
4117 'FO': 'Faroe Islands',
4118 'FJ': 'Fiji',
4119 'FI': 'Finland',
4120 'FR': 'France',
4121 'GF': 'French Guiana',
4122 'PF': 'French Polynesia',
4123 'TF': 'French Southern Territories',
4124 'GA': 'Gabon',
4125 'GM': 'Gambia',
4126 'GE': 'Georgia',
4127 'DE': 'Germany',
4128 'GH': 'Ghana',
4129 'GI': 'Gibraltar',
4130 'GR': 'Greece',
4131 'GL': 'Greenland',
4132 'GD': 'Grenada',
4133 'GP': 'Guadeloupe',
4134 'GU': 'Guam',
4135 'GT': 'Guatemala',
4136 'GG': 'Guernsey',
4137 'GN': 'Guinea',
4138 'GW': 'Guinea-Bissau',
4139 'GY': 'Guyana',
4140 'HT': 'Haiti',
4141 'HM': 'Heard Island and McDonald Islands',
4142 'VA': 'Holy See (Vatican City State)',
4143 'HN': 'Honduras',
4144 'HK': 'Hong Kong',
4145 'HU': 'Hungary',
4146 'IS': 'Iceland',
4147 'IN': 'India',
4148 'ID': 'Indonesia',
4149 'IR': 'Iran, Islamic Republic of',
4150 'IQ': 'Iraq',
4151 'IE': 'Ireland',
4152 'IM': 'Isle of Man',
4153 'IL': 'Israel',
4154 'IT': 'Italy',
4155 'JM': 'Jamaica',
4156 'JP': 'Japan',
4157 'JE': 'Jersey',
4158 'JO': 'Jordan',
4159 'KZ': 'Kazakhstan',
4160 'KE': 'Kenya',
4161 'KI': 'Kiribati',
4162 'KP': 'Korea, Democratic People\'s Republic of',
4163 'KR': 'Korea, Republic of',
4164 'KW': 'Kuwait',
4165 'KG': 'Kyrgyzstan',
4166 'LA': 'Lao People\'s Democratic Republic',
4167 'LV': 'Latvia',
4168 'LB': 'Lebanon',
4169 'LS': 'Lesotho',
4170 'LR': 'Liberia',
4171 'LY': 'Libya',
4172 'LI': 'Liechtenstein',
4173 'LT': 'Lithuania',
4174 'LU': 'Luxembourg',
4175 'MO': 'Macao',
4176 'MK': 'Macedonia, the Former Yugoslav Republic of',
4177 'MG': 'Madagascar',
4178 'MW': 'Malawi',
4179 'MY': 'Malaysia',
4180 'MV': 'Maldives',
4181 'ML': 'Mali',
4182 'MT': 'Malta',
4183 'MH': 'Marshall Islands',
4184 'MQ': 'Martinique',
4185 'MR': 'Mauritania',
4186 'MU': 'Mauritius',
4187 'YT': 'Mayotte',
4188 'MX': 'Mexico',
4189 'FM': 'Micronesia, Federated States of',
4190 'MD': 'Moldova, Republic of',
4191 'MC': 'Monaco',
4192 'MN': 'Mongolia',
4193 'ME': 'Montenegro',
4194 'MS': 'Montserrat',
4195 'MA': 'Morocco',
4196 'MZ': 'Mozambique',
4197 'MM': 'Myanmar',
4198 'NA': 'Namibia',
4199 'NR': 'Nauru',
4200 'NP': 'Nepal',
4201 'NL': 'Netherlands',
4202 'NC': 'New Caledonia',
4203 'NZ': 'New Zealand',
4204 'NI': 'Nicaragua',
4205 'NE': 'Niger',
4206 'NG': 'Nigeria',
4207 'NU': 'Niue',
4208 'NF': 'Norfolk Island',
4209 'MP': 'Northern Mariana Islands',
4210 'NO': 'Norway',
4211 'OM': 'Oman',
4212 'PK': 'Pakistan',
4213 'PW': 'Palau',
4214 'PS': 'Palestine, State of',
4215 'PA': 'Panama',
4216 'PG': 'Papua New Guinea',
4217 'PY': 'Paraguay',
4218 'PE': 'Peru',
4219 'PH': 'Philippines',
4220 'PN': 'Pitcairn',
4221 'PL': 'Poland',
4222 'PT': 'Portugal',
4223 'PR': 'Puerto Rico',
4224 'QA': 'Qatar',
4225 'RE': 'Réunion',
4226 'RO': 'Romania',
4227 'RU': 'Russian Federation',
4228 'RW': 'Rwanda',
4229 'BL': 'Saint Barthélemy',
4230 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4231 'KN': 'Saint Kitts and Nevis',
4232 'LC': 'Saint Lucia',
4233 'MF': 'Saint Martin (French part)',
4234 'PM': 'Saint Pierre and Miquelon',
4235 'VC': 'Saint Vincent and the Grenadines',
4236 'WS': 'Samoa',
4237 'SM': 'San Marino',
4238 'ST': 'Sao Tome and Principe',
4239 'SA': 'Saudi Arabia',
4240 'SN': 'Senegal',
4241 'RS': 'Serbia',
4242 'SC': 'Seychelles',
4243 'SL': 'Sierra Leone',
4244 'SG': 'Singapore',
4245 'SX': 'Sint Maarten (Dutch part)',
4246 'SK': 'Slovakia',
4247 'SI': 'Slovenia',
4248 'SB': 'Solomon Islands',
4249 'SO': 'Somalia',
4250 'ZA': 'South Africa',
4251 'GS': 'South Georgia and the South Sandwich Islands',
4252 'SS': 'South Sudan',
4253 'ES': 'Spain',
4254 'LK': 'Sri Lanka',
4255 'SD': 'Sudan',
4256 'SR': 'Suriname',
4257 'SJ': 'Svalbard and Jan Mayen',
4258 'SZ': 'Swaziland',
4259 'SE': 'Sweden',
4260 'CH': 'Switzerland',
4261 'SY': 'Syrian Arab Republic',
4262 'TW': 'Taiwan, Province of China',
4263 'TJ': 'Tajikistan',
4264 'TZ': 'Tanzania, United Republic of',
4265 'TH': 'Thailand',
4266 'TL': 'Timor-Leste',
4267 'TG': 'Togo',
4268 'TK': 'Tokelau',
4269 'TO': 'Tonga',
4270 'TT': 'Trinidad and Tobago',
4271 'TN': 'Tunisia',
4272 'TR': 'Turkey',
4273 'TM': 'Turkmenistan',
4274 'TC': 'Turks and Caicos Islands',
4275 'TV': 'Tuvalu',
4276 'UG': 'Uganda',
4277 'UA': 'Ukraine',
4278 'AE': 'United Arab Emirates',
4279 'GB': 'United Kingdom',
4280 'US': 'United States',
4281 'UM': 'United States Minor Outlying Islands',
4282 'UY': 'Uruguay',
4283 'UZ': 'Uzbekistan',
4284 'VU': 'Vanuatu',
4285 'VE': 'Venezuela, Bolivarian Republic of',
4286 'VN': 'Viet Nam',
4287 'VG': 'Virgin Islands, British',
4288 'VI': 'Virgin Islands, U.S.',
4289 'WF': 'Wallis and Futuna',
4290 'EH': 'Western Sahara',
4291 'YE': 'Yemen',
4292 'ZM': 'Zambia',
4293 'ZW': 'Zimbabwe',
4294 }
4295
4296 @classmethod
4297 def short2full(cls, code):
4298 """Convert an ISO 3166-2 country code to the corresponding full name"""
4299 return cls._country_map.get(code.upper())
4300
4301
4302 class GeoUtils(object):
4303 # Major IPv4 address blocks per country
4304 _country_ip_map = {
4305 'AD': '46.172.224.0/19',
4306 'AE': '94.200.0.0/13',
4307 'AF': '149.54.0.0/17',
4308 'AG': '209.59.64.0/18',
4309 'AI': '204.14.248.0/21',
4310 'AL': '46.99.0.0/16',
4311 'AM': '46.70.0.0/15',
4312 'AO': '105.168.0.0/13',
4313 'AP': '182.50.184.0/21',
4314 'AQ': '23.154.160.0/24',
4315 'AR': '181.0.0.0/12',
4316 'AS': '202.70.112.0/20',
4317 'AT': '77.116.0.0/14',
4318 'AU': '1.128.0.0/11',
4319 'AW': '181.41.0.0/18',
4320 'AX': '185.217.4.0/22',
4321 'AZ': '5.197.0.0/16',
4322 'BA': '31.176.128.0/17',
4323 'BB': '65.48.128.0/17',
4324 'BD': '114.130.0.0/16',
4325 'BE': '57.0.0.0/8',
4326 'BF': '102.178.0.0/15',
4327 'BG': '95.42.0.0/15',
4328 'BH': '37.131.0.0/17',
4329 'BI': '154.117.192.0/18',
4330 'BJ': '137.255.0.0/16',
4331 'BL': '185.212.72.0/23',
4332 'BM': '196.12.64.0/18',
4333 'BN': '156.31.0.0/16',
4334 'BO': '161.56.0.0/16',
4335 'BQ': '161.0.80.0/20',
4336 'BR': '191.128.0.0/12',
4337 'BS': '24.51.64.0/18',
4338 'BT': '119.2.96.0/19',
4339 'BW': '168.167.0.0/16',
4340 'BY': '178.120.0.0/13',
4341 'BZ': '179.42.192.0/18',
4342 'CA': '99.224.0.0/11',
4343 'CD': '41.243.0.0/16',
4344 'CF': '197.242.176.0/21',
4345 'CG': '160.113.0.0/16',
4346 'CH': '85.0.0.0/13',
4347 'CI': '102.136.0.0/14',
4348 'CK': '202.65.32.0/19',
4349 'CL': '152.172.0.0/14',
4350 'CM': '102.244.0.0/14',
4351 'CN': '36.128.0.0/10',
4352 'CO': '181.240.0.0/12',
4353 'CR': '201.192.0.0/12',
4354 'CU': '152.206.0.0/15',
4355 'CV': '165.90.96.0/19',
4356 'CW': '190.88.128.0/17',
4357 'CY': '31.153.0.0/16',
4358 'CZ': '88.100.0.0/14',
4359 'DE': '53.0.0.0/8',
4360 'DJ': '197.241.0.0/17',
4361 'DK': '87.48.0.0/12',
4362 'DM': '192.243.48.0/20',
4363 'DO': '152.166.0.0/15',
4364 'DZ': '41.96.0.0/12',
4365 'EC': '186.68.0.0/15',
4366 'EE': '90.190.0.0/15',
4367 'EG': '156.160.0.0/11',
4368 'ER': '196.200.96.0/20',
4369 'ES': '88.0.0.0/11',
4370 'ET': '196.188.0.0/14',
4371 'EU': '2.16.0.0/13',
4372 'FI': '91.152.0.0/13',
4373 'FJ': '144.120.0.0/16',
4374 'FK': '80.73.208.0/21',
4375 'FM': '119.252.112.0/20',
4376 'FO': '88.85.32.0/19',
4377 'FR': '90.0.0.0/9',
4378 'GA': '41.158.0.0/15',
4379 'GB': '25.0.0.0/8',
4380 'GD': '74.122.88.0/21',
4381 'GE': '31.146.0.0/16',
4382 'GF': '161.22.64.0/18',
4383 'GG': '62.68.160.0/19',
4384 'GH': '154.160.0.0/12',
4385 'GI': '95.164.0.0/16',
4386 'GL': '88.83.0.0/19',
4387 'GM': '160.182.0.0/15',
4388 'GN': '197.149.192.0/18',
4389 'GP': '104.250.0.0/19',
4390 'GQ': '105.235.224.0/20',
4391 'GR': '94.64.0.0/13',
4392 'GT': '168.234.0.0/16',
4393 'GU': '168.123.0.0/16',
4394 'GW': '197.214.80.0/20',
4395 'GY': '181.41.64.0/18',
4396 'HK': '113.252.0.0/14',
4397 'HN': '181.210.0.0/16',
4398 'HR': '93.136.0.0/13',
4399 'HT': '148.102.128.0/17',
4400 'HU': '84.0.0.0/14',
4401 'ID': '39.192.0.0/10',
4402 'IE': '87.32.0.0/12',
4403 'IL': '79.176.0.0/13',
4404 'IM': '5.62.80.0/20',
4405 'IN': '117.192.0.0/10',
4406 'IO': '203.83.48.0/21',
4407 'IQ': '37.236.0.0/14',
4408 'IR': '2.176.0.0/12',
4409 'IS': '82.221.0.0/16',
4410 'IT': '79.0.0.0/10',
4411 'JE': '87.244.64.0/18',
4412 'JM': '72.27.0.0/17',
4413 'JO': '176.29.0.0/16',
4414 'JP': '133.0.0.0/8',
4415 'KE': '105.48.0.0/12',
4416 'KG': '158.181.128.0/17',
4417 'KH': '36.37.128.0/17',
4418 'KI': '103.25.140.0/22',
4419 'KM': '197.255.224.0/20',
4420 'KN': '198.167.192.0/19',
4421 'KP': '175.45.176.0/22',
4422 'KR': '175.192.0.0/10',
4423 'KW': '37.36.0.0/14',
4424 'KY': '64.96.0.0/15',
4425 'KZ': '2.72.0.0/13',
4426 'LA': '115.84.64.0/18',
4427 'LB': '178.135.0.0/16',
4428 'LC': '24.92.144.0/20',
4429 'LI': '82.117.0.0/19',
4430 'LK': '112.134.0.0/15',
4431 'LR': '102.183.0.0/16',
4432 'LS': '129.232.0.0/17',
4433 'LT': '78.56.0.0/13',
4434 'LU': '188.42.0.0/16',
4435 'LV': '46.109.0.0/16',
4436 'LY': '41.252.0.0/14',
4437 'MA': '105.128.0.0/11',
4438 'MC': '88.209.64.0/18',
4439 'MD': '37.246.0.0/16',
4440 'ME': '178.175.0.0/17',
4441 'MF': '74.112.232.0/21',
4442 'MG': '154.126.0.0/17',
4443 'MH': '117.103.88.0/21',
4444 'MK': '77.28.0.0/15',
4445 'ML': '154.118.128.0/18',
4446 'MM': '37.111.0.0/17',
4447 'MN': '49.0.128.0/17',
4448 'MO': '60.246.0.0/16',
4449 'MP': '202.88.64.0/20',
4450 'MQ': '109.203.224.0/19',
4451 'MR': '41.188.64.0/18',
4452 'MS': '208.90.112.0/22',
4453 'MT': '46.11.0.0/16',
4454 'MU': '105.16.0.0/12',
4455 'MV': '27.114.128.0/18',
4456 'MW': '102.70.0.0/15',
4457 'MX': '187.192.0.0/11',
4458 'MY': '175.136.0.0/13',
4459 'MZ': '197.218.0.0/15',
4460 'NA': '41.182.0.0/16',
4461 'NC': '101.101.0.0/18',
4462 'NE': '197.214.0.0/18',
4463 'NF': '203.17.240.0/22',
4464 'NG': '105.112.0.0/12',
4465 'NI': '186.76.0.0/15',
4466 'NL': '145.96.0.0/11',
4467 'NO': '84.208.0.0/13',
4468 'NP': '36.252.0.0/15',
4469 'NR': '203.98.224.0/19',
4470 'NU': '49.156.48.0/22',
4471 'NZ': '49.224.0.0/14',
4472 'OM': '5.36.0.0/15',
4473 'PA': '186.72.0.0/15',
4474 'PE': '186.160.0.0/14',
4475 'PF': '123.50.64.0/18',
4476 'PG': '124.240.192.0/19',
4477 'PH': '49.144.0.0/13',
4478 'PK': '39.32.0.0/11',
4479 'PL': '83.0.0.0/11',
4480 'PM': '70.36.0.0/20',
4481 'PR': '66.50.0.0/16',
4482 'PS': '188.161.0.0/16',
4483 'PT': '85.240.0.0/13',
4484 'PW': '202.124.224.0/20',
4485 'PY': '181.120.0.0/14',
4486 'QA': '37.210.0.0/15',
4487 'RE': '102.35.0.0/16',
4488 'RO': '79.112.0.0/13',
4489 'RS': '93.86.0.0/15',
4490 'RU': '5.136.0.0/13',
4491 'RW': '41.186.0.0/16',
4492 'SA': '188.48.0.0/13',
4493 'SB': '202.1.160.0/19',
4494 'SC': '154.192.0.0/11',
4495 'SD': '102.120.0.0/13',
4496 'SE': '78.64.0.0/12',
4497 'SG': '8.128.0.0/10',
4498 'SI': '188.196.0.0/14',
4499 'SK': '78.98.0.0/15',
4500 'SL': '102.143.0.0/17',
4501 'SM': '89.186.32.0/19',
4502 'SN': '41.82.0.0/15',
4503 'SO': '154.115.192.0/18',
4504 'SR': '186.179.128.0/17',
4505 'SS': '105.235.208.0/21',
4506 'ST': '197.159.160.0/19',
4507 'SV': '168.243.0.0/16',
4508 'SX': '190.102.0.0/20',
4509 'SY': '5.0.0.0/16',
4510 'SZ': '41.84.224.0/19',
4511 'TC': '65.255.48.0/20',
4512 'TD': '154.68.128.0/19',
4513 'TG': '196.168.0.0/14',
4514 'TH': '171.96.0.0/13',
4515 'TJ': '85.9.128.0/18',
4516 'TK': '27.96.24.0/21',
4517 'TL': '180.189.160.0/20',
4518 'TM': '95.85.96.0/19',
4519 'TN': '197.0.0.0/11',
4520 'TO': '175.176.144.0/21',
4521 'TR': '78.160.0.0/11',
4522 'TT': '186.44.0.0/15',
4523 'TV': '202.2.96.0/19',
4524 'TW': '120.96.0.0/11',
4525 'TZ': '156.156.0.0/14',
4526 'UA': '37.52.0.0/14',
4527 'UG': '102.80.0.0/13',
4528 'US': '6.0.0.0/8',
4529 'UY': '167.56.0.0/13',
4530 'UZ': '84.54.64.0/18',
4531 'VA': '212.77.0.0/19',
4532 'VC': '207.191.240.0/21',
4533 'VE': '186.88.0.0/13',
4534 'VG': '66.81.192.0/20',
4535 'VI': '146.226.0.0/16',
4536 'VN': '14.160.0.0/11',
4537 'VU': '202.80.32.0/20',
4538 'WF': '117.20.32.0/21',
4539 'WS': '202.4.32.0/19',
4540 'YE': '134.35.0.0/16',
4541 'YT': '41.242.116.0/22',
4542 'ZA': '41.0.0.0/11',
4543 'ZM': '102.144.0.0/13',
4544 'ZW': '102.177.192.0/18',
4545 }
4546
4547 @classmethod
4548 def random_ipv4(cls, code_or_block):
4549 if len(code_or_block) == 2:
4550 block = cls._country_ip_map.get(code_or_block.upper())
4551 if not block:
4552 return None
4553 else:
4554 block = code_or_block
4555 addr, preflen = block.split('/')
4556 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4557 addr_max = addr_min | (0xffffffff >> int(preflen))
4558 return compat_str(socket.inet_ntoa(
4559 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
4560
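# Illustration: GeoUtils.random_ipv4('DE') returns a random address inside
# 53.0.0.0/8 (e.g. '53.17.42.9'); a CIDR block may also be passed directly,
# e.g. GeoUtils.random_ipv4('10.0.0.0/8')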
4561
4562 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4563 def __init__(self, proxies=None):
4564 # Set default handlers
4565 for type in ('http', 'https'):
4566 setattr(self, '%s_open' % type,
4567 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4568 meth(r, proxy, type))
4569 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4570
4571 def proxy_open(self, req, proxy, type):
4572 req_proxy = req.headers.get('Ytdl-request-proxy')
4573 if req_proxy is not None:
4574 proxy = req_proxy
4575 del req.headers['Ytdl-request-proxy']
4576
4577 if proxy == '__noproxy__':
4578 return None # No Proxy
4579 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4580 req.add_header('Ytdl-socks-proxy', proxy)
4581 # yt-dlp's http/https handlers do the wrapping of the socket with SOCKS
4582 return None
4583 return compat_urllib_request.ProxyHandler.proxy_open(
4584 self, req, proxy, type)
4585
4586
4587 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4588 # released into Public Domain
4589 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4590
4591 def long_to_bytes(n, blocksize=0):
4592 """long_to_bytes(n:long, blocksize:int) : string
4593 Convert a long integer to a byte string.
4594
4595 If optional blocksize is given and greater than zero, pad the front of the
4596 byte string with binary zeros so that the length is a multiple of
4597 blocksize.
4598 """
4599 # after much testing, this algorithm was deemed to be the fastest
4600 s = b''
4601 n = int(n)
4602 while n > 0:
4603 s = compat_struct_pack('>I', n & 0xffffffff) + s
4604 n = n >> 32
4605 # strip off leading zeros
4606 for i in range(len(s)):
4607 if s[i] != b'\000'[0]:
4608 break
4609 else:
4610 # only happens when n == 0
4611 s = b'\000'
4612 i = 0
4613 s = s[i:]
4614 # add back some pad bytes. this could be done more efficiently w.r.t. the
4615 # de-padding being done above, but sigh...
4616 if blocksize > 0 and len(s) % blocksize:
4617 s = (blocksize - len(s) % blocksize) * b'\000' + s
4618 return s
4619
4620
4621 def bytes_to_long(s):
4622 """bytes_to_long(string) : long
4623 Convert a byte string to a long integer.
4624
4625 This is (essentially) the inverse of long_to_bytes().
4626 """
4627 acc = 0
4628 length = len(s)
4629 if length % 4:
4630 extra = (4 - length % 4)
4631 s = b'\000' * extra + s
4632 length = length + extra
4633 for i in range(0, length, 4):
4634 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4635 return acc
4636
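# Round-trip example:
#   long_to_bytes(65537)           -> b'\x01\x00\x01'
#   bytes_to_long(b'\x01\x00\x01') -> 65537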
4637
4638 def ohdave_rsa_encrypt(data, exponent, modulus):
4639 '''
4640 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4641
4642 Input:
4643 data: data to encrypt, bytes-like object
4644 exponent, modulus: parameter e and N of RSA algorithm, both integer
4645 Output: hex string of encrypted data
4646
4647 Limitation: supports one block encryption only
4648 '''
4649
4650 payload = int(binascii.hexlify(data[::-1]), 16)
4651 encrypted = pow(payload, exponent, modulus)
4652 return '%x' % encrypted
4653
4654
4655 def pkcs1pad(data, length):
4656 """
4657 Padding input data with PKCS#1 scheme
4658
4659 @param {int[]} data input data
4660 @param {int} length target length
4661 @returns {int[]} padded data
4662 """
4663 if len(data) > length - 11:
4664 raise ValueError('Input data too long for PKCS#1 padding')
4665
4666 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # padding octets must be nonzero per PKCS#1 v1.5
4667 return [0, 2] + pseudo_random + [0] + data
4668
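# e.g. pkcs1pad([0x41], 16) -> [0, 2, <12 random nonzero octets>, 0, 0x41]
# (16 octets total, following the PKCS#1 v1.5 block format 0x00 0x02 PS 0x00 D)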
4669
4670 def encode_base_n(num, n, table=None):
4671 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4672 if not table:
4673 table = FULL_TABLE[:n]
4674
4675 if n > len(table):
4676 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4677
4678 if num == 0:
4679 return table[0]
4680
4681 ret = ''
4682 while num:
4683 ret = table[num % n] + ret
4684 num = num // n
4685 return ret
4686
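# e.g. encode_base_n(255, 16) -> 'ff'; encode_base_n(10, 2) -> '1010'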
4687
4688 def decode_packed_codes(code):
4689 mobj = re.search(PACKED_CODES_RE, code)
4690 obfuscated_code, base, count, symbols = mobj.groups()
4691 base = int(base)
4692 count = int(count)
4693 symbols = symbols.split('|')
4694 symbol_table = {}
4695
4696 while count:
4697 count -= 1
4698 base_n_count = encode_base_n(count, base)
4699 symbol_table[base_n_count] = symbols[count] or base_n_count
4700
4701 return re.sub(
4702 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4703 obfuscated_code)
4704
4705
4706 def caesar(s, alphabet, shift):
4707 if shift == 0:
4708 return s
4709 l = len(alphabet)
4710 return ''.join(
4711 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4712 for c in s)
4713
4714
4715 def rot47(s):
4716 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
4717
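# rot47 is its own inverse over the printable ASCII range:
#   rot47('test') -> 'E6DE'; rot47('E6DE') -> 'test'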
4718
4719 def parse_m3u8_attributes(attrib):
4720 info = {}
4721 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4722 if val.startswith('"'):
4723 val = val[1:-1]
4724 info[key] = val
4725 return info
4726
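# e.g. parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.4d401f"')
# -> {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.4d401f'}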
4727
4728 def urshift(val, n):
4729 return val >> n if val >= 0 else (val + 0x100000000) >> n
4730
4731
4732 # Based on png2str() written by @gdkchan and improved by @yokrysty
4733 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4734 def decode_png(png_data):
4735 # Reference: https://www.w3.org/TR/PNG/
4736 header = png_data[8:]
4737
4738 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4739 raise IOError('Not a valid PNG file.')
4740
4741 int_map = {1: '>B', 2: '>H', 4: '>I'}
4742 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4743
4744 chunks = []
4745
4746 while header:
4747 length = unpack_integer(header[:4])
4748 header = header[4:]
4749
4750 chunk_type = header[:4]
4751 header = header[4:]
4752
4753 chunk_data = header[:length]
4754 header = header[length:]
4755
4756 header = header[4:] # Skip CRC
4757
4758 chunks.append({
4759 'type': chunk_type,
4760 'length': length,
4761 'data': chunk_data
4762 })
4763
4764 ihdr = chunks[0]['data']
4765
4766 width = unpack_integer(ihdr[:4])
4767 height = unpack_integer(ihdr[4:8])
4768
4769 idat = b''
4770
4771 for chunk in chunks:
4772 if chunk['type'] == b'IDAT':
4773 idat += chunk['data']
4774
4775 if not idat:
4776 raise IOError('Unable to read PNG data.')
4777
4778 decompressed_data = bytearray(zlib.decompress(idat))
4779
4780 stride = width * 3
4781 pixels = []
4782
4783 def _get_pixel(idx):
4784 x = idx % stride
4785 y = idx // stride
4786 return pixels[y][x]
4787
4788 for y in range(height):
4789 basePos = y * (1 + stride)
4790 filter_type = decompressed_data[basePos]
4791
4792 current_row = []
4793
4794 pixels.append(current_row)
4795
4796 for x in range(stride):
4797 color = decompressed_data[1 + basePos + x]
4798 basex = y * stride + x
4799 left = 0
4800 up = 0
4801
4802 if x > 2:
4803 left = _get_pixel(basex - 3)
4804 if y > 0:
4805 up = _get_pixel(basex - stride)
4806
4807 if filter_type == 1: # Sub
4808 color = (color + left) & 0xff
4809 elif filter_type == 2: # Up
4810 color = (color + up) & 0xff
4811 elif filter_type == 3: # Average
4812 color = (color + ((left + up) >> 1)) & 0xff
4813 elif filter_type == 4: # Paeth
4814 a = left
4815 b = up
4816 c = 0
4817
4818 if x > 2 and y > 0:
4819 c = _get_pixel(basex - stride - 3)
4820
4821 p = a + b - c
4822
4823 pa = abs(p - a)
4824 pb = abs(p - b)
4825 pc = abs(p - c)
4826
4827 if pa <= pb and pa <= pc:
4828 color = (color + a) & 0xff
4829 elif pb <= pc:
4830 color = (color + b) & 0xff
4831 else:
4832 color = (color + c) & 0xff
4833
4834 current_row.append(color)
4835
4836 return width, height, pixels
4837
4838
4839 def write_xattr(path, key, value):
4840 # This mess below finds the best xattr tool for the job
4841 try:
4842 # try the pyxattr module...
4843 import xattr
4844
4845 if hasattr(xattr, 'set'): # pyxattr
4846 # Unicode arguments are not supported in python-pyxattr until
4847 # version 0.5.0
4848 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4849 pyxattr_required_version = '0.5.0'
4850 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
4851 # TODO: fallback to CLI tools
4852 raise XAttrUnavailableError(
4853 'python-pyxattr is detected but is too old. '
4854 'yt-dlp requires %s or above while your version is %s. '
4855 'Falling back to other xattr implementations' % (
4856 pyxattr_required_version, xattr.__version__))
4857
4858 setxattr = xattr.set
4859 else: # xattr
4860 setxattr = xattr.setxattr
4861
4862 try:
4863 setxattr(path, key, value)
4864 except EnvironmentError as e:
4865 raise XAttrMetadataError(e.errno, e.strerror)
4866
4867 except ImportError:
4868 if compat_os_name == 'nt':
4869 # Write xattrs to NTFS Alternate Data Streams:
4870 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4871 assert ':' not in key
4872 assert os.path.exists(path)
4873
4874 ads_fn = path + ':' + key
4875 try:
4876 with open(ads_fn, 'wb') as f:
4877 f.write(value)
4878 except EnvironmentError as e:
4879 raise XAttrMetadataError(e.errno, e.strerror)
4880 else:
4881 user_has_setfattr = check_executable('setfattr', ['--version'])
4882 user_has_xattr = check_executable('xattr', ['-h'])
4883
4884 if user_has_setfattr or user_has_xattr:
4885
4886 value = value.decode('utf-8')
4887 if user_has_setfattr:
4888 executable = 'setfattr'
4889 opts = ['-n', key, '-v', value]
4890 elif user_has_xattr:
4891 executable = 'xattr'
4892 opts = ['-w', key, value]
4893
4894 cmd = ([encodeFilename(executable, True)]
4895 + [encodeArgument(o) for o in opts]
4896 + [encodeFilename(path, True)])
4897
4898 try:
4899 p = Popen(
4900 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4901 except EnvironmentError as e:
4902 raise XAttrMetadataError(e.errno, e.strerror)
4903 stdout, stderr = p.communicate_or_kill()
4904 stderr = stderr.decode('utf-8', 'replace')
4905 if p.returncode != 0:
4906 raise XAttrMetadataError(p.returncode, stderr)
4907
4908 else:
4909 # On Unix, but we can't find pyxattr, setfattr, or xattr.
4910 if sys.platform.startswith('linux'):
4911 raise XAttrUnavailableError(
4912 "Couldn't find a tool to set the xattrs. "
4913 "Install either the python 'pyxattr' or 'xattr' "
4914 "modules, or the GNU 'attr' package "
4915 "(which contains the 'setfattr' tool).")
4916 else:
4917 raise XAttrUnavailableError(
4918 "Couldn't find a tool to set the xattrs. "
4919 "Install either the python 'xattr' module, "
4920 "or the 'xattr' binary.")
4921
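# Typical call (illustrative; the value must be bytes):
#   write_xattr('video.mp4', 'user.xdg.referrer.url', b'https://example.com/')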
4922
4923 def random_birthday(year_field, month_field, day_field):
4924 start_date = datetime.date(1950, 1, 1)
4925 end_date = datetime.date(1995, 12, 31)
4926 offset = random.randint(0, (end_date - start_date).days)
4927 random_date = start_date + datetime.timedelta(offset)
4928 return {
4929 year_field: str(random_date.year),
4930 month_field: str(random_date.month),
4931 day_field: str(random_date.day),
4932 }
4933
4934
4935 # Templates for internet shortcut files, which are plain text files.
4936 DOT_URL_LINK_TEMPLATE = '''
4937 [InternetShortcut]
4938 URL=%(url)s
4939 '''.lstrip()
4940
4941 DOT_WEBLOC_LINK_TEMPLATE = '''
4942 <?xml version="1.0" encoding="UTF-8"?>
4943 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4944 <plist version="1.0">
4945 <dict>
4946 \t<key>URL</key>
4947 \t<string>%(url)s</string>
4948 </dict>
4949 </plist>
4950 '''.lstrip()
4951
4952 DOT_DESKTOP_LINK_TEMPLATE = '''
4953 [Desktop Entry]
4954 Encoding=UTF-8
4955 Name=%(filename)s
4956 Type=Link
4957 URL=%(url)s
4958 Icon=text-html
4959 '''.lstrip()
4960
4961 LINK_TEMPLATES = {
4962 'url': DOT_URL_LINK_TEMPLATE,
4963 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4964 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4965 }
4966
4967
4968 def iri_to_uri(iri):
4969 """
4970 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4971
4972 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4973 """
4974
4975 iri_parts = compat_urllib_parse_urlparse(iri)
4976
4977 if '[' in iri_parts.netloc:
4978 raise ValueError('IPv6 URIs are not yet supported.')
4979 # Querying `.netloc`, when there's only one bracket, also raises a ValueError.
4980
4981 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
4982
4983 net_location = ''
4984 if iri_parts.username:
4985 net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
4986 if iri_parts.password is not None:
4987 net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
4988 net_location += '@'
4989
4990 net_location += iri_parts.hostname.encode('idna').decode('utf-8') # Punycode for Unicode hostnames.
4991 # The 'idna' encoding produces ASCII text.
4992 if iri_parts.port is not None and iri_parts.port != 80:
4993 net_location += ':' + str(iri_parts.port)
4994
4995 return compat_urllib_parse_urlunparse(
4996 (iri_parts.scheme,
4997 net_location,
4998
4999 compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5000
5001 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5002 compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5003
5004 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5005 compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5006
5007 compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5008
5009 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
5010
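# Illustration (sketch; the host is punycoded, the path percent-encoded as UTF-8):
#   iri_to_uri('http://müller.example/séance')
#   -> 'http://xn--mller-kva.example/s%C3%A9ance'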
5011
5012 def to_high_limit_path(path):
5013 if sys.platform in ['win32', 'cygwin']:
5014 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5015 return r'\\?\ '.rstrip() + os.path.abspath(path)
5016
5017 return path
5018
5019
5020 def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
5021 val = traverse_obj(obj, *variadic(field))
5022 if val in ignore:
5023 return default
5024 return template % (func(val) if func else val)
5025
5026
5027 def clean_podcast_url(url):
5028 return re.sub(r'''(?x)
5029 (?:
5030 (?:
5031 chtbl\.com/track|
5032 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5033 play\.podtrac\.com
5034 )/[^/]+|
5035 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5036 flex\.acast\.com|
5037 pd(?:
5038 cn\.co| # https://podcorn.com/analytics-prefix/
5039 st\.fm # https://podsights.com/docs/
5040 )/e
5041 )/''', '', url)
5042
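# e.g. clean_podcast_url('https://chtbl.com/track/AB123/traffic.example.com/ep.mp3')
# -> 'https://traffic.example.com/ep.mp3'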
5043
5044 _HEX_TABLE = '0123456789abcdef'
5045
5046
5047 def random_uuidv4():
5048 return re.sub(r'[xy]', lambda m: _HEX_TABLE[random.randint(0, 15) if m.group(0) == 'x' else random.randint(8, 11)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')  # the 'y' nibble carries the RFC 4122 variant bits (8-b)
5049
5050
5051 def make_dir(path, to_screen=None):
5052 try:
5053 dn = os.path.dirname(path)
5054 if dn and not os.path.exists(dn):
5055 os.makedirs(dn)
5056 return True
5057 except (OSError, IOError) as err:
5058 if callable(to_screen):
5059 to_screen('unable to create directory ' + error_to_compat_str(err))
5060 return False
5061
5062
5063 def get_executable_path():
5064 from zipimport import zipimporter
5065 if hasattr(sys, 'frozen'): # Running from PyInstaller
5066 path = os.path.dirname(sys.executable)
5067 elif isinstance(globals().get('__loader__'), zipimporter): # Running from ZIP
5068 path = os.path.join(os.path.dirname(__file__), '../..')
5069 else:
5070 path = os.path.join(os.path.dirname(__file__), '..')
5071 return os.path.abspath(path)
5072
5073
5074 def load_plugins(name, suffix, namespace):
5075 classes = {}
5076 try:
5077 plugins_spec = importlib.util.spec_from_file_location(
5078 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5079 plugins = importlib.util.module_from_spec(plugins_spec)
5080 sys.modules[plugins_spec.name] = plugins
5081 plugins_spec.loader.exec_module(plugins)
5082 for name in dir(plugins):
5083 if name in namespace:
5084 continue
5085 if not name.endswith(suffix):
5086 continue
5087 klass = getattr(plugins, name)
5088 classes[name] = namespace[name] = klass
5089 except FileNotFoundError:
5090 pass
5091 return classes
5092
5093
5094 def traverse_obj(
5095 obj, *path_list, default=None, expected_type=None, get_all=True,
5096 casesense=True, is_user_input=False, traverse_string=False):
5097 ''' Traverse nested list/dict/tuple
5098 @param path_list A list of paths which are checked one by one.
5099 Each path is a list of keys where each key is a string,
5100 a function, a tuple of strings/None or "...".
5101 When a function is given, it takes the key as argument and
5102 returns whether the key matches or not. When a tuple is given,
5103 all the keys given in the tuple are traversed, and
5104 "..." traverses all the keys in the object
5105 "None" returns the object without traversal
5106 @param default Default value to return
5107 @param expected_type Only accept final value of this type (Can also be any callable)
5108 @param get_all Return all the values obtained from a path or only the first one
5109 @param casesense Whether to consider dictionary keys as case sensitive
5110 @param is_user_input Whether the keys are generated from user input. If True,
5111 strings are converted to int/slice if necessary
5112 @param traverse_string Whether to traverse inside strings. If True, any
5113 non-compatible object will also be converted into a string
5114 # TODO: Write tests
5115 '''
5116 if not casesense:
5117 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5118 path_list = (map(_lower, variadic(path)) for path in path_list)
5119
5120 def _traverse_obj(obj, path, _current_depth=0):
5121 nonlocal depth
5122 path = tuple(variadic(path))
5123 for i, key in enumerate(path):
5124 if None in (key, obj):
5125 return obj
5126 if isinstance(key, (list, tuple)):
5127 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5128 key = ...
5129 if key is ...:
5130 obj = (obj.values() if isinstance(obj, dict)
5131 else obj if isinstance(obj, (list, tuple, LazyList))
5132 else str(obj) if traverse_string else [])
5133 _current_depth += 1
5134 depth = max(depth, _current_depth)
5135 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5136 elif callable(key):
5137 if isinstance(obj, (list, tuple, LazyList)):
5138 obj = enumerate(obj)
5139 elif isinstance(obj, dict):
5140 obj = obj.items()
5141 else:
5142 if not traverse_string:
5143 return None
5144 obj = str(obj)
5145 _current_depth += 1
5146 depth = max(depth, _current_depth)
5147 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
5148 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5149 obj = (obj.get(key) if casesense or (key in obj)
5150 else next((v for k, v in obj.items() if _lower(k) == key), None))
5151 else:
5152 if is_user_input:
5153 key = (int_or_none(key) if ':' not in key
5154 else slice(*map(int_or_none, key.split(':'))))
5155 if key == slice(None):
5156 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5157 if not isinstance(key, (int, slice)):
5158 return None
5159 if not isinstance(obj, (list, tuple, LazyList)):
5160 if not traverse_string:
5161 return None
5162 obj = str(obj)
5163 try:
5164 obj = obj[key]
5165 except IndexError:
5166 return None
5167 return obj
5168
5169 if isinstance(expected_type, type):
5170 type_test = lambda val: val if isinstance(val, expected_type) else None
5171 elif expected_type is not None:
5172 type_test = expected_type
5173 else:
5174 type_test = lambda val: val
5175
5176 for path in path_list:
5177 depth = 0
5178 val = _traverse_obj(obj, path)
5179 if val is not None:
5180 if depth:
5181 for _ in range(depth - 1):
5182 val = itertools.chain.from_iterable(v for v in val if v is not None)
5183 val = [v for v in map(type_test, val) if v is not None]
5184 if val:
5185 return val if get_all else val[0]
5186 else:
5187 val = type_test(val)
5188 if val is not None:
5189 return val
5190 return default
5191
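# A couple of illustrative traversals:
#   traverse_obj({'a': [{'b': 1}, {'b': 2}]}, ('a', ..., 'b'))  -> [1, 2]
#   traverse_obj({'a': {'B': 3}}, ('a', 'b'), casesense=False)  -> 3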
5192
5193 def traverse_dict(dictn, keys, casesense=True):
5194 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5195 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5196 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5197
5198
5199 def variadic(x, allowed_types=(str, bytes, dict)):
5200 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
5201
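# e.g. variadic('abc') -> ('abc',); variadic([1, 2]) -> [1, 2]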
5202
5203 def decode_base(value, digits):
5204 # Convert the given base-x string to an integer
5205 table = {char: index for index, char in enumerate(digits)}
5206 result = 0
5207 base = len(digits)
5208 for char in value:  # 'char' rather than shadowing the built-in chr()
5209 result *= base
5210 result += table[char]
5211 return result
5212
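# e.g. decode_base('ff', '0123456789abcdef') -> 255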
5213
5214 def time_seconds(**kwargs):
5215 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5216 return t.timestamp()
5217
5218
5219 # create a JSON Web Signature (jws) with HS256 algorithm
5220 # the resulting format is in JWS Compact Serialization
5221 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5222 # implemented following JWS https://www.rfc-editor.org/rfc/rfc7515.html
5223 def jwt_encode_hs256(payload_data, key, headers={}):
5224 header_data = {
5225 'alg': 'HS256',
5226 'typ': 'JWT',
5227 }
5228 if headers:
5229 header_data.update(headers)
5230 header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
5231 payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
5232 h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
5233 signature_b64 = base64.b64encode(h.digest())
5234 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5235 return token
5236
5237
5238 # can be extended in the future to verify the signature, parse the header, and return the algorithm used if it's not HS256
5239 def jwt_decode_hs256(jwt):
5240 header_b64, payload_b64, signature_b64 = jwt.split('.')
5241 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5242 return payload_data
5243
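# Round trip (illustrative; jwt_decode_hs256 does not verify the signature):
#   token = jwt_encode_hs256({'uid': 123}, 'secret')
#   jwt_decode_hs256(token.decode('utf-8')) -> {'uid': 123}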
5244
5245 def supports_terminal_sequences(stream):
5246 if compat_os_name == 'nt':
5247 from .compat import WINDOWS_VT_MODE # Must be imported locally
5248 if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
5249 return False
5250 elif not os.getenv('TERM'):
5251 return False
5252 try:
5253 return stream.isatty()
5254 except BaseException:
5255 return False
5256
5257
5258 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5259
5260
5261 def remove_terminal_sequences(string):
5262 return _terminal_sequences_re.sub('', string)
5263
5264
5265 def number_of_digits(number):
5266 return len('%d' % number)
5267
5268
5269 def join_nonempty(*values, delim='-', from_dict=None):
5270 if from_dict is not None:
5271 values = map(from_dict.get, values)
5272 return delim.join(map(str, filter(None, values)))
5273
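# e.g. join_nonempty('mp4', None, 720) -> 'mp4-720'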
5274
5275 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5276 """
5277 Find the largest format dimensions in terms of video width and, for each thumbnail:
5278 * Modify the URL: Match the width with the provided regex and replace with the former width
5279 * Update dimensions
5280
5281 This function is useful with video services that scale the provided thumbnails on demand
5282 """
5283 _keys = ('width', 'height')
5284 max_dimensions = max(
5285 [tuple(format.get(k) or 0 for k in _keys) for format in formats],
5286 default=(0, 0))
5287 if not max_dimensions[0]:
5288 return thumbnails
5289 return [
5290 merge_dicts(
5291 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5292 dict(zip(_keys, max_dimensions)), thumbnail)
5293 for thumbnail in thumbnails
5294 ]
5295
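# Sketch with hypothetical data: given formats [{'width': 1280, 'height': 720}],
# a thumbnail {'url': 'https://example.com/320/thumb.jpg'} and url_width_re=r'320',
# the thumbnail becomes
# {'url': 'https://example.com/1280/thumb.jpg', 'width': 1280, 'height': 720}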
5296
5297 def parse_http_range(range):
5298 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5299 if not range:
5300 return None, None, None
5301 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5302 if not crg:
5303 return None, None, None
5304 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
5305
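# e.g. parse_http_range('bytes 0-499/1234') -> (0, 499, 1234)
#      parse_http_range('bytes=500-')       -> (500, None, None)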
5306
5307 class Config:
5308 own_args = None
5309 filename = None
5310 __initialized = False
5311
5312 def __init__(self, parser, label=None):
5313 self._parser, self.label = parser, label
5314 self._loaded_paths, self.configs = set(), []
5315
5316 def init(self, args=None, filename=None):
5317 assert not self.__initialized
5318 directory = ''
5319 if filename:
5320 location = os.path.realpath(filename)
5321 directory = os.path.dirname(location)
5322 if location in self._loaded_paths:
5323 return False
5324 self._loaded_paths.add(location)
5325
5326 self.__initialized = True
5327 self.own_args, self.filename = args, filename
5328 for location in self._parser.parse_args(args)[0].config_locations or []:
5329 location = os.path.join(directory, expand_path(location))
5330 if os.path.isdir(location):
5331 location = os.path.join(location, 'yt-dlp.conf')
5332 if not os.path.exists(location):
5333 self._parser.error(f'config location {location} does not exist')
5334 self.append_config(self.read_file(location), location)
5335 return True
5336
5337 def __str__(self):
5338 label = join_nonempty(
5339 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5340 delim=' ')
5341 return join_nonempty(
5342 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5343 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5344 delim='\n')
5345
5346 @staticmethod
5347 def read_file(filename, default=[]):
5348 try:
5349 optionf = open(filename)
5350 except IOError:
5351 return default # silently skip if file is not present
5352 try:
5353 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5354 contents = optionf.read()
5355 if sys.version_info < (3,):
5356 contents = contents.decode(preferredencoding())
5357 res = compat_shlex_split(contents, comments=True)
5358 finally:
5359 optionf.close()
5360 return res
5361
5362 @staticmethod
5363 def hide_login_info(opts):
5364 PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
5365 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5366
5367 def _scrub_eq(o):
5368 m = eqre.match(o)
5369 if m:
5370 return m.group('key') + '=PRIVATE'
5371 else:
5372 return o
5373
5374 opts = list(map(_scrub_eq, opts))
5375 for idx, opt in enumerate(opts):
5376 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5377 opts[idx + 1] = 'PRIVATE'
5378 return opts
5379
5380 def append_config(self, *args, label=None):
5381 config = type(self)(self._parser, label)
5382 config._loaded_paths = self._loaded_paths
5383 if config.init(*args):
5384 self.configs.append(config)
5385
5386 @property
5387 def all_args(self):
5388 for config in reversed(self.configs):
5389 yield from config.all_args
5390 yield from self.own_args or []
5391
5392 def parse_args(self):
5393 return self._parser.parse_args(list(self.all_args))
5394
5395
5396 class WebSocketsWrapper():
5397 """Wraps websockets module to use in non-async scopes"""
5398
5399 def __init__(self, url, headers=None):
5400 self.loop = asyncio.events.new_event_loop()
5401 self.conn = compat_websockets.connect(
5402 url, extra_headers=headers, ping_interval=None,
5403 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5404 atexit.register(self.__exit__, None, None, None)
5405
5406 def __enter__(self):
5407 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5408 return self
5409
5410 def send(self, *args):
5411 self.run_with_loop(self.pool.send(*args), self.loop)
5412
5413 def recv(self, *args):
5414 return self.run_with_loop(self.pool.recv(*args), self.loop)
5415
5416 def __exit__(self, type, value, traceback):
5417 try:
5418 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5419 finally:
5420 self._cancel_all_tasks(self.loop)  # cancel pending tasks before the loop is closed
5421 self.loop.close()
5422
5423 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5424 # for contributors: if any new library that uses asyncio needs to be run in non-async code, move these functions out of this class
5425 @staticmethod
5426 def run_with_loop(main, loop):
5427 if not asyncio.coroutines.iscoroutine(main):
5428 raise ValueError(f'a coroutine was expected, got {main!r}')
5429
5430 try:
5431 return loop.run_until_complete(main)
5432 finally:
5433 loop.run_until_complete(loop.shutdown_asyncgens())
5434 if hasattr(loop, 'shutdown_default_executor'):
5435 loop.run_until_complete(loop.shutdown_default_executor())
5436
5437 @staticmethod
5438 def _cancel_all_tasks(loop):
5439 to_cancel = asyncio.tasks.all_tasks(loop)
5440
5441 if not to_cancel:
5442 return
5443
5444 for task in to_cancel:
5445 task.cancel()
5446
5447 loop.run_until_complete(
5448 asyncio.tasks.gather(*to_cancel, loop=loop, return_exceptions=True))
5449
5450 for task in to_cancel:
5451 if task.cancelled():
5452 continue
5453 if task.exception() is not None:
5454 loop.call_exception_handler({
5455 'message': 'unhandled exception during asyncio.run() shutdown',
5456 'exception': task.exception(),
5457 'task': task,
5458 })
5459
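# Non-async usage sketch (assuming the websockets package is available):
#   with WebSocketsWrapper('wss://example.com/socket') as ws:
#       ws.send('hello')
#       reply = ws.recv()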
5460
5461 has_websockets = bool(compat_websockets)
5462
5463
5464 def merge_headers(*dicts):
5465 """Merge dicts of network headers case insensitively, prioritizing the latter ones"""
5466 return {k.capitalize(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
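
# e.g. merge_headers({'User-Agent': 'UA', 'X-Foo': '1'}, {'user-agent': 'yt-dlp'})
# -> {'User-agent': 'yt-dlp', 'X-foo': '1'} (keys are normalized via str.capitalize)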