#!/usr/bin/env python3
# coding: utf-8

from __future__ import unicode_literals

import asyncio
import atexit
import base64
import binascii
import calendar
import codecs
import collections
import contextlib
import ctypes
import datetime
import email.utils
import email.header
import errno
import functools
import gzip
import hashlib
import hmac
import importlib.util
import io
import itertools
import json
import locale
import math
import operator
import os
import platform
import random
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import time
import traceback
import xml.etree.ElementTree
import zlib
import mimetypes

from .compat import (
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_HTTPError,
    compat_basestring,
    compat_brotli,
    compat_chr,
    compat_cookiejar,
    compat_ctypes_WINFUNCTYPE,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_integer_types,
    compat_numeric_types,
    compat_kwargs,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_split,
    compat_shlex_quote,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_urlunparse,
    compat_urllib_parse_quote,
    compat_urllib_parse_quote_plus,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_websockets,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))


def random_user_agent():
    _USER_AGENT_TPL = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/%s Safari/537.36'
    _CHROME_VERSIONS = (
        '90.0.4430.212',
        '90.0.4430.24',
        '90.0.4430.70',
        '90.0.4430.72',
        '90.0.4430.85',
        '90.0.4430.93',
        '91.0.4472.101',
        '91.0.4472.106',
        '91.0.4472.114',
        '91.0.4472.124',
        '91.0.4472.164',
        '91.0.4472.19',
        '91.0.4472.77',
        '92.0.4515.107',
        '92.0.4515.115',
        '92.0.4515.131',
        '92.0.4515.159',
        '92.0.4515.43',
        '93.0.4556.0',
        '93.0.4577.15',
        '93.0.4577.63',
        '93.0.4577.82',
        '94.0.4606.41',
        '94.0.4606.54',
        '94.0.4606.61',
        '94.0.4606.71',
        '94.0.4606.81',
        '94.0.4606.85',
        '95.0.4638.17',
        '95.0.4638.50',
        '95.0.4638.54',
        '95.0.4638.69',
        '95.0.4638.74',
        '96.0.4664.18',
        '96.0.4664.45',
        '96.0.4664.55',
        '96.0.4664.93',
        '97.0.4692.20',
    )
    return _USER_AGENT_TPL % random.choice(_CHROME_VERSIONS)

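# Example (illustrative; the chosen version is random):
#   random_user_agent()
#   # => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'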

SUPPORTED_ENCODINGS = [
    'gzip', 'deflate'
]
if compat_brotli:
    SUPPORTED_ENCODINGS.append('br')

std_headers = {
    'User-Agent': random_user_agent(),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': ', '.join(SUPPORTED_ENCODINGS),
    'Accept-Language': 'en-us,en;q=0.5',
    'Sec-Fetch-Mode': 'navigate',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUY', ['TH', 'ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuy', ['th'], 'y')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %drd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %drd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %drd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y.%m.%d.',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y%m%d%H%M',
    '%Y%m%d%H%M%S',
    '%Y%m%d',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%Y-%m-%d %H:%M:%S:%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
    '%B %d %Y at %H:%M',
    '%B %d %Y at %H:%M:%S',
    '%H:%M %d-%b-%Y',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>'


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf, ensure_ascii=False)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        try:
            mask = os.umask(0)
            os.umask(mask)
            os.chmod(tf.name, 0o666 & ~mask)
        except OSError:
            pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

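# Example (illustrative, with a hypothetical namespace URI):
#   xpath_with_ns('media:song/url', {'media': 'http://example.com/ns'})
#   # => '{http://example.com/ns}song/url'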

def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text

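# Example (illustrative): a missing node falls back to `default` instead of
# raising, e.g. for doc = compat_etree_fromstring('<root><a>1</a></root>'):
#   xpath_text(doc, './a')                # => '1'
#   xpath_text(doc, './b', default=None)  # => None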

def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_html_by_id(id, html):
    """Return the html of the tag with the specified ID in the passed HTML document"""
    return get_element_html_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_html_by_class(class_name, html):
    """Return the html of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_html_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_element_html_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_html_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_html_by_class(class_name, html):
    """Return the html of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_html_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(*args, **kwargs):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    return [content for content, _ in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_html_by_attribute(*args, **kwargs):
    """Return the html of the tag with the specified attribute in the passed HTML document"""
    return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)]


def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True):
    """
    Return the text (content) and the html (whole) of the tag with the specified
    attribute in the passed HTML document
    """

    value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?'

    value = re.escape(value) if escape_value else value

    partial_element_re = r'''(?x)
        <(?P<tag>[a-zA-Z0-9:._-]+)
         (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)?
         \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q)
        ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional}

    for m in re.finditer(partial_element_re, html):
        content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():])

        yield (
            unescapeHTML(re.sub(r'^(?P<q>["\'])(?P<content>.*)(?P=q)$', r'\g<content>', content, flags=re.DOTALL)),
            whole
        )

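# Example (illustrative): the generator yields (content, whole element) pairs:
#   list(get_elements_text_and_html_by_attribute(
#       'class', 'foo', '<span class="foo">bar</span>'))
#   # => [('bar', '<span class="foo">bar</span>')]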

class HTMLBreakOnClosingTagParser(compat_HTMLParser):
    """
    HTML parser which raises HTMLBreakOnClosingTagException upon reaching the
    closing tag for the first opening tag it has encountered, and can be used
    as a context manager
    """

    class HTMLBreakOnClosingTagException(Exception):
        pass

    def __init__(self):
        self.tagstack = collections.deque()
        compat_HTMLParser.__init__(self)

    def __enter__(self):
        return self

    def __exit__(self, *_):
        self.close()

    def close(self):
        # handle_endtag does not return upon raising HTMLBreakOnClosingTagException,
        # so data remains buffered; we no longer have any interest in it, thus
        # override this method to discard it
        pass

    def handle_starttag(self, tag, _):
        self.tagstack.append(tag)

    def handle_endtag(self, tag):
        if not self.tagstack:
            raise compat_HTMLParseError('no tags in the stack')
        while self.tagstack:
            inner_tag = self.tagstack.pop()
            if inner_tag == tag:
                break
        else:
            raise compat_HTMLParseError(f'matching opening tag for closing {tag} tag not found')
        if not self.tagstack:
            raise self.HTMLBreakOnClosingTagException()


def get_element_text_and_html_by_tag(tag, html):
    """
    For the first element with the specified tag in the passed HTML document
    return its content (text) and the whole element (html)
    """
    def find_or_raise(haystack, needle, exc):
        try:
            return haystack.index(needle)
        except ValueError:
            raise exc
    closing_tag = f'</{tag}>'
    whole_start = find_or_raise(
        html, f'<{tag}', compat_HTMLParseError(f'opening {tag} tag not found'))
    content_start = find_or_raise(
        html[whole_start:], '>', compat_HTMLParseError(f'malformed opening {tag} tag'))
    content_start += whole_start + 1
    with HTMLBreakOnClosingTagParser() as parser:
        parser.feed(html[whole_start:content_start])
        if not parser.tagstack or parser.tagstack[0] != tag:
            raise compat_HTMLParseError(f'parser did not match opening {tag} tag')
        offset = content_start
        while offset < len(html):
            next_closing_tag_start = find_or_raise(
                html[offset:], closing_tag,
                compat_HTMLParseError(f'closing {tag} tag not found'))
            next_closing_tag_end = next_closing_tag_start + len(closing_tag)
            try:
                parser.feed(html[offset:offset + next_closing_tag_end])
                offset += next_closing_tag_end
            except HTMLBreakOnClosingTagParser.HTMLBreakOnClosingTagException:
                return html[content_start:offset + next_closing_tag_start], \
                    html[whole_start:offset + next_closing_tag_end]
        raise compat_HTMLParseError('unexpected end of html')

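# Example (illustrative): nested tags of the same name are balanced before the
# closing tag is accepted:
#   get_element_text_and_html_by_tag('div', '<div>a<div>b</div>c</div>')
#   # => ('a<div>b</div>c', '<div>a<div>b</div>c</div>')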

class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


class HTMLListAttrsParser(compat_HTMLParser):
    """HTML parser to gather the attributes for the elements of a list"""

    def __init__(self):
        compat_HTMLParser.__init__(self)
        self.items = []
        self._level = 0

    def handle_starttag(self, tag, attrs):
        if tag == 'li' and self._level == 0:
            self.items.append(dict(attrs))
        self._level += 1

    def handle_endtag(self, tag):
        self._level -= 1


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs


def parse_list(webpage):
    """Given a string for a series of HTML <li> elements,
    return a list of dictionaries of their attributes"""
    parser = HTMLListAttrsParser()
    parser.feed(webpage)
    parser.close()
    return parser.items


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    html = re.sub(r'\s+', ' ', html)
    html = re.sub(r'(?u)\s?<\s?br\s?/?\s?>\s?', '\n', html)
    html = re.sub(r'(?u)<\s?/\s?p\s?>\s?<\s?p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()

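# Example (illustrative):
#   clean_html('<p>foo<br/>bar</p>  <p>baz</p>')  # => 'foo\nbar\nbaz'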

def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = locked_file(filename, open_mode, block=False).open()
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = locked_file(filename, open_mode, block=False).open()
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it can be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        elif not restricted and char == '\n':
            return ' '
        elif char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    if s == '':
        return ''
    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

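# Example (illustrative):
#   sanitize_filename('foo: bar')                   # => 'foo - bar'
#   sanitize_filename('foo: bar', restricted=True)  # => 'foo_-_bar'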

def sanitize_path(s, force=False):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform == 'win32':
        force = False
        drive_or_unc, _ = os.path.splitdrive(s)
        if sys.version_info < (2, 7) and not drive_or_unc:
            drive_or_unc, _ = os.path.splitunc(s)
    elif force:
        drive_or_unc = ''
    else:
        return s

    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    elif force and s[0] == os.path.sep:
        sanitized_path.insert(0, os.path.sep)
    return os.path.join(*sanitized_path)


def sanitize_url(url):
    # Prepend protocol-less URLs with `http:` scheme in order to mitigate
    # the number of unwanted failures due to missing protocol
    if url.startswith('//'):
        return 'http:%s' % url
    # Fix some common typos seen so far
    COMMON_TYPOS = (
        # https://github.com/ytdl-org/youtube-dl/issues/15649
        (r'^httpss://', r'https://'),
        # https://bx1.be/lives/direct-tv/
        (r'^rmtp([es]?)://', r'rtmp\1://'),
    )
    for mistake, fixup in COMMON_TYPOS:
        if re.match(mistake, url):
            return re.sub(mistake, fixup, url)
    return url

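# Example (illustrative):
#   sanitize_url('//example.com/video')      # => 'http://example.com/video'
#   sanitize_url('rmtp://example.com/live')  # => 'rtmp://example.com/live'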

def extract_basic_auth(url):
    parts = compat_urlparse.urlsplit(url)
    if parts.username is None:
        return url, None
    url = compat_urlparse.urlunsplit(parts._replace(netloc=(
        parts.hostname if parts.port is None
        else '%s:%d' % (parts.hostname, parts.port))))
    auth_payload = base64.b64encode(
        ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8'))
    return url, 'Basic ' + auth_payload.decode('utf-8')

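# Example (illustrative, with hypothetical credentials):
#   extract_basic_auth('http://user:pass@example.com/x')
#   # => ('http://example.com/x', 'Basic dXNlcjpwYXNz')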

def sanitized_Request(url, *args, **kwargs):
    url, auth_header = extract_basic_auth(escape_url(sanitize_url(url)))
    if auth_header is not None:
        headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {})
        headers['Authorization'] = auth_header
    return compat_urllib_request.Request(url, *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/ytdl-org/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)

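# Example (illustrative): named, decimal and hexadecimal references all decode:
#   unescapeHTML('&amp; &#38; &#x26;')  # => '& & &'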

def escapeHTML(text):
    return (
        text
        .replace('&', '&amp;')
        .replace('<', '&lt;')
        .replace('>', '&gt;')
        .replace('"', '&quot;')
        .replace("'", '&#39;')
    )


def process_communicate_or_kill(p, *args, **kwargs):
    try:
        return p.communicate(*args, **kwargs)
    except BaseException:  # Including KeyboardInterrupt
        p.kill()
        p.wait()
        raise


class Popen(subprocess.Popen):
    if sys.platform == 'win32':
        _startupinfo = subprocess.STARTUPINFO()
        _startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
    else:
        _startupinfo = None

    def __init__(self, *args, **kwargs):
        super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo)

    def communicate_or_kill(self, *args, **kwargs):
        return process_communicate_or_kill(self, *args, **kwargs)


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


_timetuple = collections.namedtuple('Time', ('hours', 'minutes', 'seconds', 'milliseconds'))


def timetuple_from_msec(msec):
    secs, msec = divmod(msec, 1000)
    mins, secs = divmod(secs, 60)
    hrs, mins = divmod(mins, 60)
    return _timetuple(hrs, mins, secs, msec)

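# Example (illustrative):
#   timetuple_from_msec(123456789)
#   # => Time(hours=34, minutes=17, seconds=36, milliseconds=789)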

def formatSeconds(secs, delim=':', msec=False):
    time = timetuple_from_msec(secs * 1000)
    if time.hours:
        ret = '%d%s%02d%s%02d' % (time.hours, delim, time.minutes, delim, time.seconds)
    elif time.minutes:
        ret = '%d%s%02d' % (time.minutes, delim, time.seconds)
    else:
        ret = '%d' % time.seconds
    return '%s.%03d' % (ret, time.milliseconds) if msec else ret

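# Example (illustrative):
#   formatSeconds(3661)             # => '1:01:01'
#   formatSeconds(61.5, msec=True)  # => '1:01.500'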

def _ssl_load_windows_store_certs(ssl_context, storename):
    # Code adapted from _load_windows_store_certs in https://github.com/python/cpython/blob/main/Lib/ssl.py
    try:
        certs = [cert for cert, encoding, trust in ssl.enum_certificates(storename)
                 if encoding == 'x509_asn' and (
                     trust is True or ssl.Purpose.SERVER_AUTH.oid in trust)]
    except PermissionError:
        return
    for cert in certs:
        try:
            ssl_context.load_verify_locations(cadata=cert)
        except ssl.SSLError:
            pass


def make_HTTPS_handler(params, **kwargs):
    opts_check_certificate = not params.get('nocheckcertificate')
    context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    context.check_hostname = opts_check_certificate
    if params.get('legacyserverconnect'):
        context.options |= 4  # SSL_OP_LEGACY_SERVER_CONNECT
    context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE
    if opts_check_certificate:
        try:
            context.load_default_certs()
            # Work around the issue in load_default_certs when there are bad certificates. See:
            # https://github.com/yt-dlp/yt-dlp/issues/1060,
            # https://bugs.python.org/issue35665, https://bugs.python.org/issue45312
        except ssl.SSLError:
            # enum_certificates is not present in mingw python. See https://github.com/yt-dlp/yt-dlp/issues/1151
            if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'):
                # Create a new context to discard any certificates that were already loaded
                context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
                context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED
                for storename in ('CA', 'ROOT'):
                    _ssl_load_windows_store_certs(context, storename)
            context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message(before=';'):
    msg = ('please report this issue on https://github.com/yt-dlp/yt-dlp , '
           'filling out the appropriate issue template. '
           'Confirm you are on the latest version using yt-dlp -U')

    before = before.rstrip()
    if not before or before.endswith(('.', '!', '?')):
        msg = msg[0].title() + msg[1:]

    return (before + ' ' if before else '') + msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    msg = None

    def __init__(self, msg=None):
        if msg is not None:
            self.msg = msg
        elif self.msg is None:
            self.msg = type(self).__name__
        super().__init__(self.msg)


network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error]
if hasattr(ssl, 'CertificateError'):
    network_exceptions.append(ssl.CertificateError)
network_exceptions = tuple(network_exceptions)


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None, ie=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in yt-dlp.
        """
        if sys.exc_info()[0] in network_exceptions:
            expected = True

        self.orig_msg = str(msg)
        self.traceback = tb
        self.expected = expected
        self.cause = cause
        self.video_id = video_id
        self.ie = ie
        self.exc_info = sys.exc_info()  # preserve original exception

        super(ExtractorError, self).__init__(''.join((
            format_field(ie, template='[%s] '),
            format_field(video_id, template='%s: '),
            msg,
            format_field(cause, template=' (caused by %r)'),
            '' if expected else bug_reports_message())))

    def format_traceback(self):
        return join_nonempty(
            self.traceback and ''.join(traceback.format_tb(self.traceback)),
            self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]),
            delim='\n') or None


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """

    def __init__(self, msg, countries=None, **kwargs):
        kwargs['expected'] = True
        super(GeoRestrictedError, self).__init__(msg, **kwargs)
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class EntryNotInPlaylist(YoutubeDLError):
    """Entry not in playlist exception.

    This exception will be thrown by YoutubeDL when a requested entry
    is not found in the playlist info_dict
    """
    msg = 'Entry not found in info'


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    msg = 'Fixed output name but more than one file to download'

    def __init__(self, filename=None):
        if filename is not None:
            self.msg += f': {filename}'
        super().__init__(self.msg)


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """


class DownloadCancelled(YoutubeDLError):
    """ Exception raised when the download queue should be interrupted """
    msg = 'The download was cancelled'


class ExistingVideoReached(DownloadCancelled):
    """ --break-on-existing triggered """
    msg = 'Encountered a video that is already in the archive, stopping due to --break-on-existing'


class RejectedVideoReached(DownloadCancelled):
    """ --break-on-reject triggered """
    msg = 'Encountered a video that did not match filter, stopping due to --break-on-reject'


class MaxDownloadsReached(DownloadCancelled):
    """ --max-downloads limit has been reached. """
    msg = 'Maximum number of downloads reached, stopping due to --max-downloads'


class ReExtractInfo(YoutubeDLError):
    """ Video info needs to be re-extracted. """

    def __init__(self, msg, expected=False):
        super().__init__(msg)
        self.expected = expected


class ThrottledDownload(ReExtractInfo):
    """ Download speed below --throttled-rate. """
    msg = 'The download speed is below throttle limit'

    def __init__(self):
        super().__init__(self.msg, expected=False)


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    msg = 'Unable to download video'

    def __init__(self, err=None):
        if err is not None:
            self.msg += f': {err}'
        super().__init__(self.msg)


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT)
                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/ytdl-org/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs['strict'] = True
    hc = http_class(*args, **compat_kwargs(kwargs))
    source_address = ydl_handler._params.get('source_address')

    if source_address is not None:
        # This is to workaround _create_connection() from socket where it will try all
        # address data from getaddrinfo() including IPv6. This filters the result from
        # getaddrinfo() based on the source_address value.
        # This is based on the cpython socket.create_connection() function.
        # https://github.com/python/cpython/blob/master/Lib/socket.py#L691
        def _create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, source_address=None):
            host, port = address
            err = None
            addrs = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
            af = socket.AF_INET if '.' in source_address[0] else socket.AF_INET6
            ip_addrs = [addr for addr in addrs if addr[0] == af]
            if addrs and not ip_addrs:
                ip_version = 'v4' if af == socket.AF_INET else 'v6'
                raise socket.error(
                    "No remote IP%s addresses available for connect, can't use '%s' as source address"
                    % (ip_version, source_address[0]))
            for res in ip_addrs:
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    sock.bind(source_address)
                    sock.connect(sa)
                    err = None  # Explicitly break reference cycle
                    return sock
                except socket.error as _:
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise socket.error('getaddrinfo returns an empty list')
        if hasattr(hc, '_create_connection'):
            hc._create_connection = _create_connection
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = _create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers

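# Example (illustrative): the internal marker header suppresses compression
# and is itself dropped before the request is sent:
#   handle_youtubedl_headers({'Youtubedl-no-compression': '1', 'Accept-Encoding': 'gzip'})
#   # => {}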

class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        if not data:
            return data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def brotli(data):
        if not data:
            return data
        return compat_brotli.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in self._params.get('http_headers', std_headers).items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # (urllib capitalizes the dict keys because of this bug)
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # brotli
        if resp.headers.get('Content-encoding', '') == 'br':
            resp = compat_urllib_request.addinfourl(
                io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/ytdl-org/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # Per RFC 2616 the default charset is iso-8859-1, which Python 3 respects
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar):
    """
    See [1] for cookie file format.

    1. https://curl.haxx.se/docs/http-cookies.html
    """
    _HTTPONLY_PREFIX = '#HttpOnly_'
    _ENTRY_LEN = 7
    _HEADER = '''# Netscape HTTP Cookie File
# This file is generated by yt-dlp. Do not edit.

'''
    _CookieFileEntry = collections.namedtuple(
        'CookieFileEntry',
        ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value'))

    def save(self, filename=None, ignore_discard=False, ignore_expires=False):
        """
        Save cookies to a file.

        Most of the code is taken from CPython 3.8 and slightly adapted
        to support cookie files with UTF-8 in both python 2 and 3.
        """
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        # Store session cookies with `expires` set to 0 instead of an empty
        # string
        for cookie in self:
            if cookie.expires is None:
                cookie.expires = 0

        with io.open(filename, 'w', encoding='utf-8') as f:
            f.write(self._HEADER)
            now = time.time()
            for cookie in self:
                if not ignore_discard and cookie.discard:
                    continue
                if not ignore_expires and cookie.is_expired(now):
                    continue
                if cookie.secure:
                    secure = 'TRUE'
                else:
                    secure = 'FALSE'
                if cookie.domain.startswith('.'):
                    initial_dot = 'TRUE'
                else:
                    initial_dot = 'FALSE'
                if cookie.expires is not None:
                    expires = compat_str(cookie.expires)
                else:
                    expires = ''
                if cookie.value is None:
                    # cookies.txt regards 'Set-Cookie: foo' as a cookie
                    # with no name, whereas http.cookiejar regards it as a
                    # cookie with no value.
                    name = ''
                    value = cookie.name
                else:
                    name = cookie.name
                    value = cookie.value
                f.write(
                    '\t'.join([cookie.domain, initial_dot, cookie.path,
                               secure, expires, name, value]) + '\n')

    def load(self, filename=None, ignore_discard=False, ignore_expires=False):
        """Load cookies from a file."""
        if filename is None:
            if self.filename is not None:
                filename = self.filename
            else:
                raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT)

        def prepare_line(line):
            if line.startswith(self._HTTPONLY_PREFIX):
                line = line[len(self._HTTPONLY_PREFIX):]
            # comments and empty lines are fine
            if line.startswith('#') or not line.strip():
                return line
            cookie_list = line.split('\t')
            if len(cookie_list) != self._ENTRY_LEN:
                raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list))
            cookie = self._CookieFileEntry(*cookie_list)
            if cookie.expires_at and not cookie.expires_at.isdigit():
                raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at)
            return line

        cf = io.StringIO()
        with io.open(filename, encoding='utf-8') as f:
            for line in f:
                try:
                    cf.write(prepare_line(line))
                except compat_cookiejar.LoadError as e:
                    write_string(
                        'WARNING: skipping cookie file entry due to %s: %r\n'
                        % (e, line), sys.stderr)
                    continue
        cf.seek(0)
        self._really_load(cf, filename, ignore_discard, ignore_expires)
        # Session cookies are denoted by either `expires` field set to
        # an empty string or 0. MozillaCookieJar only recognizes the former
        # (see [1]). So we need to force the latter to be recognized as session
        # cookies on our own.
        # Session cookies may be important for cookies-based authentication,
        # e.g. usually, when user does not check 'Remember me' check box while
        # logging in on a site, some important cookies are stored as session
        # cookies so that not recognizing them will result in failed login.
        # 1. https://bugs.python.org/issue17164
        for cookie in self:
            # Treat `expires=0` cookies as session cookies
            if cookie.expires == 0:
                cookie.expires = None
                cookie.discard = True


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/ytdl-org/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
    """YoutubeDL redirect handler

    The code is based on HTTPRedirectHandler implementation from CPython [1].

    This redirect handler solves two issues:
     - ensures redirect URL is always unicode under python 2
     - introduces support for experimental HTTP response status code
       308 Permanent Redirect [2] used by some sites [3]

    1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
    2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
    3. https://github.com/ytdl-org/youtube-dl/issues/28768
    """

    http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        """Return a Request or None in response to a redirect.

        This is called by the http_error_30x methods when a
        redirection response is received. If a redirection should
        take place, return a new Request to allow http_error_30x to
        perform the redirect. Otherwise, raise HTTPError if no-one
        else should try to handle this url. Return None if you can't
        but another Handler might.
        """
        m = req.get_method()
        if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
                 or code in (301, 302, 303) and m == "POST")):
            raise compat_HTTPError(req.full_url, code, msg, headers, fp)
        # Strictly (according to RFC 2616), 301 or 302 in response to
        # a POST MUST NOT cause a redirection without confirmation
        # from the user (of urllib.request, in this case). In practice,
        # essentially all clients do redirect in this case, so we do
        # the same.

        # On python 2 urlh.geturl() may sometimes return redirect URL
        # as byte string instead of unicode. This workaround allows
        # to force it always return unicode.
        if sys.version_info[0] < 3:
            newurl = compat_str(newurl)

        # Be conciliant with URIs containing a space. This is mainly
        # redundant with the more complete encoding done in http_error_302(),
        # but it is kept for compatibility with other callers.
        newurl = newurl.replace(' ', '%20')

        CONTENT_HEADERS = ("content-length", "content-type")
        # NB: don't use dict comprehension for python 2.6 compatibility
        newheaders = dict((k, v) for k, v in req.headers.items()
                          if k.lower() not in CONTENT_HEADERS)
        return compat_urllib_request.Request(
            newurl, headers=newheaders, origin_req_host=req.origin_req_host,
            unverifiable=True)


def extract_timezone(date_str):
    m = re.search(
        r'''(?x)
            ^.{8,}?                                          # >=8 char non-TZ prefix, if present
            (?P<tz>Z|                                        # just the UTC Z, or
                (?:(?<=.\b\d{4}|\b\d{2}:\d\d)|               # preceded by 4 digits or hh:mm or
                   (?<!.\b[a-zA-Z]{3}|[a-zA-Z]{4}|..\b\d\d)) # not preceded by 3 alpha word or >= 4 alpha or 2 digits
                [ ]?                                         # optional space
                (?P<sign>\+|-)                               # +/-
                (?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})   # hh[:]mm
            $)
        ''', date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str

1753
1754 def parse_iso8601(date_str, delimiter='T', timezone=None):
1755 """ Return a UNIX timestamp from the given date """
1756
1757 if date_str is None:
1758 return None
1759
1760 date_str = re.sub(r'\.[0-9]+', '', date_str)
1761
1762 if timezone is None:
1763 timezone, date_str = extract_timezone(date_str)
1764
1765 try:
1766 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1767 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1768 return calendar.timegm(dt.timetuple())
1769 except ValueError:
1770 pass
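# Illustrative examples (not part of the original module; values follow from the code above):
#   parse_iso8601('2014-03-23T23:04:26+0100')  == 1395612266
#   parse_iso8601('2014-03-23T22:04:26', timezone=datetime.timedelta(0))  == 1395612266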
1771
1772
1773 def date_formats(day_first=True):
1774 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1775
1776
1777 def unified_strdate(date_str, day_first=True):
1778 """Return a string with the date in the format YYYYMMDD"""
1779
1780 if date_str is None:
1781 return None
1782 upload_date = None
1783 # Replace commas
1784 date_str = date_str.replace(',', ' ')
1785 # Remove AM/PM + timezone
1786 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1787 _, date_str = extract_timezone(date_str)
1788
1789 for expression in date_formats(day_first):
1790 try:
1791 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
1792 except ValueError:
1793 pass
1794 if upload_date is None:
1795 timetuple = email.utils.parsedate_tz(date_str)
1796 if timetuple:
1797 try:
1798 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1799 except ValueError:
1800 pass
1801 if upload_date is not None:
1802 return compat_str(upload_date)
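# Illustrative examples (not part of the original module):
#   unified_strdate('December 21, 2010')  == '20101221'
#   unified_strdate('1968/12/10')         == '19681210'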
1803
1804
1805 def unified_timestamp(date_str, day_first=True):
1806 if date_str is None:
1807 return None
1808
1809 date_str = re.sub(r'[,|]', '', date_str)
1810
1811 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
1812 timezone, date_str = extract_timezone(date_str)
1813
1814 # Remove AM/PM + timezone
1815 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1816
1817 # Remove unrecognized timezones from ISO 8601 alike timestamps
1818 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1819 if m:
1820 date_str = date_str[:-len(m.group('tz'))]
1821
1822 # Python only supports microseconds, so remove nanoseconds
1823 m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str)
1824 if m:
1825 date_str = m.group(1)
1826
1827 for expression in date_formats(day_first):
1828 try:
1829 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
1830 return calendar.timegm(dt.timetuple())
1831 except ValueError:
1832 pass
1833 timetuple = email.utils.parsedate_tz(date_str)
1834 if timetuple:
1835 return calendar.timegm(timetuple) + pm_delta * 3600
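# Illustrative example (not part of the original module):
#   unified_timestamp('December 15, 2017 at 7:49 am')  == 1513324140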
1836
1837
1838 def determine_ext(url, default_ext='unknown_video'):
1839 if url is None or '.' not in url:
1840 return default_ext
1841 guess = url.partition('?')[0].rpartition('.')[2]
1842 if re.match(r'^[A-Za-z0-9]+$', guess):
1843 return guess
1844 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1845 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
1846 return guess.rstrip('/')
1847 else:
1848 return default_ext
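# Illustrative examples (not part of the original module; example.com is a placeholder):
#   determine_ext('http://example.com/video.mp4?dl=1')       == 'mp4'
#   determine_ext('http://example.com/video.mp4/?download')  == 'mp4'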
1849
1850
1851 def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None):
1852 return replace_extension(filename, sub_lang + '.' + sub_format, expected_real_ext)
1853
1854
1855 def datetime_from_str(date_str, precision='auto', format='%Y%m%d'):
1856 """
1857 Return a datetime object from a string in the format YYYYMMDD or
1858 (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1859
1860 format: string date format used to parse date_str
1861 precision: round the time portion of the datetime object to this unit;
1862 one of auto|microsecond|second|minute|hour|day.
1863 auto: round to the unit provided in date_str (if applicable).
1864 """
1865 auto_precision = False
1866 if precision == 'auto':
1867 auto_precision = True
1868 precision = 'microsecond'
1869 today = datetime_round(datetime.datetime.utcnow(), precision)
1870 if date_str in ('now', 'today'):
1871 return today
1872 if date_str == 'yesterday':
1873 return today - datetime.timedelta(days=1)
1874 match = re.match(
1875 r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?',
1876 date_str)
1877 if match is not None:
1878 start_time = datetime_from_str(match.group('start'), precision, format)
1879 time = int(match.group('time')) * (-1 if match.group('sign') == '-' else 1)
1880 unit = match.group('unit')
1881 if unit == 'month' or unit == 'year':
1882 new_date = datetime_add_months(start_time, time * 12 if unit == 'year' else time)
1883 unit = 'day'
1884 else:
1885 if unit == 'week':
1886 unit = 'day'
1887 time *= 7
1888 delta = datetime.timedelta(**{unit + 's': time})
1889 new_date = start_time + delta
1890 if auto_precision:
1891 return datetime_round(new_date, unit)
1892 return new_date
1893
1894 return datetime_round(datetime.datetime.strptime(date_str, format), precision)
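# Illustrative examples (not part of the original module; 'now'/'today' refer to UTC):
#   datetime_from_str('20220315+2months')            == datetime.datetime(2022, 5, 15, 0, 0)
#   datetime_from_str('now-1week', precision='day')  -> midnight (UTC) one week ago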
1895
1896
1897 def date_from_str(date_str, format='%Y%m%d', strict=False):
1898 """
1899 Return a datetime object from a string in the format YYYYMMDD or
1900 (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)?
1901
1902 If "strict", only (now|today)[+-][0-9](day|week|month|year)(s)? is allowed
1903
1904 format: string date format used to parse date_str
1905 """
1906 if strict and not re.fullmatch(r'\d{8}|(now|today)[+-]\d+(day|week|month|year)(s)?', date_str):
1907 raise ValueError(f'Invalid date format {date_str}')
1908 return datetime_from_str(date_str, precision='microsecond', format=format).date()
1909
1910
1911 def datetime_add_months(dt, months):
1912 """Increment/Decrement a datetime object by months."""
1913 month = dt.month + months - 1
1914 year = dt.year + month // 12
1915 month = month % 12 + 1
1916 day = min(dt.day, calendar.monthrange(year, month)[1])
1917 return dt.replace(year, month, day)
1918
1919
1920 def datetime_round(dt, precision='day'):
1921 """
1922 Round a datetime object's time to a specific precision
1923 """
1924 if precision == 'microsecond':
1925 return dt
1926
1927 unit_seconds = {
1928 'day': 86400,
1929 'hour': 3600,
1930 'minute': 60,
1931 'second': 1,
1932 }
1933 roundto = lambda x, n: ((x + n / 2) // n) * n
1934 timestamp = calendar.timegm(dt.timetuple())
1935 return datetime.datetime.utcfromtimestamp(roundto(timestamp, unit_seconds[precision]))
1936
1937
1938 def hyphenate_date(date_str):
1939 """
1940 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1941 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1942 if match is not None:
1943 return '-'.join(match.groups())
1944 else:
1945 return date_str
1946
1947
1948 class DateRange(object):
1949 """Represents a time interval between two dates"""
1950
1951 def __init__(self, start=None, end=None):
1952 """start and end must be strings in the format accepted by date"""
1953 if start is not None:
1954 self.start = date_from_str(start, strict=True)
1955 else:
1956 self.start = datetime.datetime.min.date()
1957 if end is not None:
1958 self.end = date_from_str(end, strict=True)
1959 else:
1960 self.end = datetime.datetime.max.date()
1961 if self.start > self.end:
1962 raise ValueError('Date range "%s": the start date must be before the end date' % self)
1963
1964 @classmethod
1965 def day(cls, day):
1966 """Returns a range that only contains the given day"""
1967 return cls(day, day)
1968
1969 def __contains__(self, date):
1970 """Check if the date is in the range"""
1971 if not isinstance(date, datetime.date):
1972 date = date_from_str(date)
1973 return self.start <= date <= self.end
1974
1975 def __str__(self):
1976 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
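# Illustrative example (not part of the original module):
#   '20220115' in DateRange('20220101', '20220131')  == True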
1977
1978
1979 def platform_name():
1980 """ Returns the platform name as a compat_str """
1981 res = platform.platform()
1982 if isinstance(res, bytes):
1983 res = res.decode(preferredencoding())
1984
1985 assert isinstance(res, compat_str)
1986 return res
1987
1988
1989 def get_windows_version():
1990 ''' Get Windows version. None if it's not running on Windows '''
1991 if compat_os_name == 'nt':
1992 return version_tuple(platform.win32_ver()[1])
1993 else:
1994 return None
1995
1996
1997 def _windows_write_string(s, out):
1998 """ Returns True if the string was written using special methods,
1999 False if it has yet to be written out."""
2000 # Adapted from http://stackoverflow.com/a/3259271/35070
2001
2002 import ctypes.wintypes
2003
2004 WIN_OUTPUT_IDS = {
2005 1: -11,
2006 2: -12,
2007 }
2008
2009 try:
2010 fileno = out.fileno()
2011 except AttributeError:
2012 # If the output stream doesn't have a fileno, it's virtual
2013 return False
2014 except io.UnsupportedOperation:
2015 # Some strange Windows pseudo files?
2016 return False
2017 if fileno not in WIN_OUTPUT_IDS:
2018 return False
2019
2020 GetStdHandle = compat_ctypes_WINFUNCTYPE(
2021 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
2022 ('GetStdHandle', ctypes.windll.kernel32))
2023 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
2024
2025 WriteConsoleW = compat_ctypes_WINFUNCTYPE(
2026 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
2027 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
2028 ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32))
2029 written = ctypes.wintypes.DWORD(0)
2030
2031 GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32))
2032 FILE_TYPE_CHAR = 0x0002
2033 FILE_TYPE_REMOTE = 0x8000
2034 GetConsoleMode = compat_ctypes_WINFUNCTYPE(
2035 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
2036 ctypes.POINTER(ctypes.wintypes.DWORD))(
2037 ('GetConsoleMode', ctypes.windll.kernel32))
2038 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
2039
2040 def not_a_console(handle):
2041 if handle == INVALID_HANDLE_VALUE or handle is None:
2042 return True
2043 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
2044 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
2045
2046 if not_a_console(h):
2047 return False
2048
2049 def next_nonbmp_pos(s):
2050 try:
2051 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
2052 except StopIteration:
2053 return len(s)
2054
2055 while s:
2056 count = min(next_nonbmp_pos(s), 1024)
2057
2058 ret = WriteConsoleW(
2059 h, s, count if count else 2, ctypes.byref(written), None)
2060 if ret == 0:
2061 raise OSError('Failed to write string')
2062 if not count: # We just wrote a non-BMP character
2063 assert written.value == 2
2064 s = s[1:]
2065 else:
2066 assert written.value > 0
2067 s = s[written.value:]
2068 return True
2069
2070
2071 def write_string(s, out=None, encoding=None):
2072 if out is None:
2073 out = sys.stderr
2074 assert type(s) == compat_str
2075
2076 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
2077 if _windows_write_string(s, out):
2078 return
2079
2080 if ('b' in getattr(out, 'mode', '')
2081 or sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
2082 byt = s.encode(encoding or preferredencoding(), 'ignore')
2083 out.write(byt)
2084 elif hasattr(out, 'buffer'):
2085 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
2086 byt = s.encode(enc, 'ignore')
2087 out.buffer.write(byt)
2088 else:
2089 out.write(s)
2090 out.flush()
2091
2092
2093 def bytes_to_intlist(bs):
2094 if not bs:
2095 return []
2096 if isinstance(bs[0], int): # Python 3
2097 return list(bs)
2098 else:
2099 return [ord(c) for c in bs]
2100
2101
2102 def intlist_to_bytes(xs):
2103 if not xs:
2104 return b''
2105 return compat_struct_pack('%dB' % len(xs), *xs)
2106
2107
2108 # Cross-platform file locking
2109 if sys.platform == 'win32':
2110 import ctypes.wintypes
2111 import msvcrt
2112
2113 class OVERLAPPED(ctypes.Structure):
2114 _fields_ = [
2115 ('Internal', ctypes.wintypes.LPVOID),
2116 ('InternalHigh', ctypes.wintypes.LPVOID),
2117 ('Offset', ctypes.wintypes.DWORD),
2118 ('OffsetHigh', ctypes.wintypes.DWORD),
2119 ('hEvent', ctypes.wintypes.HANDLE),
2120 ]
2121
2122 kernel32 = ctypes.windll.kernel32
2123 LockFileEx = kernel32.LockFileEx
2124 LockFileEx.argtypes = [
2125 ctypes.wintypes.HANDLE, # hFile
2126 ctypes.wintypes.DWORD, # dwFlags
2127 ctypes.wintypes.DWORD, # dwReserved
2128 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2129 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2130 ctypes.POINTER(OVERLAPPED) # Overlapped
2131 ]
2132 LockFileEx.restype = ctypes.wintypes.BOOL
2133 UnlockFileEx = kernel32.UnlockFileEx
2134 UnlockFileEx.argtypes = [
2135 ctypes.wintypes.HANDLE, # hFile
2136 ctypes.wintypes.DWORD, # dwReserved
2137 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
2138 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
2139 ctypes.POINTER(OVERLAPPED) # Overlapped
2140 ]
2141 UnlockFileEx.restype = ctypes.wintypes.BOOL
2142 whole_low = 0xffffffff
2143 whole_high = 0x7fffffff
2144
2145 def _lock_file(f, exclusive, block):
2146 overlapped = OVERLAPPED()
2147 overlapped.Offset = 0
2148 overlapped.OffsetHigh = 0
2149 overlapped.hEvent = 0
2150 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
2151
2152 if not LockFileEx(msvcrt.get_osfhandle(f.fileno()),
2153 (0x2 if exclusive else 0x0) | (0x0 if block else 0x1),
2154 0, whole_low, whole_high, f._lock_file_overlapped_p):
2155 raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError())
2156
2157 def _unlock_file(f):
2158 assert f._lock_file_overlapped_p
2159 handle = msvcrt.get_osfhandle(f.fileno())
2160 if not UnlockFileEx(handle, 0, whole_low, whole_high, f._lock_file_overlapped_p):
2161 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
2162
2163 else:
2164 try:
2165 import fcntl
2166
2167 def _lock_file(f, exclusive, block):
2168 try:
2169 fcntl.flock(f,
2170 fcntl.LOCK_SH if not exclusive
2171 else fcntl.LOCK_EX if block
2172 else fcntl.LOCK_EX | fcntl.LOCK_NB)
2173 except BlockingIOError:
2174 raise
2175 except OSError: # AOSP does not have flock()
2176 fcntl.lockf(f,
2177 fcntl.LOCK_SH if not exclusive
2178 else fcntl.LOCK_EX if block
2179 else fcntl.LOCK_EX | fcntl.LOCK_NB)
2180
2181 def _unlock_file(f):
2182 try:
2183 fcntl.flock(f, fcntl.LOCK_UN)
2184 except OSError:
2185 fcntl.lockf(f, fcntl.LOCK_UN)
2186
2187 except ImportError:
2188 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
2189
2190 def _lock_file(f, exclusive, block):
2191 raise IOError(UNSUPPORTED_MSG)
2192
2193 def _unlock_file(f):
2194 raise IOError(UNSUPPORTED_MSG)
2195
2196
2197 class locked_file(object):
2198 _closed = False
2199
2200 def __init__(self, filename, mode, block=True, encoding=None):
2201 assert mode in ['r', 'rb', 'a', 'ab', 'w', 'wb']
2202 self.f = io.open(filename, mode, encoding=encoding)
2203 self.mode = mode
2204 self.block = block
2205
2206 def __enter__(self):
2207 exclusive = 'r' not in self.mode
2208 try:
2209 _lock_file(self.f, exclusive, self.block)
2210 except IOError:
2211 self.f.close()
2212 raise
2213 return self
2214
2215 def __exit__(self, etype, value, traceback):
2216 try:
2217 if not self._closed:
2218 _unlock_file(self.f)
2219 finally:
2220 self.f.close()
2221 self._closed = True
2222
2223 def __iter__(self):
2224 return iter(self.f)
2225
2226 def write(self, *args):
2227 return self.f.write(*args)
2228
2229 def read(self, *args):
2230 return self.f.read(*args)
2231
2232 def flush(self):
2233 self.f.flush()
2234
2235 def open(self):
2236 return self.__enter__()
2237
2238 def close(self, *args):
2239 self.__exit__(None, None, None)  # no exception info is available here
2240
2241
2242 def get_filesystem_encoding():
2243 encoding = sys.getfilesystemencoding()
2244 return encoding if encoding is not None else 'utf-8'
2245
2246
2247 def shell_quote(args):
2248 quoted_args = []
2249 encoding = get_filesystem_encoding()
2250 for a in args:
2251 if isinstance(a, bytes):
2252 # We may get a filename encoded with 'encodeFilename'
2253 a = a.decode(encoding)
2254 quoted_args.append(compat_shlex_quote(a))
2255 return ' '.join(quoted_args)
2256
2257
2258 def smuggle_url(url, data):
2259 """ Pass additional data in a URL for internal use. """
2260
2261 url, idata = unsmuggle_url(url, {})
2262 data.update(idata)
2263 sdata = compat_urllib_parse_urlencode(
2264 {'__youtubedl_smuggle': json.dumps(data)})
2265 return url + '#' + sdata
2266
2267
2268 def unsmuggle_url(smug_url, default=None):
2269 if '#__youtubedl_smuggle' not in smug_url:
2270 return smug_url, default
2271 url, _, sdata = smug_url.rpartition('#')
2272 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
2273 data = json.loads(jsond)
2274 return url, data
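# Illustrative round trip (not part of the original module; the URL is a placeholder):
#   url = smuggle_url('https://example.com/video', {'referer': 'https://example.com/'})
#   unsmuggle_url(url)  == ('https://example.com/video', {'referer': 'https://example.com/'})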
2275
2276
2277 def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
2278 """ Formats numbers with decimal sufixes like K, M, etc """
2279 num, factor = float_or_none(num), float(factor)
2280 if num is None or num < 0:
2281 return None
2282 POSSIBLE_SUFFIXES = 'kMGTPEZY'
2283 exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
2284 suffix = ['', *POSSIBLE_SUFFIXES][exponent]
2285 if factor == 1024:
2286 suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
2287 converted = num / (factor ** exponent)
2288 return fmt % (converted, suffix)
2289
2290
2291 def format_bytes(bytes):
2292 return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A'
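# Illustrative examples (not part of the original module):
#   format_decimal_suffix(123456)  == '123k'
#   format_bytes(1024)             == '1.00KiB'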
2293
2294
2295 def lookup_unit_table(unit_table, s):
2296 units_re = '|'.join(re.escape(u) for u in unit_table)
2297 m = re.match(
2298 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
2299 if not m:
2300 return None
2301 num_str = m.group('num').replace(',', '.')
2302 mult = unit_table[m.group('unit')]
2303 return int(float(num_str) * mult)
2304
2305
2306 def parse_filesize(s):
2307 if s is None:
2308 return None
2309
2310 # The lower-case forms are of course incorrect and unofficial,
2311 # but we support those too
2312 _UNIT_TABLE = {
2313 'B': 1,
2314 'b': 1,
2315 'bytes': 1,
2316 'KiB': 1024,
2317 'KB': 1000,
2318 'kB': 1024,
2319 'Kb': 1000,
2320 'kb': 1000,
2321 'kilobytes': 1000,
2322 'kibibytes': 1024,
2323 'MiB': 1024 ** 2,
2324 'MB': 1000 ** 2,
2325 'mB': 1024 ** 2,
2326 'Mb': 1000 ** 2,
2327 'mb': 1000 ** 2,
2328 'megabytes': 1000 ** 2,
2329 'mebibytes': 1024 ** 2,
2330 'GiB': 1024 ** 3,
2331 'GB': 1000 ** 3,
2332 'gB': 1024 ** 3,
2333 'Gb': 1000 ** 3,
2334 'gb': 1000 ** 3,
2335 'gigabytes': 1000 ** 3,
2336 'gibibytes': 1024 ** 3,
2337 'TiB': 1024 ** 4,
2338 'TB': 1000 ** 4,
2339 'tB': 1024 ** 4,
2340 'Tb': 1000 ** 4,
2341 'tb': 1000 ** 4,
2342 'terabytes': 1000 ** 4,
2343 'tebibytes': 1024 ** 4,
2344 'PiB': 1024 ** 5,
2345 'PB': 1000 ** 5,
2346 'pB': 1024 ** 5,
2347 'Pb': 1000 ** 5,
2348 'pb': 1000 ** 5,
2349 'petabytes': 1000 ** 5,
2350 'pebibytes': 1024 ** 5,
2351 'EiB': 1024 ** 6,
2352 'EB': 1000 ** 6,
2353 'eB': 1024 ** 6,
2354 'Eb': 1000 ** 6,
2355 'eb': 1000 ** 6,
2356 'exabytes': 1000 ** 6,
2357 'exbibytes': 1024 ** 6,
2358 'ZiB': 1024 ** 7,
2359 'ZB': 1000 ** 7,
2360 'zB': 1024 ** 7,
2361 'Zb': 1000 ** 7,
2362 'zb': 1000 ** 7,
2363 'zettabytes': 1000 ** 7,
2364 'zebibytes': 1024 ** 7,
2365 'YiB': 1024 ** 8,
2366 'YB': 1000 ** 8,
2367 'yB': 1024 ** 8,
2368 'Yb': 1000 ** 8,
2369 'yb': 1000 ** 8,
2370 'yottabytes': 1000 ** 8,
2371 'yobibytes': 1024 ** 8,
2372 }
2373
2374 return lookup_unit_table(_UNIT_TABLE, s)
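# Illustrative examples (not part of the original module; note the decimal/binary distinction):
#   parse_filesize('10 MB')   == 10000000
#   parse_filesize('10 MiB')  == 10485760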
2375
2376
2377 def parse_count(s):
2378 if s is None:
2379 return None
2380
2381 s = re.sub(r'^[^\d]+\s', '', s).strip()
2382
2383 if re.match(r'^[\d,.]+$', s):
2384 return str_to_int(s)
2385
2386 _UNIT_TABLE = {
2387 'k': 1000,
2388 'K': 1000,
2389 'm': 1000 ** 2,
2390 'M': 1000 ** 2,
2391 'kk': 1000 ** 2,
2392 'KK': 1000 ** 2,
2393 'b': 1000 ** 3,
2394 'B': 1000 ** 3,
2395 }
2396
2397 ret = lookup_unit_table(_UNIT_TABLE, s)
2398 if ret is not None:
2399 return ret
2400
2401 mobj = re.match(r'([\d,.]+)(?:$|\s)', s)
2402 if mobj:
2403 return str_to_int(mobj.group(1))
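# Illustrative examples (not part of the original module):
#   parse_count('1,234')  == 1234
#   parse_count('1.5M')   == 1500000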
2404
2405
2406 def parse_resolution(s):
2407 if s is None:
2408 return {}
2409
2410 mobj = re.search(r'(?<![a-zA-Z0-9])(?P<w>\d+)\s*[xX×,]\s*(?P<h>\d+)(?![a-zA-Z0-9])', s)
2411 if mobj:
2412 return {
2413 'width': int(mobj.group('w')),
2414 'height': int(mobj.group('h')),
2415 }
2416
2417 mobj = re.search(r'(?<![a-zA-Z0-9])(\d+)[pPiI](?![a-zA-Z0-9])', s)
2418 if mobj:
2419 return {'height': int(mobj.group(1))}
2420
2421 mobj = re.search(r'\b([48])[kK]\b', s)
2422 if mobj:
2423 return {'height': int(mobj.group(1)) * 540}
2424
2425 return {}
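# Illustrative examples (not part of the original module):
#   parse_resolution('1920x1080')  == {'width': 1920, 'height': 1080}
#   parse_resolution('720p')       == {'height': 720}
#   parse_resolution('4k')         == {'height': 2160}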
2426
2427
2428 def parse_bitrate(s):
2429 if not isinstance(s, compat_str):
2430 return
2431 mobj = re.search(r'\b(\d+)\s*kbps', s)
2432 if mobj:
2433 return int(mobj.group(1))
2434
2435
2436 def month_by_name(name, lang='en'):
2437 """ Return the number of a month by (locale-independently) English name """
2438
2439 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
2440
2441 try:
2442 return month_names.index(name) + 1
2443 except ValueError:
2444 return None
2445
2446
2447 def month_by_abbreviation(abbrev):
2448 """ Return the number of a month by (locale-independently) English
2449 abbreviations """
2450
2451 try:
2452 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
2453 except ValueError:
2454 return None
2455
2456
2457 def fix_xml_ampersands(xml_str):
2458 """Replace all the '&' by '&amp;' in XML"""
2459 return re.sub(
2460 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
2461 '&amp;',
2462 xml_str)
2463
2464
2465 def setproctitle(title):
2466 assert isinstance(title, compat_str)
2467
2468 # ctypes in Jython is not complete
2469 # http://bugs.jython.org/issue2148
2470 if sys.platform.startswith('java'):
2471 return
2472
2473 try:
2474 libc = ctypes.cdll.LoadLibrary('libc.so.6')
2475 except OSError:
2476 return
2477 except TypeError:
2478 # LoadLibrary in Windows Python 2.7.13 only expects
2479 # a bytestring, but since unicode_literals turns
2480 # every string into a unicode string, it fails.
2481 return
2482 title_bytes = title.encode('utf-8')
2483 buf = ctypes.create_string_buffer(len(title_bytes))
2484 buf.value = title_bytes
2485 try:
2486 libc.prctl(15, buf, 0, 0, 0)
2487 except AttributeError:
2488 return # Strange libc, just skip this
2489
2490
2491 def remove_start(s, start):
2492 return s[len(start):] if s is not None and s.startswith(start) else s
2493
2494
2495 def remove_end(s, end):
2496 return s[:-len(end)] if s is not None and s.endswith(end) else s
2497
2498
2499 def remove_quotes(s):
2500 if s is None or len(s) < 2:
2501 return s
2502 for quote in ('"', "'", ):
2503 if s[0] == quote and s[-1] == quote:
2504 return s[1:-1]
2505 return s
2506
2507
2508 def get_domain(url):
2509 domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url)
2510 return domain.group('domain') if domain else None
2511
2512
2513 def url_basename(url):
2514 path = compat_urlparse.urlparse(url).path
2515 return path.strip('/').split('/')[-1]
2516
2517
2518 def base_url(url):
2519 return re.match(r'https?://[^?#&]+/', url).group()
2520
2521
2522 def urljoin(base, path):
2523 if isinstance(path, bytes):
2524 path = path.decode('utf-8')
2525 if not isinstance(path, compat_str) or not path:
2526 return None
2527 if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path):
2528 return path
2529 if isinstance(base, bytes):
2530 base = base.decode('utf-8')
2531 if not isinstance(base, compat_str) or not re.match(
2532 r'^(?:https?:)?//', base):
2533 return None
2534 return compat_urlparse.urljoin(base, path)
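# Illustrative examples (not part of the original module; example.com is a placeholder):
#   urljoin('https://example.com/a/', 'b.mp4')                  == 'https://example.com/a/b.mp4'
#   urljoin('https://example.com/', '//cdn.example.com/x.mp4')  == '//cdn.example.com/x.mp4'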
2535
2536
2537 class HEADRequest(compat_urllib_request.Request):
2538 def get_method(self):
2539 return 'HEAD'
2540
2541
2542 class PUTRequest(compat_urllib_request.Request):
2543 def get_method(self):
2544 return 'PUT'
2545
2546
2547 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
2548 if get_attr and v is not None:
2549 v = getattr(v, get_attr, None)
2550 try:
2551 return int(v) * invscale // scale
2552 except (ValueError, TypeError, OverflowError):
2553 return default
2554
2555
2556 def str_or_none(v, default=None):
2557 return default if v is None else compat_str(v)
2558
2559
2560 def str_to_int(int_str):
2561 """ A more relaxed version of int_or_none """
2562 if isinstance(int_str, compat_integer_types):
2563 return int_str
2564 elif isinstance(int_str, compat_str):
2565 int_str = re.sub(r'[,\.\+]', '', int_str)
2566 return int_or_none(int_str)
2567
2568
2569 def float_or_none(v, scale=1, invscale=1, default=None):
2570 if v is None:
2571 return default
2572 try:
2573 return float(v) * invscale / scale
2574 except (ValueError, TypeError):
2575 return default
2576
2577
2578 def bool_or_none(v, default=None):
2579 return v if isinstance(v, bool) else default
2580
2581
2582 def strip_or_none(v, default=None):
2583 return v.strip() if isinstance(v, compat_str) else default
2584
2585
2586 def url_or_none(url):
2587 if not url or not isinstance(url, compat_str):
2588 return None
2589 url = url.strip()
2590 return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None
2591
2592
2593 def request_to_url(req):
2594 if isinstance(req, compat_urllib_request.Request):
2595 return req.get_full_url()
2596 else:
2597 return req
2598
2599
2600 def strftime_or_none(timestamp, date_format, default=None):
2601 datetime_object = None
2602 try:
2603 if isinstance(timestamp, compat_numeric_types): # unix timestamp
2604 datetime_object = datetime.datetime.utcfromtimestamp(timestamp)
2605 elif isinstance(timestamp, compat_str): # assume YYYYMMDD
2606 datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d')
2607 return datetime_object.strftime(date_format)
2608 except (ValueError, TypeError, AttributeError):
2609 return default
2610
2611
2612 def parse_duration(s):
2613 if not isinstance(s, compat_basestring):
2614 return None
2615 s = s.strip()
2616 if not s:
2617 return None
2618
2619 days, hours, mins, secs, ms = [None] * 5
2620 m = re.match(r'''(?x)
2621 (?P<before_secs>
2622 (?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?
2623 (?P<secs>(?(before_secs)[0-9]{1,2}|[0-9]+))
2624 (?P<ms>[.:][0-9]+)?Z?$
2625 ''', s)
2626 if m:
2627 days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms')
2628 else:
2629 m = re.match(
2630 r'''(?ix)(?:P?
2631 (?:
2632 [0-9]+\s*y(?:ears?)?\s*
2633 )?
2634 (?:
2635 [0-9]+\s*m(?:onths?)?\s*
2636 )?
2637 (?:
2638 [0-9]+\s*w(?:eeks?)?\s*
2639 )?
2640 (?:
2641 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
2642 )?
2643 T)?
2644 (?:
2645 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
2646 )?
2647 (?:
2648 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
2649 )?
2650 (?:
2651 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
2652 )?Z?$''', s)
2653 if m:
2654 days, hours, mins, secs, ms = m.groups()
2655 else:
2656 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
2657 if m:
2658 hours, mins = m.groups()
2659 else:
2660 return None
2661
2662 duration = 0
2663 if secs:
2664 duration += float(secs)
2665 if mins:
2666 duration += float(mins) * 60
2667 if hours:
2668 duration += float(hours) * 60 * 60
2669 if days:
2670 duration += float(days) * 24 * 60 * 60
2671 if ms:
2672 duration += float(ms.replace(':', '.'))
2673 return duration
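# Illustrative examples (not part of the original module):
#   parse_duration('1:30')     == 90.0
#   parse_duration('2h 30m')   == 9000.0
#   parse_duration('PT1H30M')  == 5400.0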
2674
2675
2676 def prepend_extension(filename, ext, expected_real_ext=None):
2677 name, real_ext = os.path.splitext(filename)
2678 return (
2679 '{0}.{1}{2}'.format(name, ext, real_ext)
2680 if not expected_real_ext or real_ext[1:] == expected_real_ext
2681 else '{0}.{1}'.format(filename, ext))
2682
2683
2684 def replace_extension(filename, ext, expected_real_ext=None):
2685 name, real_ext = os.path.splitext(filename)
2686 return '{0}.{1}'.format(
2687 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
2688 ext)
2689
2690
2691 def check_executable(exe, args=[]):
2692 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
2693 args can be a list of arguments for a short output (like -version) """
2694 try:
2695 Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill()
2696 except OSError:
2697 return False
2698 return exe
2699
2700
2701 def _get_exe_version_output(exe, args):
2702 try:
2703 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
2704 # SIGTTOU if yt-dlp is run in the background.
2705 # See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656
2706 out, _ = Popen(
2707 [encodeArgument(exe)] + args, stdin=subprocess.PIPE,
2708 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill()
2709 except OSError:
2710 return False
2711 if isinstance(out, bytes): # Python 2.x
2712 out = out.decode('ascii', 'ignore')
2713 return out
2714
2715
2716 def detect_exe_version(output, version_re=None, unrecognized='present'):
2717 assert isinstance(output, compat_str)
2718 if version_re is None:
2719 version_re = r'version\s+([-0-9._a-zA-Z]+)'
2720 m = re.search(version_re, output)
2721 if m:
2722 return m.group(1)
2723 else:
2724 return unrecognized
2725
2726
2727 def get_exe_version(exe, args=['--version'],
2728 version_re=None, unrecognized='present'):
2729 """ Returns the version of the specified executable,
2730 or False if the executable is not present """
2731 out = _get_exe_version_output(exe, args)
2732 return detect_exe_version(out, version_re, unrecognized) if out else False
2733
2734
2735 class LazyList(collections.abc.Sequence):
2736 ''' Lazy immutable list from an iterable
2737 Note that slices of a LazyList are lists and not LazyList'''
2738
2739 class IndexError(IndexError):
2740 pass
2741
2742 def __init__(self, iterable, *, reverse=False, _cache=None):
2743 self.__iterable = iter(iterable)
2744 self.__cache = [] if _cache is None else _cache
2745 self.__reversed = reverse
2746
2747 def __iter__(self):
2748 if self.__reversed:
2749 # We need to consume the entire iterable to iterate in reverse
2750 yield from self.exhaust()
2751 return
2752 yield from self.__cache
2753 for item in self.__iterable:
2754 self.__cache.append(item)
2755 yield item
2756
2757 def __exhaust(self):
2758 self.__cache.extend(self.__iterable)
2759 # Discard the emptied iterable to make it pickle-able
2760 self.__iterable = []
2761 return self.__cache
2762
2763 def exhaust(self):
2764 ''' Evaluate the entire iterable '''
2765 return self.__exhaust()[::-1 if self.__reversed else 1]
2766
2767 @staticmethod
2768 def __reverse_index(x):
2769 return None if x is None else -(x + 1)
2770
2771 def __getitem__(self, idx):
2772 if isinstance(idx, slice):
2773 if self.__reversed:
2774 idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1))
2775 start, stop, step = idx.start, idx.stop, idx.step or 1
2776 elif isinstance(idx, int):
2777 if self.__reversed:
2778 idx = self.__reverse_index(idx)
2779 start, stop, step = idx, idx, 0
2780 else:
2781 raise TypeError('indices must be integers or slices')
2782 if ((start or 0) < 0 or (stop or 0) < 0
2783 or (start is None and step < 0)
2784 or (stop is None and step > 0)):
2785 # We need to consume the entire iterable to be able to slice from the end
2786 # Obviously, never use this with infinite iterables
2787 self.__exhaust()
2788 try:
2789 return self.__cache[idx]
2790 except IndexError as e:
2791 raise self.IndexError(e) from e
2792 n = max(start or 0, stop or 0) - len(self.__cache) + 1
2793 if n > 0:
2794 self.__cache.extend(itertools.islice(self.__iterable, n))
2795 try:
2796 return self.__cache[idx]
2797 except IndexError as e:
2798 raise self.IndexError(e) from e
2799
2800 def __bool__(self):
2801 try:
2802 self[-1] if self.__reversed else self[0]
2803 except self.IndexError:
2804 return False
2805 return True
2806
2807 def __len__(self):
2808 self.__exhaust()
2809 return len(self.__cache)
2810
2811 def __reversed__(self):
2812 return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache)
2813
2814 def __copy__(self):
2815 return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache)
2816
2817 def __repr__(self):
2818 # repr and str should mimic a list. So we exhaust the iterable
2819 return repr(self.exhaust())
2820
2821 def __str__(self):
2822 return repr(self.exhaust())
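# Illustrative usage (not part of the original module):
#   LazyList(itertools.count())[5]          == 5   (consumes only items 0..5)
#   list(LazyList(range(5), reverse=True))  == [4, 3, 2, 1, 0]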
2823
2824
2825 class PagedList:
2826
2827 class IndexError(IndexError):
2828 pass
2829
2830 def __len__(self):
2831 # This is only useful for tests
2832 return len(self.getslice())
2833
2834 def __init__(self, pagefunc, pagesize, use_cache=True):
2835 self._pagefunc = pagefunc
2836 self._pagesize = pagesize
2837 self._pagecount = float('inf')
2838 self._use_cache = use_cache
2839 self._cache = {}
2840
2841 def getpage(self, pagenum):
2842 page_results = self._cache.get(pagenum)
2843 if page_results is None:
2844 page_results = [] if pagenum > self._pagecount else list(self._pagefunc(pagenum))
2845 if self._use_cache:
2846 self._cache[pagenum] = page_results
2847 return page_results
2848
2849 def getslice(self, start=0, end=None):
2850 return list(self._getslice(start, end))
2851
2852 def _getslice(self, start, end):
2853 raise NotImplementedError('This method must be implemented by subclasses')
2854
2855 def __getitem__(self, idx):
2856 assert self._use_cache, 'Indexing PagedList requires cache'
2857 if not isinstance(idx, int) or idx < 0:
2858 raise TypeError('indices must be non-negative integers')
2859 entries = self.getslice(idx, idx + 1)
2860 if not entries:
2861 raise self.IndexError()
2862 return entries[0]
2863
2864
2865 class OnDemandPagedList(PagedList):
2866 def _getslice(self, start, end):
2867 for pagenum in itertools.count(start // self._pagesize):
2868 firstid = pagenum * self._pagesize
2869 nextfirstid = pagenum * self._pagesize + self._pagesize
2870 if start >= nextfirstid:
2871 continue
2872
2873 startv = (
2874 start % self._pagesize
2875 if firstid <= start < nextfirstid
2876 else 0)
2877 endv = (
2878 ((end - 1) % self._pagesize) + 1
2879 if (end is not None and firstid <= end <= nextfirstid)
2880 else None)
2881
2882 try:
2883 page_results = self.getpage(pagenum)
2884 except Exception:
2885 self._pagecount = pagenum - 1
2886 raise
2887 if startv != 0 or endv is not None:
2888 page_results = page_results[startv:endv]
2889 yield from page_results
2890
2891 # A little optimization - if the current page is not "full", i.e. does
2892 # not contain page_size videos, then we can assume that this page
2893 # is the last one - there are no more ids on further pages -
2894 # so there is no need to query again.
2895 if len(page_results) + startv < self._pagesize:
2896 break
2897
2898 # If we got the whole page, but the next page is not interesting,
2899 # break out early as well
2900 if end == nextfirstid:
2901 break
2902
2903
2904 class InAdvancePagedList(PagedList):
2905 def __init__(self, pagefunc, pagecount, pagesize):
2906 PagedList.__init__(self, pagefunc, pagesize, True)
2907 self._pagecount = pagecount
2908
2909 def _getslice(self, start, end):
2910 start_page = start // self._pagesize
2911 end_page = self._pagecount if end is None else min(self._pagecount, end // self._pagesize + 1)
2912 skip_elems = start - start_page * self._pagesize
2913 only_more = None if end is None else end - start
2914 for pagenum in range(start_page, end_page):
2915 page_results = self.getpage(pagenum)
2916 if skip_elems:
2917 page_results = page_results[skip_elems:]
2918 skip_elems = None
2919 if only_more is not None:
2920 if len(page_results) < only_more:
2921 only_more -= len(page_results)
2922 else:
2923 yield from page_results[:only_more]
2924 break
2925 yield from page_results
2926
2927
2928 def uppercase_escape(s):
2929 unicode_escape = codecs.getdecoder('unicode_escape')
2930 return re.sub(
2931 r'\\U[0-9a-fA-F]{8}',
2932 lambda m: unicode_escape(m.group(0))[0],
2933 s)
2934
2935
2936 def lowercase_escape(s):
2937 unicode_escape = codecs.getdecoder('unicode_escape')
2938 return re.sub(
2939 r'\\u[0-9a-fA-F]{4}',
2940 lambda m: unicode_escape(m.group(0))[0],
2941 s)
2942
2943
2944 def escape_rfc3986(s):
2945 """Escape non-ASCII characters as suggested by RFC 3986"""
2946 if sys.version_info < (3, 0) and isinstance(s, compat_str):
2947 s = s.encode('utf-8')
2948 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
2949
2950
2951 def escape_url(url):
2952 """Escape URL as suggested by RFC 3986"""
2953 url_parsed = compat_urllib_parse_urlparse(url)
2954 return url_parsed._replace(
2955 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
2956 path=escape_rfc3986(url_parsed.path),
2957 params=escape_rfc3986(url_parsed.params),
2958 query=escape_rfc3986(url_parsed.query),
2959 fragment=escape_rfc3986(url_parsed.fragment)
2960 ).geturl()
2961
2962
2963 def parse_qs(url):
2964 return compat_parse_qs(compat_urllib_parse_urlparse(url).query)
2965
2966
2967 def read_batch_urls(batch_fd):
2968 def fixup(url):
2969 if not isinstance(url, compat_str):
2970 url = url.decode('utf-8', 'replace')
2971 BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
2972 for bom in BOM_UTF8:
2973 if url.startswith(bom):
2974 url = url[len(bom):]
2975 url = url.lstrip()
2976 if not url or url.startswith(('#', ';', ']')):
2977 return False
2978 # "#" cannot be stripped out since it is part of the URI
2979 # However, it can be safely stripped out if it follows a whitespace
2980 return re.split(r'\s#', url, 1)[0].rstrip()
2981
2982 with contextlib.closing(batch_fd) as fd:
2983 return [url for url in map(fixup, fd) if url]
2984
2985
2986 def urlencode_postdata(*args, **kargs):
2987 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
2988
2989
2990 def update_url_query(url, query):
2991 if not query:
2992 return url
2993 parsed_url = compat_urlparse.urlparse(url)
2994 qs = compat_parse_qs(parsed_url.query)
2995 qs.update(query)
2996 return compat_urlparse.urlunparse(parsed_url._replace(
2997 query=compat_urllib_parse_urlencode(qs, True)))
2998
2999
3000 def update_Request(req, url=None, data=None, headers={}, query={}):
3001 req_headers = req.headers.copy()
3002 req_headers.update(headers)
3003 req_data = data or req.data
3004 req_url = update_url_query(url or req.get_full_url(), query)
3005 req_get_method = req.get_method()
3006 if req_get_method == 'HEAD':
3007 req_type = HEADRequest
3008 elif req_get_method == 'PUT':
3009 req_type = PUTRequest
3010 else:
3011 req_type = compat_urllib_request.Request
3012 new_req = req_type(
3013 req_url, data=req_data, headers=req_headers,
3014 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
3015 if hasattr(req, 'timeout'):
3016 new_req.timeout = req.timeout
3017 return new_req
3018
3019
3020 def _multipart_encode_impl(data, boundary):
3021 content_type = 'multipart/form-data; boundary=%s' % boundary
3022
3023 out = b''
3024 for k, v in data.items():
3025 out += b'--' + boundary.encode('ascii') + b'\r\n'
3026 if isinstance(k, compat_str):
3027 k = k.encode('utf-8')
3028 if isinstance(v, compat_str):
3029 v = v.encode('utf-8')
3030 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
3031 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
3032 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
3033 if boundary.encode('ascii') in content:
3034 raise ValueError('Boundary overlaps with data')
3035 out += content
3036
3037 out += b'--' + boundary.encode('ascii') + b'--\r\n'
3038
3039 return out, content_type
3040
3041
3042 def multipart_encode(data, boundary=None):
3043 '''
3044 Encode a dict to RFC 7578-compliant form-data
3045
3046 data:
3047 A dict where keys and values can be either Unicode or bytes-like
3048 objects.
3049 boundary:
3050 If specified, it must be a Unicode object and is used as the boundary.
3051 Otherwise a random boundary is generated.
3052
3053 Reference: https://tools.ietf.org/html/rfc7578
3054 '''
3055 has_specified_boundary = boundary is not None
3056
3057 while True:
3058 if boundary is None:
3059 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
3060
3061 try:
3062 out, content_type = _multipart_encode_impl(data, boundary)
3063 break
3064 except ValueError:
3065 if has_specified_boundary:
3066 raise
3067 boundary = None
3068
3069 return out, content_type
3070
3071
3072 def dict_get(d, key_or_keys, default=None, skip_false_values=True):
3073 if isinstance(key_or_keys, (list, tuple)):
3074 for key in key_or_keys:
3075 if key not in d or d[key] is None or skip_false_values and not d[key]:
3076 continue
3077 return d[key]
3078 return default
3079 return d.get(key_or_keys, default)
3080
3081
3082 def try_get(src, getter, expected_type=None):
3083 for get in variadic(getter):
3084 try:
3085 v = get(src)
3086 except (AttributeError, KeyError, TypeError, IndexError):
3087 pass
3088 else:
3089 if expected_type is None or isinstance(v, expected_type):
3090 return v
3091
3092
3093 def merge_dicts(*dicts):
3094 merged = {}
3095 for a_dict in dicts:
3096 for k, v in a_dict.items():
3097 if v is None:
3098 continue
3099 if (k not in merged
3100 or (isinstance(v, compat_str) and v
3101 and isinstance(merged[k], compat_str)
3102 and not merged[k])):
3103 merged[k] = v
3104 return merged
3105
3106
3107 def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
3108 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
3109
3110
3111 US_RATINGS = {
3112 'G': 0,
3113 'PG': 10,
3114 'PG-13': 13,
3115 'R': 16,
3116 'NC': 18,
3117 }
3118
3119
3120 TV_PARENTAL_GUIDELINES = {
3121 'TV-Y': 0,
3122 'TV-Y7': 7,
3123 'TV-G': 0,
3124 'TV-PG': 0,
3125 'TV-14': 14,
3126 'TV-MA': 17,
3127 }
3128
3129
3130 def parse_age_limit(s):
3131 if type(s) == int:
3132 return s if 0 <= s <= 21 else None
3133 if not isinstance(s, compat_basestring):
3134 return None
3135 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
3136 if m:
3137 return int(m.group('age'))
3138 s = s.upper()
3139 if s in US_RATINGS:
3140 return US_RATINGS[s]
3141 m = re.match(r'^TV[_-]?(%s)$' % '|'.join(k[3:] for k in TV_PARENTAL_GUIDELINES), s)
3142 if m:
3143 return TV_PARENTAL_GUIDELINES['TV-' + m.group(1)]
3144 return None
3145
3146
3147 def strip_jsonp(code):
3148 return re.sub(
3149 r'''(?sx)^
3150 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]*)
3151 (?:\s*&&\s*(?P=func_name))?
3152 \s*\(\s*(?P<callback_data>.*)\);?
3153 \s*?(?://[^\n]*)*$''',
3154 r'\g<callback_data>', code)
3155
3156
3157 def js_to_json(code, vars={}):
3158 # vars is a dict of var, val pairs to substitute
3159 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n'
3160 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
3161 INTEGER_TABLE = (
3162 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
3163 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
3164 )
3165
3166 def fix_kv(m):
3167 v = m.group(0)
3168 if v in ('true', 'false', 'null'):
3169 return v
3170 elif v in ('undefined', 'void 0'):
3171 return 'null'
3172 elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
3173 return ""
3174
3175 if v[0] in ("'", '"'):
3176 v = re.sub(r'(?s)\\.|"', lambda m: {
3177 '"': '\\"',
3178 "\\'": "'",
3179 '\\\n': '',
3180 '\\x': '\\u00',
3181 }.get(m.group(0), m.group(0)), v[1:-1])
3182 else:
3183 for regex, base in INTEGER_TABLE:
3184 im = re.match(regex, v)
3185 if im:
3186 i = int(im.group(1), base)
3187 return '"%d":' % i if v.endswith(':') else '%d' % i
3188
3189 if v in vars:
3190 return vars[v]
3191
3192 return '"%s"' % v
3193
3194 code = re.sub(r'new Date\((".+")\)', r'\g<1>', code)
3195
3196 return re.sub(r'''(?sx)
3197 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
3198 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
3199 {comment}|,(?={skip}[\]}}])|
3200 void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*|
3201 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
3202 [0-9]+(?={skip}:)|
3203 !+
3204 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
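# Illustrative examples (not part of the original module):
#   js_to_json("{abc: true}")  == '{"abc": true}'
#   js_to_json("{'x': 0x10}")  == '{"x": 16}'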
3205
3206
3207 def qualities(quality_ids):
3208 """ Get a numeric quality value out of a list of possible values """
3209 def q(qid):
3210 try:
3211 return quality_ids.index(qid)
3212 except ValueError:
3213 return -1
3214 return q
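# Illustrative usage (not part of the original module):
#   q = qualities(['240p', '360p', '720p'])
#   q('720p')   == 2
#   q('1080p')  == -1  (unknown values sort lowest)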
3215
3216
3217 POSTPROCESS_WHEN = {'pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'}
3218
3219
3220 DEFAULT_OUTTMPL = {
3221 'default': '%(title)s [%(id)s].%(ext)s',
3222 'chapter': '%(title)s - %(section_number)03d %(section_title)s [%(id)s].%(ext)s',
3223 }
3224 OUTTMPL_TYPES = {
3225 'chapter': None,
3226 'subtitle': None,
3227 'thumbnail': None,
3228 'description': 'description',
3229 'annotation': 'annotations.xml',
3230 'infojson': 'info.json',
3231 'link': None,
3232 'pl_video': None,
3233 'pl_thumbnail': None,
3234 'pl_description': 'description',
3235 'pl_infojson': 'info.json',
3236 }
3237
3238 # As of [1] format syntax is:
3239 # %[mapping_key][conversion_flags][minimum_width][.precision][length_modifier]type
3240 # 1. https://docs.python.org/2/library/stdtypes.html#string-formatting
3241 STR_FORMAT_RE_TMPL = r'''(?x)
3242 (?<!%)(?P<prefix>(?:%%)*)
3243 %
3244 (?P<has_key>\((?P<key>{0})\))?
3245 (?P<format>
3246 (?P<conversion>[#0\-+ ]+)?
3247 (?P<min_width>\d+)?
3248 (?P<precision>\.\d+)?
3249 (?P<len_mod>[hlL])? # unused in python
3250 {1} # conversion type
3251 )
3252 '''
3253
3254
3255 STR_FORMAT_TYPES = 'diouxXeEfFgGcrs'
3256
3257
3258 def limit_length(s, length):
3259 """ Add ellipses to overly long strings """
3260 if s is None:
3261 return None
3262 ELLIPSES = '...'
3263 if len(s) > length:
3264 return s[:length - len(ELLIPSES)] + ELLIPSES
3265 return s
3266
3267
3268 def version_tuple(v):
3269 return tuple(int(e) for e in re.split(r'[-.]', v))
3270
3271
3272 def is_outdated_version(version, limit, assume_new=True):
3273 if not version:
3274 return not assume_new
3275 try:
3276 return version_tuple(version) < version_tuple(limit)
3277 except ValueError:
3278 return not assume_new
3279
3280
3281 def ytdl_is_updateable():
3282 """ Returns if yt-dlp can be updated with -U """
3283
3284 from .update import is_non_updateable
3285
3286 return not is_non_updateable()
3287
3288
3289 def args_to_str(args):
3290 # Get a short string representation for a subprocess command
3291 return ' '.join(compat_shlex_quote(a) for a in args)
3292
3293
3294 def error_to_compat_str(err):
3295 err_str = str(err)
3296 # On Python 2, the error byte string must be decoded with the proper
3297 # encoding rather than ASCII
3298 if sys.version_info[0] < 3:
3299 err_str = err_str.decode(preferredencoding())
3300 return err_str
3301
3302
3303 def mimetype2ext(mt):
3304 if mt is None:
3305 return None
3306
3307 mt, _, params = mt.partition(';')
3308 mt = mt.strip()
3309
3310 FULL_MAP = {
3311 'audio/mp4': 'm4a',
3312 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. .mp3 is used here as
3313 # it's the most popular one
3314 'audio/mpeg': 'mp3',
3315 'audio/x-wav': 'wav',
3316 'audio/wav': 'wav',
3317 'audio/wave': 'wav',
3318 }
3319
3320 ext = FULL_MAP.get(mt)
3321 if ext is not None:
3322 return ext
3323
3324 SUBTYPE_MAP = {
3325 '3gpp': '3gp',
3326 'smptett+xml': 'tt',
3327 'ttaf+xml': 'dfxp',
3328 'ttml+xml': 'ttml',
3329 'x-flv': 'flv',
3330 'x-mp4-fragmented': 'mp4',
3331 'x-ms-sami': 'sami',
3332 'x-ms-wmv': 'wmv',
3333 'mpegurl': 'm3u8',
3334 'x-mpegurl': 'm3u8',
3335 'vnd.apple.mpegurl': 'm3u8',
3336 'dash+xml': 'mpd',
3337 'f4m+xml': 'f4m',
3338 'hds+xml': 'f4m',
3339 'vnd.ms-sstr+xml': 'ism',
3340 'quicktime': 'mov',
3341 'mp2t': 'ts',
3342 'x-wav': 'wav',
3343 'filmstrip+json': 'fs',
3344 'svg+xml': 'svg',
3345 }
3346
3347 _, _, subtype = mt.rpartition('/')
3348 ext = SUBTYPE_MAP.get(subtype.lower())
3349 if ext is not None:
3350 return ext
3351
3352 SUFFIX_MAP = {
3353 'json': 'json',
3354 'xml': 'xml',
3355 'zip': 'zip',
3356 'gzip': 'gz',
3357 }
3358
3359 _, _, suffix = subtype.partition('+')
3360 ext = SUFFIX_MAP.get(suffix)
3361 if ext is not None:
3362 return ext
3363
3364 return subtype.replace('+', '.')
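# Illustrative examples (not part of the original module):
#   mimetype2ext('audio/mp4')              == 'm4a'
#   mimetype2ext('application/x-mpegURL')  == 'm3u8'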
3365
3366
3367 def ext2mimetype(ext_or_url):
3368 if not ext_or_url:
3369 return None
3370 if '.' not in ext_or_url:
3371 ext_or_url = f'file.{ext_or_url}'
3372 return mimetypes.guess_type(ext_or_url)[0]
3373
3374
3375 def parse_codecs(codecs_str):
3376 # http://tools.ietf.org/html/rfc6381
3377 if not codecs_str:
3378 return {}
3379 split_codecs = list(filter(None, map(
3380 str.strip, codecs_str.strip().strip(',').split(','))))
3381 vcodec, acodec, tcodec, hdr = None, None, None, None
3382 for full_codec in split_codecs:
3383 parts = full_codec.split('.')
3384 codec = parts[0].replace('0', '')
3385 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2',
3386 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'):
3387 if not vcodec:
3388 vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec
3389 if codec in ('dvh1', 'dvhe'):
3390 hdr = 'DV'
3391 elif codec == 'av1' and len(parts) > 3 and parts[3] == '10':
3392 hdr = 'HDR10'
3393 elif full_codec.replace('0', '').startswith('vp9.2'):
3394 hdr = 'HDR10'
3395 elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
3396 if not acodec:
3397 acodec = full_codec
3398 elif codec in ('stpp', 'wvtt',):
3399 if not tcodec:
3400 tcodec = full_codec
3401 else:
3402 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
3403 if vcodec or acodec or tcodec:
3404 return {
3405 'vcodec': vcodec or 'none',
3406 'acodec': acodec or 'none',
3407 'dynamic_range': hdr,
3408 **({'tcodec': tcodec} if tcodec is not None else {}),
3409 }
3410 elif len(split_codecs) == 2:
3411 return {
3412 'vcodec': split_codecs[0],
3413 'acodec': split_codecs[1],
3414 }
3415 return {}
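# Illustrative example (not part of the original module):
#   parse_codecs('avc1.64001f, mp4a.40.2')  ==
#       {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2', 'dynamic_range': None}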
3416
3417
3418 def urlhandle_detect_ext(url_handle):
3419 getheader = url_handle.headers.get
3420
3421 cd = getheader('Content-Disposition')
3422 if cd:
3423 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
3424 if m:
3425 e = determine_ext(m.group('filename'), default_ext=None)
3426 if e:
3427 return e
3428
3429 return mimetype2ext(getheader('Content-Type'))
3430
3431
3432 def encode_data_uri(data, mime_type):
3433 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
3434
3435
3436 def age_restricted(content_limit, age_limit):
3437 """ Returns True iff the content should be blocked """
3438
3439 if age_limit is None: # No limit set
3440 return False
3441 if content_limit is None:
3442 return False # Content available for everyone
3443 return age_limit < content_limit
3444
3445
3446 def is_html(first_bytes):
3447 """ Detect whether a file contains HTML by examining its first bytes. """
3448
3449 BOMS = [
3450 (b'\xef\xbb\xbf', 'utf-8'),
3451 (b'\x00\x00\xfe\xff', 'utf-32-be'),
3452 (b'\xff\xfe\x00\x00', 'utf-32-le'),
3453 (b'\xff\xfe', 'utf-16-le'),
3454 (b'\xfe\xff', 'utf-16-be'),
3455 ]
3456 for bom, enc in BOMS:
3457 if first_bytes.startswith(bom):
3458 s = first_bytes[len(bom):].decode(enc, 'replace')
3459 break
3460 else:
3461 s = first_bytes.decode('utf-8', 'replace')
3462
3463 return re.match(r'^\s*<', s)
3464
3465
3466 def determine_protocol(info_dict):
3467 protocol = info_dict.get('protocol')
3468 if protocol is not None:
3469 return protocol
3470
3471 url = sanitize_url(info_dict['url'])
3472 if url.startswith('rtmp'):
3473 return 'rtmp'
3474 elif url.startswith('mms'):
3475 return 'mms'
3476 elif url.startswith('rtsp'):
3477 return 'rtsp'
3478
3479 ext = determine_ext(url)
3480 if ext == 'm3u8':
3481 return 'm3u8'
3482 elif ext == 'f4m':
3483 return 'f4m'
3484
3485 return compat_urllib_parse_urlparse(url).scheme
3486
3487
3488 def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False):
3489 """ Render a list of rows, each as a list of values.
3490 Text after a \t will be right-aligned """
3491 def width(string):
3492 return len(remove_terminal_sequences(string).replace('\t', ''))
3493
3494 def get_max_lens(table):
3495 return [max(width(str(v)) for v in col) for col in zip(*table)]
3496
3497 def filter_using_list(row, filterArray):
3498 return [col for take, col in itertools.zip_longest(filterArray, row, fillvalue=True) if take]
3499
3500 max_lens = get_max_lens(data) if hide_empty else []
3501 header_row = filter_using_list(header_row, max_lens)
3502 data = [filter_using_list(row, max_lens) for row in data]
3503
3504 table = [header_row] + data
3505 max_lens = get_max_lens(table)
3506 extra_gap += 1
3507 if delim:
3508 table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data
3509 table[1][-1] = table[1][-1][:-extra_gap * len(delim)] # Remove extra_gap from end of delimiter
3510 for row in table:
3511 for pos, text in enumerate(map(str, row)):
3512 if '\t' in text:
3513 row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap
3514 else:
3515 row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap)
3516 ret = '\n'.join(''.join(row).rstrip() for row in table)
3517 return ret
3518
3519
3520 def _match_one(filter_part, dct, incomplete):
3521 # TODO: Generalize code with YoutubeDL._build_format_filter
3522 STRING_OPERATORS = {
3523 '*=': operator.contains,
3524 '^=': lambda attr, value: attr.startswith(value),
3525 '$=': lambda attr, value: attr.endswith(value),
3526 '~=': lambda attr, value: re.search(value, attr),
3527 }
3528 COMPARISON_OPERATORS = {
3529 **STRING_OPERATORS,
3530 '<=': operator.le, # "<=" must be defined above "<"
3531 '<': operator.lt,
3532 '>=': operator.ge,
3533 '>': operator.gt,
3534 '=': operator.eq,
3535 }
3536
3537 operator_rex = re.compile(r'''(?x)\s*
3538 (?P<key>[a-z_]+)
3539 \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
3540 (?:
3541 (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)|
3542 (?P<strval>.+?)
3543 )
3544 \s*$
3545 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
3546 m = operator_rex.search(filter_part)
3547 if m:
3548 m = m.groupdict()
3549 unnegated_op = COMPARISON_OPERATORS[m['op']]
3550 if m['negation']:
3551 op = lambda attr, value: not unnegated_op(attr, value)
3552 else:
3553 op = unnegated_op
3554 comparison_value = m['quotedstrval'] or m['strval']
3555 if m['quote']:
3556 comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote'])
3557 actual_value = dct.get(m['key'])
3558 numeric_comparison = None
3559 if isinstance(actual_value, compat_numeric_types):
3560 # If the original field is a string and the matching comparison value is
3561 # a number, we should respect the origin of the original field
3562 # and process the comparison value as a string (see
3563 # https://github.com/ytdl-org/youtube-dl/issues/11082)
3564 try:
3565 numeric_comparison = int(comparison_value)
3566 except ValueError:
3567 numeric_comparison = parse_filesize(comparison_value)
3568 if numeric_comparison is None:
3569 numeric_comparison = parse_filesize(f'{comparison_value}B')
3570 if numeric_comparison is None:
3571 numeric_comparison = parse_duration(comparison_value)
3572 if numeric_comparison is not None and m['op'] in STRING_OPERATORS:
3573 raise ValueError('Operator %s only supports string values!' % m['op'])
3574 if actual_value is None:
3575 return incomplete or m['none_inclusive']
3576 return op(actual_value, comparison_value if numeric_comparison is None else numeric_comparison)
3577
3578 UNARY_OPERATORS = {
3579 '': lambda v: (v is True) if isinstance(v, bool) else (v is not None),
3580 '!': lambda v: (v is False) if isinstance(v, bool) else (v is None),
3581 }
3582 operator_rex = re.compile(r'''(?x)\s*
3583 (?P<op>%s)\s*(?P<key>[a-z_]+)
3584 \s*$
3585 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
3586 m = operator_rex.search(filter_part)
3587 if m:
3588 op = UNARY_OPERATORS[m.group('op')]
3589 actual_value = dct.get(m.group('key'))
3590 if incomplete and actual_value is None:
3591 return True
3592 return op(actual_value)
3593
3594 raise ValueError('Invalid filter part %r' % filter_part)
3595
3596
3597 def match_str(filter_str, dct, incomplete=False):
3598 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false
3599 When incomplete, all conditions passes on missing fields
3600 """
3601 return all(
3602 _match_one(filter_part.replace(r'\&', '&'), dct, incomplete)
3603 for filter_part in re.split(r'(?<!\\)&', filter_str))
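# Filter-syntax sketch (hypothetical dicts, for illustration only):
#   match_str('duration > 30 & description', {'duration': 60, 'description': 'x'})  # True
#   match_str('like_count > 100 & dislike_count <? 50', {'like_count': 190})        # True
# The trailing '?' makes a comparison pass when the field is missing.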
3604
3605
3606 def match_filter_func(filter_str):
3607 if filter_str is None:
3608 return None
3609
3610 def _match_func(info_dict, *args, **kwargs):
3611 if match_str(filter_str, info_dict, *args, **kwargs):
3612 return None
3613 else:
3614 video_title = info_dict.get('title', info_dict.get('id', 'video'))
3615 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
3616 return _match_func
3617
3618
3619 def parse_dfxp_time_expr(time_expr):
3620 if not time_expr:
3621 return
3622
3623 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
3624 if mobj:
3625 return float(mobj.group('time_offset'))
3626
3627 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
3628 if mobj:
3629 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
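# The two accepted forms, with illustrative values:
#   parse_dfxp_time_expr('5.1s')        # => 5.1
#   parse_dfxp_time_expr('00:01:02.5')  # => 62.5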
3630
3631
3632 def srt_subtitles_timecode(seconds):
3633 return '%02d:%02d:%02d,%03d' % timetuple_from_msec(seconds * 1000)
3634
3635
3636 def ass_subtitles_timecode(seconds):
3637 time = timetuple_from_msec(seconds * 1000)
3638 return '%01d:%02d:%02d.%02d' % (*time[:-1], time.milliseconds / 10)
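# Illustrative outputs (assuming timetuple_from_msec as defined earlier in this module):
#   srt_subtitles_timecode(3.5)  # => '00:00:03,500'
#   ass_subtitles_timecode(3.5)  # => '0:00:03.50'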
3639
3640
3641 def dfxp2srt(dfxp_data):
3642 '''
3643 @param dfxp_data A bytes-like object containing DFXP data
3644 @returns A unicode object containing converted SRT data
3645 '''
3646 LEGACY_NAMESPACES = (
3647 (b'http://www.w3.org/ns/ttml', [
3648 b'http://www.w3.org/2004/11/ttaf1',
3649 b'http://www.w3.org/2006/04/ttaf1',
3650 b'http://www.w3.org/2006/10/ttaf1',
3651 ]),
3652 (b'http://www.w3.org/ns/ttml#styling', [
3653 b'http://www.w3.org/ns/ttml#style',
3654 ]),
3655 )
3656
3657 SUPPORTED_STYLING = [
3658 'color',
3659 'fontFamily',
3660 'fontSize',
3661 'fontStyle',
3662 'fontWeight',
3663 'textDecoration'
3664 ]
3665
3666 _x = functools.partial(xpath_with_ns, ns_map={
3667 'xml': 'http://www.w3.org/XML/1998/namespace',
3668 'ttml': 'http://www.w3.org/ns/ttml',
3669 'tts': 'http://www.w3.org/ns/ttml#styling',
3670 })
3671
3672 styles = {}
3673 default_style = {}
3674
3675 class TTMLPElementParser(object):
3676 _out = ''
3677 _unclosed_elements = []
3678 _applied_styles = []
3679
3680 def start(self, tag, attrib):
3681 if tag in (_x('ttml:br'), 'br'):
3682 self._out += '\n'
3683 else:
3684 unclosed_elements = []
3685 style = {}
3686 element_style_id = attrib.get('style')
3687 if default_style:
3688 style.update(default_style)
3689 if element_style_id:
3690 style.update(styles.get(element_style_id, {}))
3691 for prop in SUPPORTED_STYLING:
3692 prop_val = attrib.get(_x('tts:' + prop))
3693 if prop_val:
3694 style[prop] = prop_val
3695 if style:
3696 font = ''
3697 for k, v in sorted(style.items()):
3698 if self._applied_styles and self._applied_styles[-1].get(k) == v:
3699 continue
3700 if k == 'color':
3701 font += ' color="%s"' % v
3702 elif k == 'fontSize':
3703 font += ' size="%s"' % v
3704 elif k == 'fontFamily':
3705 font += ' face="%s"' % v
3706 elif k == 'fontWeight' and v == 'bold':
3707 self._out += '<b>'
3708 unclosed_elements.append('b')
3709 elif k == 'fontStyle' and v == 'italic':
3710 self._out += '<i>'
3711 unclosed_elements.append('i')
3712 elif k == 'textDecoration' and v == 'underline':
3713 self._out += '<u>'
3714 unclosed_elements.append('u')
3715 if font:
3716 self._out += '<font' + font + '>'
3717 unclosed_elements.append('font')
3718 applied_style = {}
3719 if self._applied_styles:
3720 applied_style.update(self._applied_styles[-1])
3721 applied_style.update(style)
3722 self._applied_styles.append(applied_style)
3723 self._unclosed_elements.append(unclosed_elements)
3724
3725 def end(self, tag):
3726 if tag not in (_x('ttml:br'), 'br'):
3727 unclosed_elements = self._unclosed_elements.pop()
3728 for element in reversed(unclosed_elements):
3729 self._out += '</%s>' % element
3730 if unclosed_elements and self._applied_styles:
3731 self._applied_styles.pop()
3732
3733 def data(self, data):
3734 self._out += data
3735
3736 def close(self):
3737 return self._out.strip()
3738
3739 def parse_node(node):
3740 target = TTMLPElementParser()
3741 parser = xml.etree.ElementTree.XMLParser(target=target)
3742 parser.feed(xml.etree.ElementTree.tostring(node))
3743 return parser.close()
3744
3745 for k, v in LEGACY_NAMESPACES:
3746 for ns in v:
3747 dfxp_data = dfxp_data.replace(ns, k)
3748
3749 dfxp = compat_etree_fromstring(dfxp_data)
3750 out = []
3751 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
3752
3753 if not paras:
3754 raise ValueError('Invalid dfxp/TTML subtitle')
3755
3756 repeat = False
3757 while True:
3758 for style in dfxp.findall(_x('.//ttml:style')):
3759 style_id = style.get('id') or style.get(_x('xml:id'))
3760 if not style_id:
3761 continue
3762 parent_style_id = style.get('style')
3763 if parent_style_id:
3764 if parent_style_id not in styles:
3765 repeat = True
3766 continue
3767 styles[style_id] = styles[parent_style_id].copy()
3768 for prop in SUPPORTED_STYLING:
3769 prop_val = style.get(_x('tts:' + prop))
3770 if prop_val:
3771 styles.setdefault(style_id, {})[prop] = prop_val
3772 if repeat:
3773 repeat = False
3774 else:
3775 break
3776
3777 for p in ('body', 'div'):
3778 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
3779 if ele is None:
3780 continue
3781 style = styles.get(ele.get('style'))
3782 if not style:
3783 continue
3784 default_style.update(style)
3785
3786 for para, index in zip(paras, itertools.count(1)):
3787 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
3788 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
3789 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
3790 if begin_time is None:
3791 continue
3792 if not end_time:
3793 if not dur:
3794 continue
3795 end_time = begin_time + dur
3796 out.append('%d\n%s --> %s\n%s\n\n' % (
3797 index,
3798 srt_subtitles_timecode(begin_time),
3799 srt_subtitles_timecode(end_time),
3800 parse_node(para)))
3801
3802 return ''.join(out)
3803
3804
3805 def cli_option(params, command_option, param):
3806 param = params.get(param)
3807 if param:
3808 param = compat_str(param)
3809 return [command_option, param] if param is not None else []
3810
3811
3812 def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
3813 param = params.get(param)
3814 if param is None:
3815 return []
3816 assert isinstance(param, bool)
3817 if separator:
3818 return [command_option + separator + (true_value if param else false_value)]
3819 return [command_option, true_value if param else false_value]
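# Usage sketch (hypothetical params); the separator form suits tools that expect a
# single '--opt=value' argument:
#   cli_bool_option({'check': True}, '--check', 'check')                 # => ['--check', 'true']
#   cli_bool_option({'check': True}, '--check', 'check', separator='=')  # => ['--check=true']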
3820
3821
3822 def cli_valueless_option(params, command_option, param, expected_value=True):
3823 param = params.get(param)
3824 return [command_option] if param == expected_value else []
3825
3826
3827 def cli_configuration_args(argdict, keys, default=[], use_compat=True):
3828 if isinstance(argdict, (list, tuple)): # for backward compatibility
3829 if use_compat:
3830 return argdict
3831 else:
3832 argdict = None
3833 if argdict is None:
3834 return default
3835 assert isinstance(argdict, dict)
3836
3837 assert isinstance(keys, (list, tuple))
3838 for key_list in keys:
3839 arg_list = list(filter(
3840 lambda x: x is not None,
3841 [argdict.get(key.lower()) for key in variadic(key_list)]))
3842 if arg_list:
3843 return [arg for args in arg_list for arg in args]
3844 return default
3845
3846
3847 def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compat=True):
3848 main_key, exe = main_key.lower(), exe.lower()
3849 root_key = exe if main_key == exe else f'{main_key}+{exe}'
3850 keys = [f'{root_key}{k}' for k in (keys or [''])]
3851 if root_key in keys:
3852 if main_key != exe:
3853 keys.append((main_key, exe))
3854 keys.append('default')
3855 else:
3856 use_compat = False
3857 return cli_configuration_args(argdict, keys, default, use_compat)
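# Lookup-order sketch (hypothetical keys): with main_key='downloader' and exe='aria2c',
# root_key becomes 'downloader+aria2c' and the keys tried against argdict are
#   ['downloader+aria2c', ('downloader', 'aria2c'), 'default']
# where the first entry yielding any args wins (see cli_configuration_args above).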
3858
3859
3860 class ISO639Utils(object):
3861 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
3862 _lang_map = {
3863 'aa': 'aar',
3864 'ab': 'abk',
3865 'ae': 'ave',
3866 'af': 'afr',
3867 'ak': 'aka',
3868 'am': 'amh',
3869 'an': 'arg',
3870 'ar': 'ara',
3871 'as': 'asm',
3872 'av': 'ava',
3873 'ay': 'aym',
3874 'az': 'aze',
3875 'ba': 'bak',
3876 'be': 'bel',
3877 'bg': 'bul',
3878 'bh': 'bih',
3879 'bi': 'bis',
3880 'bm': 'bam',
3881 'bn': 'ben',
3882 'bo': 'bod',
3883 'br': 'bre',
3884 'bs': 'bos',
3885 'ca': 'cat',
3886 'ce': 'che',
3887 'ch': 'cha',
3888 'co': 'cos',
3889 'cr': 'cre',
3890 'cs': 'ces',
3891 'cu': 'chu',
3892 'cv': 'chv',
3893 'cy': 'cym',
3894 'da': 'dan',
3895 'de': 'deu',
3896 'dv': 'div',
3897 'dz': 'dzo',
3898 'ee': 'ewe',
3899 'el': 'ell',
3900 'en': 'eng',
3901 'eo': 'epo',
3902 'es': 'spa',
3903 'et': 'est',
3904 'eu': 'eus',
3905 'fa': 'fas',
3906 'ff': 'ful',
3907 'fi': 'fin',
3908 'fj': 'fij',
3909 'fo': 'fao',
3910 'fr': 'fra',
3911 'fy': 'fry',
3912 'ga': 'gle',
3913 'gd': 'gla',
3914 'gl': 'glg',
3915 'gn': 'grn',
3916 'gu': 'guj',
3917 'gv': 'glv',
3918 'ha': 'hau',
3919 'he': 'heb',
3920 'iw': 'heb', # Replaced by he in 1989 revision
3921 'hi': 'hin',
3922 'ho': 'hmo',
3923 'hr': 'hrv',
3924 'ht': 'hat',
3925 'hu': 'hun',
3926 'hy': 'hye',
3927 'hz': 'her',
3928 'ia': 'ina',
3929 'id': 'ind',
3930 'in': 'ind', # Replaced by id in 1989 revision
3931 'ie': 'ile',
3932 'ig': 'ibo',
3933 'ii': 'iii',
3934 'ik': 'ipk',
3935 'io': 'ido',
3936 'is': 'isl',
3937 'it': 'ita',
3938 'iu': 'iku',
3939 'ja': 'jpn',
3940 'jv': 'jav',
3941 'ka': 'kat',
3942 'kg': 'kon',
3943 'ki': 'kik',
3944 'kj': 'kua',
3945 'kk': 'kaz',
3946 'kl': 'kal',
3947 'km': 'khm',
3948 'kn': 'kan',
3949 'ko': 'kor',
3950 'kr': 'kau',
3951 'ks': 'kas',
3952 'ku': 'kur',
3953 'kv': 'kom',
3954 'kw': 'cor',
3955 'ky': 'kir',
3956 'la': 'lat',
3957 'lb': 'ltz',
3958 'lg': 'lug',
3959 'li': 'lim',
3960 'ln': 'lin',
3961 'lo': 'lao',
3962 'lt': 'lit',
3963 'lu': 'lub',
3964 'lv': 'lav',
3965 'mg': 'mlg',
3966 'mh': 'mah',
3967 'mi': 'mri',
3968 'mk': 'mkd',
3969 'ml': 'mal',
3970 'mn': 'mon',
3971 'mr': 'mar',
3972 'ms': 'msa',
3973 'mt': 'mlt',
3974 'my': 'mya',
3975 'na': 'nau',
3976 'nb': 'nob',
3977 'nd': 'nde',
3978 'ne': 'nep',
3979 'ng': 'ndo',
3980 'nl': 'nld',
3981 'nn': 'nno',
3982 'no': 'nor',
3983 'nr': 'nbl',
3984 'nv': 'nav',
3985 'ny': 'nya',
3986 'oc': 'oci',
3987 'oj': 'oji',
3988 'om': 'orm',
3989 'or': 'ori',
3990 'os': 'oss',
3991 'pa': 'pan',
3992 'pi': 'pli',
3993 'pl': 'pol',
3994 'ps': 'pus',
3995 'pt': 'por',
3996 'qu': 'que',
3997 'rm': 'roh',
3998 'rn': 'run',
3999 'ro': 'ron',
4000 'ru': 'rus',
4001 'rw': 'kin',
4002 'sa': 'san',
4003 'sc': 'srd',
4004 'sd': 'snd',
4005 'se': 'sme',
4006 'sg': 'sag',
4007 'si': 'sin',
4008 'sk': 'slk',
4009 'sl': 'slv',
4010 'sm': 'smo',
4011 'sn': 'sna',
4012 'so': 'som',
4013 'sq': 'sqi',
4014 'sr': 'srp',
4015 'ss': 'ssw',
4016 'st': 'sot',
4017 'su': 'sun',
4018 'sv': 'swe',
4019 'sw': 'swa',
4020 'ta': 'tam',
4021 'te': 'tel',
4022 'tg': 'tgk',
4023 'th': 'tha',
4024 'ti': 'tir',
4025 'tk': 'tuk',
4026 'tl': 'tgl',
4027 'tn': 'tsn',
4028 'to': 'ton',
4029 'tr': 'tur',
4030 'ts': 'tso',
4031 'tt': 'tat',
4032 'tw': 'twi',
4033 'ty': 'tah',
4034 'ug': 'uig',
4035 'uk': 'ukr',
4036 'ur': 'urd',
4037 'uz': 'uzb',
4038 've': 'ven',
4039 'vi': 'vie',
4040 'vo': 'vol',
4041 'wa': 'wln',
4042 'wo': 'wol',
4043 'xh': 'xho',
4044 'yi': 'yid',
4045 'ji': 'yid', # Replaced by yi in 1989 revision
4046 'yo': 'yor',
4047 'za': 'zha',
4048 'zh': 'zho',
4049 'zu': 'zul',
4050 }
4051
4052 @classmethod
4053 def short2long(cls, code):
4054 """Convert language code from ISO 639-1 to ISO 639-2/T"""
4055 return cls._lang_map.get(code[:2])
4056
4057 @classmethod
4058 def long2short(cls, code):
4059 """Convert language code from ISO 639-2/T to ISO 639-1"""
4060 for short_name, long_name in cls._lang_map.items():
4061 if long_name == code:
4062 return short_name
4063
4064
4065 class ISO3166Utils(object):
4066 # From http://data.okfn.org/data/core/country-list
4067 _country_map = {
4068 'AF': 'Afghanistan',
4069 'AX': 'Åland Islands',
4070 'AL': 'Albania',
4071 'DZ': 'Algeria',
4072 'AS': 'American Samoa',
4073 'AD': 'Andorra',
4074 'AO': 'Angola',
4075 'AI': 'Anguilla',
4076 'AQ': 'Antarctica',
4077 'AG': 'Antigua and Barbuda',
4078 'AR': 'Argentina',
4079 'AM': 'Armenia',
4080 'AW': 'Aruba',
4081 'AU': 'Australia',
4082 'AT': 'Austria',
4083 'AZ': 'Azerbaijan',
4084 'BS': 'Bahamas',
4085 'BH': 'Bahrain',
4086 'BD': 'Bangladesh',
4087 'BB': 'Barbados',
4088 'BY': 'Belarus',
4089 'BE': 'Belgium',
4090 'BZ': 'Belize',
4091 'BJ': 'Benin',
4092 'BM': 'Bermuda',
4093 'BT': 'Bhutan',
4094 'BO': 'Bolivia, Plurinational State of',
4095 'BQ': 'Bonaire, Sint Eustatius and Saba',
4096 'BA': 'Bosnia and Herzegovina',
4097 'BW': 'Botswana',
4098 'BV': 'Bouvet Island',
4099 'BR': 'Brazil',
4100 'IO': 'British Indian Ocean Territory',
4101 'BN': 'Brunei Darussalam',
4102 'BG': 'Bulgaria',
4103 'BF': 'Burkina Faso',
4104 'BI': 'Burundi',
4105 'KH': 'Cambodia',
4106 'CM': 'Cameroon',
4107 'CA': 'Canada',
4108 'CV': 'Cape Verde',
4109 'KY': 'Cayman Islands',
4110 'CF': 'Central African Republic',
4111 'TD': 'Chad',
4112 'CL': 'Chile',
4113 'CN': 'China',
4114 'CX': 'Christmas Island',
4115 'CC': 'Cocos (Keeling) Islands',
4116 'CO': 'Colombia',
4117 'KM': 'Comoros',
4118 'CG': 'Congo',
4119 'CD': 'Congo, the Democratic Republic of the',
4120 'CK': 'Cook Islands',
4121 'CR': 'Costa Rica',
4122 'CI': 'Côte d\'Ivoire',
4123 'HR': 'Croatia',
4124 'CU': 'Cuba',
4125 'CW': 'Curaçao',
4126 'CY': 'Cyprus',
4127 'CZ': 'Czech Republic',
4128 'DK': 'Denmark',
4129 'DJ': 'Djibouti',
4130 'DM': 'Dominica',
4131 'DO': 'Dominican Republic',
4132 'EC': 'Ecuador',
4133 'EG': 'Egypt',
4134 'SV': 'El Salvador',
4135 'GQ': 'Equatorial Guinea',
4136 'ER': 'Eritrea',
4137 'EE': 'Estonia',
4138 'ET': 'Ethiopia',
4139 'FK': 'Falkland Islands (Malvinas)',
4140 'FO': 'Faroe Islands',
4141 'FJ': 'Fiji',
4142 'FI': 'Finland',
4143 'FR': 'France',
4144 'GF': 'French Guiana',
4145 'PF': 'French Polynesia',
4146 'TF': 'French Southern Territories',
4147 'GA': 'Gabon',
4148 'GM': 'Gambia',
4149 'GE': 'Georgia',
4150 'DE': 'Germany',
4151 'GH': 'Ghana',
4152 'GI': 'Gibraltar',
4153 'GR': 'Greece',
4154 'GL': 'Greenland',
4155 'GD': 'Grenada',
4156 'GP': 'Guadeloupe',
4157 'GU': 'Guam',
4158 'GT': 'Guatemala',
4159 'GG': 'Guernsey',
4160 'GN': 'Guinea',
4161 'GW': 'Guinea-Bissau',
4162 'GY': 'Guyana',
4163 'HT': 'Haiti',
4164 'HM': 'Heard Island and McDonald Islands',
4165 'VA': 'Holy See (Vatican City State)',
4166 'HN': 'Honduras',
4167 'HK': 'Hong Kong',
4168 'HU': 'Hungary',
4169 'IS': 'Iceland',
4170 'IN': 'India',
4171 'ID': 'Indonesia',
4172 'IR': 'Iran, Islamic Republic of',
4173 'IQ': 'Iraq',
4174 'IE': 'Ireland',
4175 'IM': 'Isle of Man',
4176 'IL': 'Israel',
4177 'IT': 'Italy',
4178 'JM': 'Jamaica',
4179 'JP': 'Japan',
4180 'JE': 'Jersey',
4181 'JO': 'Jordan',
4182 'KZ': 'Kazakhstan',
4183 'KE': 'Kenya',
4184 'KI': 'Kiribati',
4185 'KP': 'Korea, Democratic People\'s Republic of',
4186 'KR': 'Korea, Republic of',
4187 'KW': 'Kuwait',
4188 'KG': 'Kyrgyzstan',
4189 'LA': 'Lao People\'s Democratic Republic',
4190 'LV': 'Latvia',
4191 'LB': 'Lebanon',
4192 'LS': 'Lesotho',
4193 'LR': 'Liberia',
4194 'LY': 'Libya',
4195 'LI': 'Liechtenstein',
4196 'LT': 'Lithuania',
4197 'LU': 'Luxembourg',
4198 'MO': 'Macao',
4199 'MK': 'Macedonia, the Former Yugoslav Republic of',
4200 'MG': 'Madagascar',
4201 'MW': 'Malawi',
4202 'MY': 'Malaysia',
4203 'MV': 'Maldives',
4204 'ML': 'Mali',
4205 'MT': 'Malta',
4206 'MH': 'Marshall Islands',
4207 'MQ': 'Martinique',
4208 'MR': 'Mauritania',
4209 'MU': 'Mauritius',
4210 'YT': 'Mayotte',
4211 'MX': 'Mexico',
4212 'FM': 'Micronesia, Federated States of',
4213 'MD': 'Moldova, Republic of',
4214 'MC': 'Monaco',
4215 'MN': 'Mongolia',
4216 'ME': 'Montenegro',
4217 'MS': 'Montserrat',
4218 'MA': 'Morocco',
4219 'MZ': 'Mozambique',
4220 'MM': 'Myanmar',
4221 'NA': 'Namibia',
4222 'NR': 'Nauru',
4223 'NP': 'Nepal',
4224 'NL': 'Netherlands',
4225 'NC': 'New Caledonia',
4226 'NZ': 'New Zealand',
4227 'NI': 'Nicaragua',
4228 'NE': 'Niger',
4229 'NG': 'Nigeria',
4230 'NU': 'Niue',
4231 'NF': 'Norfolk Island',
4232 'MP': 'Northern Mariana Islands',
4233 'NO': 'Norway',
4234 'OM': 'Oman',
4235 'PK': 'Pakistan',
4236 'PW': 'Palau',
4237 'PS': 'Palestine, State of',
4238 'PA': 'Panama',
4239 'PG': 'Papua New Guinea',
4240 'PY': 'Paraguay',
4241 'PE': 'Peru',
4242 'PH': 'Philippines',
4243 'PN': 'Pitcairn',
4244 'PL': 'Poland',
4245 'PT': 'Portugal',
4246 'PR': 'Puerto Rico',
4247 'QA': 'Qatar',
4248 'RE': 'Réunion',
4249 'RO': 'Romania',
4250 'RU': 'Russian Federation',
4251 'RW': 'Rwanda',
4252 'BL': 'Saint Barthélemy',
4253 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
4254 'KN': 'Saint Kitts and Nevis',
4255 'LC': 'Saint Lucia',
4256 'MF': 'Saint Martin (French part)',
4257 'PM': 'Saint Pierre and Miquelon',
4258 'VC': 'Saint Vincent and the Grenadines',
4259 'WS': 'Samoa',
4260 'SM': 'San Marino',
4261 'ST': 'Sao Tome and Principe',
4262 'SA': 'Saudi Arabia',
4263 'SN': 'Senegal',
4264 'RS': 'Serbia',
4265 'SC': 'Seychelles',
4266 'SL': 'Sierra Leone',
4267 'SG': 'Singapore',
4268 'SX': 'Sint Maarten (Dutch part)',
4269 'SK': 'Slovakia',
4270 'SI': 'Slovenia',
4271 'SB': 'Solomon Islands',
4272 'SO': 'Somalia',
4273 'ZA': 'South Africa',
4274 'GS': 'South Georgia and the South Sandwich Islands',
4275 'SS': 'South Sudan',
4276 'ES': 'Spain',
4277 'LK': 'Sri Lanka',
4278 'SD': 'Sudan',
4279 'SR': 'Suriname',
4280 'SJ': 'Svalbard and Jan Mayen',
4281 'SZ': 'Swaziland',
4282 'SE': 'Sweden',
4283 'CH': 'Switzerland',
4284 'SY': 'Syrian Arab Republic',
4285 'TW': 'Taiwan, Province of China',
4286 'TJ': 'Tajikistan',
4287 'TZ': 'Tanzania, United Republic of',
4288 'TH': 'Thailand',
4289 'TL': 'Timor-Leste',
4290 'TG': 'Togo',
4291 'TK': 'Tokelau',
4292 'TO': 'Tonga',
4293 'TT': 'Trinidad and Tobago',
4294 'TN': 'Tunisia',
4295 'TR': 'Turkey',
4296 'TM': 'Turkmenistan',
4297 'TC': 'Turks and Caicos Islands',
4298 'TV': 'Tuvalu',
4299 'UG': 'Uganda',
4300 'UA': 'Ukraine',
4301 'AE': 'United Arab Emirates',
4302 'GB': 'United Kingdom',
4303 'US': 'United States',
4304 'UM': 'United States Minor Outlying Islands',
4305 'UY': 'Uruguay',
4306 'UZ': 'Uzbekistan',
4307 'VU': 'Vanuatu',
4308 'VE': 'Venezuela, Bolivarian Republic of',
4309 'VN': 'Viet Nam',
4310 'VG': 'Virgin Islands, British',
4311 'VI': 'Virgin Islands, U.S.',
4312 'WF': 'Wallis and Futuna',
4313 'EH': 'Western Sahara',
4314 'YE': 'Yemen',
4315 'ZM': 'Zambia',
4316 'ZW': 'Zimbabwe',
4317 }
4318
4319 @classmethod
4320 def short2full(cls, code):
4321 """Convert an ISO 3166-2 country code to the corresponding full name"""
4322 return cls._country_map.get(code.upper())
4323
4324
4325 class GeoUtils(object):
4326 # Major IPv4 address blocks per country
4327 _country_ip_map = {
4328 'AD': '46.172.224.0/19',
4329 'AE': '94.200.0.0/13',
4330 'AF': '149.54.0.0/17',
4331 'AG': '209.59.64.0/18',
4332 'AI': '204.14.248.0/21',
4333 'AL': '46.99.0.0/16',
4334 'AM': '46.70.0.0/15',
4335 'AO': '105.168.0.0/13',
4336 'AP': '182.50.184.0/21',
4337 'AQ': '23.154.160.0/24',
4338 'AR': '181.0.0.0/12',
4339 'AS': '202.70.112.0/20',
4340 'AT': '77.116.0.0/14',
4341 'AU': '1.128.0.0/11',
4342 'AW': '181.41.0.0/18',
4343 'AX': '185.217.4.0/22',
4344 'AZ': '5.197.0.0/16',
4345 'BA': '31.176.128.0/17',
4346 'BB': '65.48.128.0/17',
4347 'BD': '114.130.0.0/16',
4348 'BE': '57.0.0.0/8',
4349 'BF': '102.178.0.0/15',
4350 'BG': '95.42.0.0/15',
4351 'BH': '37.131.0.0/17',
4352 'BI': '154.117.192.0/18',
4353 'BJ': '137.255.0.0/16',
4354 'BL': '185.212.72.0/23',
4355 'BM': '196.12.64.0/18',
4356 'BN': '156.31.0.0/16',
4357 'BO': '161.56.0.0/16',
4358 'BQ': '161.0.80.0/20',
4359 'BR': '191.128.0.0/12',
4360 'BS': '24.51.64.0/18',
4361 'BT': '119.2.96.0/19',
4362 'BW': '168.167.0.0/16',
4363 'BY': '178.120.0.0/13',
4364 'BZ': '179.42.192.0/18',
4365 'CA': '99.224.0.0/11',
4366 'CD': '41.243.0.0/16',
4367 'CF': '197.242.176.0/21',
4368 'CG': '160.113.0.0/16',
4369 'CH': '85.0.0.0/13',
4370 'CI': '102.136.0.0/14',
4371 'CK': '202.65.32.0/19',
4372 'CL': '152.172.0.0/14',
4373 'CM': '102.244.0.0/14',
4374 'CN': '36.128.0.0/10',
4375 'CO': '181.240.0.0/12',
4376 'CR': '201.192.0.0/12',
4377 'CU': '152.206.0.0/15',
4378 'CV': '165.90.96.0/19',
4379 'CW': '190.88.128.0/17',
4380 'CY': '31.153.0.0/16',
4381 'CZ': '88.100.0.0/14',
4382 'DE': '53.0.0.0/8',
4383 'DJ': '197.241.0.0/17',
4384 'DK': '87.48.0.0/12',
4385 'DM': '192.243.48.0/20',
4386 'DO': '152.166.0.0/15',
4387 'DZ': '41.96.0.0/12',
4388 'EC': '186.68.0.0/15',
4389 'EE': '90.190.0.0/15',
4390 'EG': '156.160.0.0/11',
4391 'ER': '196.200.96.0/20',
4392 'ES': '88.0.0.0/11',
4393 'ET': '196.188.0.0/14',
4394 'EU': '2.16.0.0/13',
4395 'FI': '91.152.0.0/13',
4396 'FJ': '144.120.0.0/16',
4397 'FK': '80.73.208.0/21',
4398 'FM': '119.252.112.0/20',
4399 'FO': '88.85.32.0/19',
4400 'FR': '90.0.0.0/9',
4401 'GA': '41.158.0.0/15',
4402 'GB': '25.0.0.0/8',
4403 'GD': '74.122.88.0/21',
4404 'GE': '31.146.0.0/16',
4405 'GF': '161.22.64.0/18',
4406 'GG': '62.68.160.0/19',
4407 'GH': '154.160.0.0/12',
4408 'GI': '95.164.0.0/16',
4409 'GL': '88.83.0.0/19',
4410 'GM': '160.182.0.0/15',
4411 'GN': '197.149.192.0/18',
4412 'GP': '104.250.0.0/19',
4413 'GQ': '105.235.224.0/20',
4414 'GR': '94.64.0.0/13',
4415 'GT': '168.234.0.0/16',
4416 'GU': '168.123.0.0/16',
4417 'GW': '197.214.80.0/20',
4418 'GY': '181.41.64.0/18',
4419 'HK': '113.252.0.0/14',
4420 'HN': '181.210.0.0/16',
4421 'HR': '93.136.0.0/13',
4422 'HT': '148.102.128.0/17',
4423 'HU': '84.0.0.0/14',
4424 'ID': '39.192.0.0/10',
4425 'IE': '87.32.0.0/12',
4426 'IL': '79.176.0.0/13',
4427 'IM': '5.62.80.0/20',
4428 'IN': '117.192.0.0/10',
4429 'IO': '203.83.48.0/21',
4430 'IQ': '37.236.0.0/14',
4431 'IR': '2.176.0.0/12',
4432 'IS': '82.221.0.0/16',
4433 'IT': '79.0.0.0/10',
4434 'JE': '87.244.64.0/18',
4435 'JM': '72.27.0.0/17',
4436 'JO': '176.29.0.0/16',
4437 'JP': '133.0.0.0/8',
4438 'KE': '105.48.0.0/12',
4439 'KG': '158.181.128.0/17',
4440 'KH': '36.37.128.0/17',
4441 'KI': '103.25.140.0/22',
4442 'KM': '197.255.224.0/20',
4443 'KN': '198.167.192.0/19',
4444 'KP': '175.45.176.0/22',
4445 'KR': '175.192.0.0/10',
4446 'KW': '37.36.0.0/14',
4447 'KY': '64.96.0.0/15',
4448 'KZ': '2.72.0.0/13',
4449 'LA': '115.84.64.0/18',
4450 'LB': '178.135.0.0/16',
4451 'LC': '24.92.144.0/20',
4452 'LI': '82.117.0.0/19',
4453 'LK': '112.134.0.0/15',
4454 'LR': '102.183.0.0/16',
4455 'LS': '129.232.0.0/17',
4456 'LT': '78.56.0.0/13',
4457 'LU': '188.42.0.0/16',
4458 'LV': '46.109.0.0/16',
4459 'LY': '41.252.0.0/14',
4460 'MA': '105.128.0.0/11',
4461 'MC': '88.209.64.0/18',
4462 'MD': '37.246.0.0/16',
4463 'ME': '178.175.0.0/17',
4464 'MF': '74.112.232.0/21',
4465 'MG': '154.126.0.0/17',
4466 'MH': '117.103.88.0/21',
4467 'MK': '77.28.0.0/15',
4468 'ML': '154.118.128.0/18',
4469 'MM': '37.111.0.0/17',
4470 'MN': '49.0.128.0/17',
4471 'MO': '60.246.0.0/16',
4472 'MP': '202.88.64.0/20',
4473 'MQ': '109.203.224.0/19',
4474 'MR': '41.188.64.0/18',
4475 'MS': '208.90.112.0/22',
4476 'MT': '46.11.0.0/16',
4477 'MU': '105.16.0.0/12',
4478 'MV': '27.114.128.0/18',
4479 'MW': '102.70.0.0/15',
4480 'MX': '187.192.0.0/11',
4481 'MY': '175.136.0.0/13',
4482 'MZ': '197.218.0.0/15',
4483 'NA': '41.182.0.0/16',
4484 'NC': '101.101.0.0/18',
4485 'NE': '197.214.0.0/18',
4486 'NF': '203.17.240.0/22',
4487 'NG': '105.112.0.0/12',
4488 'NI': '186.76.0.0/15',
4489 'NL': '145.96.0.0/11',
4490 'NO': '84.208.0.0/13',
4491 'NP': '36.252.0.0/15',
4492 'NR': '203.98.224.0/19',
4493 'NU': '49.156.48.0/22',
4494 'NZ': '49.224.0.0/14',
4495 'OM': '5.36.0.0/15',
4496 'PA': '186.72.0.0/15',
4497 'PE': '186.160.0.0/14',
4498 'PF': '123.50.64.0/18',
4499 'PG': '124.240.192.0/19',
4500 'PH': '49.144.0.0/13',
4501 'PK': '39.32.0.0/11',
4502 'PL': '83.0.0.0/11',
4503 'PM': '70.36.0.0/20',
4504 'PR': '66.50.0.0/16',
4505 'PS': '188.161.0.0/16',
4506 'PT': '85.240.0.0/13',
4507 'PW': '202.124.224.0/20',
4508 'PY': '181.120.0.0/14',
4509 'QA': '37.210.0.0/15',
4510 'RE': '102.35.0.0/16',
4511 'RO': '79.112.0.0/13',
4512 'RS': '93.86.0.0/15',
4513 'RU': '5.136.0.0/13',
4514 'RW': '41.186.0.0/16',
4515 'SA': '188.48.0.0/13',
4516 'SB': '202.1.160.0/19',
4517 'SC': '154.192.0.0/11',
4518 'SD': '102.120.0.0/13',
4519 'SE': '78.64.0.0/12',
4520 'SG': '8.128.0.0/10',
4521 'SI': '188.196.0.0/14',
4522 'SK': '78.98.0.0/15',
4523 'SL': '102.143.0.0/17',
4524 'SM': '89.186.32.0/19',
4525 'SN': '41.82.0.0/15',
4526 'SO': '154.115.192.0/18',
4527 'SR': '186.179.128.0/17',
4528 'SS': '105.235.208.0/21',
4529 'ST': '197.159.160.0/19',
4530 'SV': '168.243.0.0/16',
4531 'SX': '190.102.0.0/20',
4532 'SY': '5.0.0.0/16',
4533 'SZ': '41.84.224.0/19',
4534 'TC': '65.255.48.0/20',
4535 'TD': '154.68.128.0/19',
4536 'TG': '196.168.0.0/14',
4537 'TH': '171.96.0.0/13',
4538 'TJ': '85.9.128.0/18',
4539 'TK': '27.96.24.0/21',
4540 'TL': '180.189.160.0/20',
4541 'TM': '95.85.96.0/19',
4542 'TN': '197.0.0.0/11',
4543 'TO': '175.176.144.0/21',
4544 'TR': '78.160.0.0/11',
4545 'TT': '186.44.0.0/15',
4546 'TV': '202.2.96.0/19',
4547 'TW': '120.96.0.0/11',
4548 'TZ': '156.156.0.0/14',
4549 'UA': '37.52.0.0/14',
4550 'UG': '102.80.0.0/13',
4551 'US': '6.0.0.0/8',
4552 'UY': '167.56.0.0/13',
4553 'UZ': '84.54.64.0/18',
4554 'VA': '212.77.0.0/19',
4555 'VC': '207.191.240.0/21',
4556 'VE': '186.88.0.0/13',
4557 'VG': '66.81.192.0/20',
4558 'VI': '146.226.0.0/16',
4559 'VN': '14.160.0.0/11',
4560 'VU': '202.80.32.0/20',
4561 'WF': '117.20.32.0/21',
4562 'WS': '202.4.32.0/19',
4563 'YE': '134.35.0.0/16',
4564 'YT': '41.242.116.0/22',
4565 'ZA': '41.0.0.0/11',
4566 'ZM': '102.144.0.0/13',
4567 'ZW': '102.177.192.0/18',
4568 }
4569
4570 @classmethod
4571 def random_ipv4(cls, code_or_block):
4572 if len(code_or_block) == 2:
4573 block = cls._country_ip_map.get(code_or_block.upper())
4574 if not block:
4575 return None
4576 else:
4577 block = code_or_block
4578 addr, preflen = block.split('/')
4579 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
4580 addr_max = addr_min | (0xffffffff >> int(preflen))
4581 return compat_str(socket.inet_ntoa(
4582 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
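# e.g. GeoUtils.random_ipv4('DE') picks a random address from the 'DE' block above,
# while GeoUtils.random_ipv4('198.51.100.0/24') uses the given block directly
# (two-letter input is treated as a country code, anything else as a CIDR block).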
4583
4584
4585 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
4586 def __init__(self, proxies=None):
4587 # Set default handlers
4588 for type in ('http', 'https'):
4589 setattr(self, '%s_open' % type,
4590 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
4591 meth(r, proxy, type))
4592 compat_urllib_request.ProxyHandler.__init__(self, proxies)
4593
4594 def proxy_open(self, req, proxy, type):
4595 req_proxy = req.headers.get('Ytdl-request-proxy')
4596 if req_proxy is not None:
4597 proxy = req_proxy
4598 del req.headers['Ytdl-request-proxy']
4599
4600 if proxy == '__noproxy__':
4601 return None # No Proxy
4602 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
4603 req.add_header('Ytdl-socks-proxy', proxy)
4604 # yt-dlp's http/https handlers take care of wrapping the socket with SOCKS
4605 return None
4606 return compat_urllib_request.ProxyHandler.proxy_open(
4607 self, req, proxy, type)
4608
4609
4610 # Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
4611 # released into the public domain
4612 # https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
4613
4614 def long_to_bytes(n, blocksize=0):
4615 """long_to_bytes(n:long, blocksize:int) : string
4616 Convert a long integer to a byte string.
4617
4618 If optional blocksize is given and greater than zero, pad the front of the
4619 byte string with binary zeros so that the length is a multiple of
4620 blocksize.
4621 """
4622 # after much testing, this algorithm was deemed to be the fastest
4623 s = b''
4624 n = int(n)
4625 while n > 0:
4626 s = compat_struct_pack('>I', n & 0xffffffff) + s
4627 n = n >> 32
4628 # strip off leading zeros
4629 for i in range(len(s)):
4630 if s[i] != b'\000'[0]:
4631 break
4632 else:
4633 # only happens when n == 0
4634 s = b'\000'
4635 i = 0
4636 s = s[i:]
4637 # add back some pad bytes. this could be done more efficiently w.r.t. the
4638 # de-padding being done above, but sigh...
4639 if blocksize > 0 and len(s) % blocksize:
4640 s = (blocksize - len(s) % blocksize) * b'\000' + s
4641 return s
4642
4643
4644 def bytes_to_long(s):
4645 """bytes_to_long(string) : long
4646 Convert a byte string to a long integer.
4647
4648 This is (essentially) the inverse of long_to_bytes().
4649 """
4650 acc = 0
4651 length = len(s)
4652 if length % 4:
4653 extra = (4 - length % 4)
4654 s = b'\000' * extra + s
4655 length = length + extra
4656 for i in range(0, length, 4):
4657 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
4658 return acc
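# Round-trip sketch:
#   long_to_bytes(65537)            # => b'\x01\x00\x01'
#   long_to_bytes(65537, 8)         # => b'\x00\x00\x00\x00\x00\x01\x00\x01'
#   bytes_to_long(b'\x01\x00\x01')  # => 65537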
4659
4660
4661 def ohdave_rsa_encrypt(data, exponent, modulus):
4662 '''
4663 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
4664
4665 Input:
4666 data: data to encrypt, bytes-like object
4667 exponent, modulus: parameter e and N of RSA algorithm, both integer
4668 Output: hex string of encrypted data
4669
4670 Limitation: supports one block encryption only
4671 '''
4672
4673 payload = int(binascii.hexlify(data[::-1]), 16)
4674 encrypted = pow(payload, exponent, modulus)
4675 return '%x' % encrypted
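# Toy-sized sketch (real callers pass the site-provided e and N):
#   ohdave_rsa_encrypt(b'\x02', 3, 101)  # payload 2, pow(2, 3, 101) == 8 => '8'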
4676
4677
4678 def pkcs1pad(data, length):
4679 """
4680 Padding input data with PKCS#1 scheme
4681
4682 @param {int[]} data input data
4683 @param {int} length target length
4684 @returns {int[]} padded data
4685 """
4686 if len(data) > length - 11:
4687 raise ValueError('Input data too long for PKCS#1 padding')
4688
4689 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)] # PKCS#1 requires the padding octets to be non-zero
4690 return [0, 2] + pseudo_random + [0] + data
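# Resulting layout (PKCS#1 v1.5 block type 2), e.g. for length=16 and 3 data bytes:
#   pkcs1pad([1, 2, 3], 16)  # => [0, 2, <10 random non-zero octets>, 0, 1, 2, 3]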
4691
4692
4693 def encode_base_n(num, n, table=None):
4694 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
4695 if not table:
4696 table = FULL_TABLE[:n]
4697
4698 if n > len(table):
4699 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
4700
4701 if num == 0:
4702 return table[0]
4703
4704 ret = ''
4705 while num:
4706 ret = table[num % n] + ret
4707 num = num // n
4708 return ret
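# e.g. encode_base_n(255, 16)  # => 'ff'
#      encode_base_n(9, 2)     # => '1001'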
4709
4710
4711 def decode_packed_codes(code):
4712 mobj = re.search(PACKED_CODES_RE, code)
4713 obfuscated_code, base, count, symbols = mobj.groups()
4714 base = int(base)
4715 count = int(count)
4716 symbols = symbols.split('|')
4717 symbol_table = {}
4718
4719 while count:
4720 count -= 1
4721 base_n_count = encode_base_n(count, base)
4722 symbol_table[base_n_count] = symbols[count] or base_n_count
4723
4724 return re.sub(
4725 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
4726 obfuscated_code)
4727
4728
4729 def caesar(s, alphabet, shift):
4730 if shift == 0:
4731 return s
4732 l = len(alphabet)
4733 return ''.join(
4734 alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
4735 for c in s)
4736
4737
4738 def rot47(s):
4739 return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
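# e.g. caesar('ab', 'abc', 1)  # => 'bc' (characters outside the alphabet pass through)
# rot47 shifts within the printable ASCII range 33-126 and is its own inverse:
#   rot47(rot47(s)) == s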
4740
4741
4742 def parse_m3u8_attributes(attrib):
4743 info = {}
4744 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
4745 if val.startswith('"'):
4746 val = val[1:-1]
4747 info[key] = val
4748 return info
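# e.g. parse_m3u8_attributes('BANDWIDTH=2560000,CODECS="avc1.4d401f,mp4a.40.2"')
#   # => {'BANDWIDTH': '2560000', 'CODECS': 'avc1.4d401f,mp4a.40.2'}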
4749
4750
4751 def urshift(val, n):
4752 return val >> n if val >= 0 else (val + 0x100000000) >> n
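# Emulates JavaScript's unsigned right shift '>>>' for 32-bit values,
# e.g. urshift(-1, 28)  # => 15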
4753
4754
4755 # Based on png2str() written by @gdkchan and improved by @yokrysty
4756 # Originally posted at https://github.com/ytdl-org/youtube-dl/issues/9706
4757 def decode_png(png_data):
4758 # Reference: https://www.w3.org/TR/PNG/
4759 header = png_data[8:]
4760
4761 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
4762 raise IOError('Not a valid PNG file.')
4763
4764 int_map = {1: '>B', 2: '>H', 4: '>I'}
4765 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
4766
4767 chunks = []
4768
4769 while header:
4770 length = unpack_integer(header[:4])
4771 header = header[4:]
4772
4773 chunk_type = header[:4]
4774 header = header[4:]
4775
4776 chunk_data = header[:length]
4777 header = header[length:]
4778
4779 header = header[4:] # Skip CRC
4780
4781 chunks.append({
4782 'type': chunk_type,
4783 'length': length,
4784 'data': chunk_data
4785 })
4786
4787 ihdr = chunks[0]['data']
4788
4789 width = unpack_integer(ihdr[:4])
4790 height = unpack_integer(ihdr[4:8])
4791
4792 idat = b''
4793
4794 for chunk in chunks:
4795 if chunk['type'] == b'IDAT':
4796 idat += chunk['data']
4797
4798 if not idat:
4799 raise IOError('Unable to read PNG data.')
4800
4801 decompressed_data = bytearray(zlib.decompress(idat))
4802
4803 stride = width * 3
4804 pixels = []
4805
4806 def _get_pixel(idx):
4807 x = idx % stride
4808 y = idx // stride
4809 return pixels[y][x]
4810
4811 for y in range(height):
4812 basePos = y * (1 + stride)
4813 filter_type = decompressed_data[basePos]
4814
4815 current_row = []
4816
4817 pixels.append(current_row)
4818
4819 for x in range(stride):
4820 color = decompressed_data[1 + basePos + x]
4821 basex = y * stride + x
4822 left = 0
4823 up = 0
4824
4825 if x > 2:
4826 left = _get_pixel(basex - 3)
4827 if y > 0:
4828 up = _get_pixel(basex - stride)
4829
4830 if filter_type == 1: # Sub
4831 color = (color + left) & 0xff
4832 elif filter_type == 2: # Up
4833 color = (color + up) & 0xff
4834 elif filter_type == 3: # Average
4835 color = (color + ((left + up) >> 1)) & 0xff
4836 elif filter_type == 4: # Paeth
4837 a = left
4838 b = up
4839 c = 0
4840
4841 if x > 2 and y > 0:
4842 c = _get_pixel(basex - stride - 3)
4843
4844 p = a + b - c
4845
4846 pa = abs(p - a)
4847 pb = abs(p - b)
4848 pc = abs(p - c)
4849
4850 if pa <= pb and pa <= pc:
4851 color = (color + a) & 0xff
4852 elif pb <= pc:
4853 color = (color + b) & 0xff
4854 else:
4855 color = (color + c) & 0xff
4856
4857 current_row.append(color)
4858
4859 return width, height, pixels
4860
4861
4862 def write_xattr(path, key, value):
4863 # This mess below finds the best xattr tool for the job
4864 try:
4865 # try the pyxattr module...
4866 import xattr
4867
4868 if hasattr(xattr, 'set'): # pyxattr
4869 # Unicode arguments are not supported in python-pyxattr until
4870 # version 0.5.0
4871 # See https://github.com/ytdl-org/youtube-dl/issues/5498
4872 pyxattr_required_version = '0.5.0'
4873 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
4874 # TODO: fallback to CLI tools
4875 raise XAttrUnavailableError(
4876 'python-pyxattr is detected but is too old. '
4877 'yt-dlp requires %s or above while your version is %s. '
4878 'Falling back to other xattr implementations' % (
4879 pyxattr_required_version, xattr.__version__))
4880
4881 setxattr = xattr.set
4882 else: # xattr
4883 setxattr = xattr.setxattr
4884
4885 try:
4886 setxattr(path, key, value)
4887 except EnvironmentError as e:
4888 raise XAttrMetadataError(e.errno, e.strerror)
4889
4890 except ImportError:
4891 if compat_os_name == 'nt':
4892 # Write xattrs to NTFS Alternate Data Streams:
4893 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
4894 assert ':' not in key
4895 assert os.path.exists(path)
4896
4897 ads_fn = path + ':' + key
4898 try:
4899 with open(ads_fn, 'wb') as f:
4900 f.write(value)
4901 except EnvironmentError as e:
4902 raise XAttrMetadataError(e.errno, e.strerror)
4903 else:
4904 user_has_setfattr = check_executable('setfattr', ['--version'])
4905 user_has_xattr = check_executable('xattr', ['-h'])
4906
4907 if user_has_setfattr or user_has_xattr:
4908
4909 value = value.decode('utf-8')
4910 if user_has_setfattr:
4911 executable = 'setfattr'
4912 opts = ['-n', key, '-v', value]
4913 elif user_has_xattr:
4914 executable = 'xattr'
4915 opts = ['-w', key, value]
4916
4917 cmd = ([encodeFilename(executable, True)]
4918 + [encodeArgument(o) for o in opts]
4919 + [encodeFilename(path, True)])
4920
4921 try:
4922 p = Popen(
4923 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
4924 except EnvironmentError as e:
4925 raise XAttrMetadataError(e.errno, e.strerror)
4926 stdout, stderr = p.communicate_or_kill()
4927 stderr = stderr.decode('utf-8', 'replace')
4928 if p.returncode != 0:
4929 raise XAttrMetadataError(p.returncode, stderr)
4930
4931 else:
4932 # On Unix, and can't find pyxattr, setfattr, or xattr.
4933 if sys.platform.startswith('linux'):
4934 raise XAttrUnavailableError(
4935 "Couldn't find a tool to set the xattrs. "
4936 "Install either the python 'pyxattr' or 'xattr' "
4937 "modules, or the GNU 'attr' package "
4938 "(which contains the 'setfattr' tool).")
4939 else:
4940 raise XAttrUnavailableError(
4941 "Couldn't find a tool to set the xattrs. "
4942 "Install either the python 'xattr' module, "
4943 "or the 'xattr' binary.")
4944
4945
4946 def random_birthday(year_field, month_field, day_field):
4947 start_date = datetime.date(1950, 1, 1)
4948 end_date = datetime.date(1995, 12, 31)
4949 offset = random.randint(0, (end_date - start_date).days)
4950 random_date = start_date + datetime.timedelta(offset)
4951 return {
4952 year_field: str(random_date.year),
4953 month_field: str(random_date.month),
4954 day_field: str(random_date.day),
4955 }
4956
4957
4958 # Templates for internet shortcut files, which are plain text files.
4959 DOT_URL_LINK_TEMPLATE = '''
4960 [InternetShortcut]
4961 URL=%(url)s
4962 '''.lstrip()
4963
4964 DOT_WEBLOC_LINK_TEMPLATE = '''
4965 <?xml version="1.0" encoding="UTF-8"?>
4966 <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
4967 <plist version="1.0">
4968 <dict>
4969 \t<key>URL</key>
4970 \t<string>%(url)s</string>
4971 </dict>
4972 </plist>
4973 '''.lstrip()
4974
4975 DOT_DESKTOP_LINK_TEMPLATE = '''
4976 [Desktop Entry]
4977 Encoding=UTF-8
4978 Name=%(filename)s
4979 Type=Link
4980 URL=%(url)s
4981 Icon=text-html
4982 '''.lstrip()
4983
4984 LINK_TEMPLATES = {
4985 'url': DOT_URL_LINK_TEMPLATE,
4986 'desktop': DOT_DESKTOP_LINK_TEMPLATE,
4987 'webloc': DOT_WEBLOC_LINK_TEMPLATE,
4988 }
4989
4990
4991 def iri_to_uri(iri):
4992 """
4993 Converts an IRI (Internationalized Resource Identifier, allowing Unicode characters) to a URI (Uniform Resource Identifier, ASCII-only).
4994
4995 The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact.
4996 """
4997
4998 iri_parts = compat_urllib_parse_urlparse(iri)
4999
5000 if '[' in iri_parts.netloc:
5001 raise ValueError('IPv6 URIs are not yet supported.')
5002 # Querying `.netloc` also raises a ValueError when there is only one bracket.
5003
5004 # The `safe` argument values, that the following code uses, contain the characters that should not be percent-encoded. Everything else but letters, digits and '_.-' will be percent-encoded with an underlying UTF-8 encoding. Everything already percent-encoded will be left as is.
5005
5006 net_location = ''
5007 if iri_parts.username:
5008 net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~")
5009 if iri_parts.password is not None:
5010 net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~")
5011 net_location += '@'
5012
5013 net_location += iri_parts.hostname.encode('idna').decode('utf-8') # Punycode for Unicode hostnames.
5014 # The 'idna' encoding produces ASCII text.
5015 if iri_parts.port is not None and iri_parts.port != 80:
5016 net_location += ':' + str(iri_parts.port)
5017
5018 return compat_urllib_parse_urlunparse(
5019 (iri_parts.scheme,
5020 net_location,
5021
5022 compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"),
5023
5024 # Unsure about the `safe` argument, since this is a legacy way of handling parameters.
5025 compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"),
5026
5027 # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component.
5028 compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"),
5029
5030 compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~")))
5031
5032 # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes.
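# Illustrative conversion:
#   iri_to_uri('http://example.com/ä?q=ö')  # => 'http://example.com/%C3%A4?q=%C3%B6'
# Already-escaped sequences such as %3C stay untouched since '%' is in every `safe` set.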
5033
5034
5035 def to_high_limit_path(path):
5036 if sys.platform in ['win32', 'cygwin']:
5037 # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited.
5038 return r'\\?\ '.rstrip() + os.path.abspath(path)
5039
5040 return path
5041
5042
5043 def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None):
5044 val = traverse_obj(obj, *variadic(field))
5045 if val in ignore:
5046 return default
5047 return template % (func(val) if func else val)
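# e.g. format_field({'width': 1080}, 'width', '%dpx')        # => '1080px'
#      format_field({}, 'width', '%dpx', default='unknown')  # => 'unknown'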
5048
5049
5050 def clean_podcast_url(url):
5051 return re.sub(r'''(?x)
5052 (?:
5053 (?:
5054 chtbl\.com/track|
5055 media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/
5056 play\.podtrac\.com
5057 )/[^/]+|
5058 (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure
5059 flex\.acast\.com|
5060 pd(?:
5061 cn\.co| # https://podcorn.com/analytics-prefix/
5062 st\.fm # https://podsights.com/docs/
5063 )/e
5064 )/''', '', url)
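# e.g. clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/traffic.megaphone.fm/ep.mp3')
#   # => 'https://traffic.megaphone.fm/ep.mp3' (hypothetical URL, for illustration)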
5065
5066
5067 _HEX_TABLE = '0123456789abcdef'
5068
5069
5070 def random_uuidv4():
5071 return re.sub(r'[xy]', lambda x: _HEX_TABLE[random.randint(0, 15)], 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx')
5072
5073
5074 def make_dir(path, to_screen=None):
5075 try:
5076 dn = os.path.dirname(path)
5077 if dn and not os.path.exists(dn):
5078 os.makedirs(dn)
5079 return True
5080 except (OSError, IOError) as err:
5081 if callable(to_screen):
5082 to_screen('unable to create directory ' + error_to_compat_str(err))
5083 return False
5084
5085
5086 def get_executable_path():
5087 from zipimport import zipimporter
5088 if hasattr(sys, 'frozen'): # Running from PyInstaller
5089 path = os.path.dirname(sys.executable)
5090 elif isinstance(globals().get('__loader__'), zipimporter): # Running from ZIP
5091 path = os.path.join(os.path.dirname(__file__), '../..')
5092 else:
5093 path = os.path.join(os.path.dirname(__file__), '..')
5094 return os.path.abspath(path)
5095
5096
5097 def load_plugins(name, suffix, namespace):
5098 classes = {}
5099 try:
5100 plugins_spec = importlib.util.spec_from_file_location(
5101 name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py'))
5102 plugins = importlib.util.module_from_spec(plugins_spec)
5103 sys.modules[plugins_spec.name] = plugins
5104 plugins_spec.loader.exec_module(plugins)
5105 for name in dir(plugins):
5106 if name in namespace:
5107 continue
5108 if not name.endswith(suffix):
5109 continue
5110 klass = getattr(plugins, name)
5111 classes[name] = namespace[name] = klass
5112 except FileNotFoundError:
5113 pass
5114 return classes
5115
5116
5117 def traverse_obj(
5118 obj, *path_list, default=None, expected_type=None, get_all=True,
5119 casesense=True, is_user_input=False, traverse_string=False):
5120 ''' Traverse nested list/dict/tuple
5121 @param path_list A list of paths which are checked one by one.
5122 Each path is a list of keys where each key is a string,
5123 a function, a tuple of strings/None or "...".
5124 When a function is given, it takes the key as an argument and
5125 returns whether the key matches or not. When a tuple is given,
5126 all the keys given in the tuple are traversed.
5127 "..." traverses all the keys in the object and
5128 "None" returns the object without traversal.
5129 @param default Default value to return
5130 @param expected_type Only accept final value of this type (Can also be any callable)
5131 @param get_all Return all the values obtained from a path or only the first one
5132 @param casesense Whether to consider dictionary keys as case sensitive
5133 @param is_user_input Whether the keys are generated from user input. If True,
5134 strings are converted to int/slice if necessary
5135 @param traverse_string Whether to traverse inside strings. If True, any
5136 non-compatible object will also be converted into a string
5137 # TODO: Write tests
5138 '''
5139 if not casesense:
5140 _lower = lambda k: (k.lower() if isinstance(k, str) else k)
5141 path_list = (map(_lower, variadic(path)) for path in path_list)
5142
5143 def _traverse_obj(obj, path, _current_depth=0):
5144 nonlocal depth
5145 path = tuple(variadic(path))
5146 for i, key in enumerate(path):
5147 if None in (key, obj):
5148 return obj
5149 if isinstance(key, (list, tuple)):
5150 obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key]
5151 key = ...
5152 if key is ...:
5153 obj = (obj.values() if isinstance(obj, dict)
5154 else obj if isinstance(obj, (list, tuple, LazyList))
5155 else str(obj) if traverse_string else [])
5156 _current_depth += 1
5157 depth = max(depth, _current_depth)
5158 return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj]
5159 elif callable(key):
5160 if isinstance(obj, (list, tuple, LazyList)):
5161 obj = enumerate(obj)
5162 elif isinstance(obj, dict):
5163 obj = obj.items()
5164 else:
5165 if not traverse_string:
5166 return None
5167 obj = str(obj)
5168 _current_depth += 1
5169 depth = max(depth, _current_depth)
5170 return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if key(k)]
5171 elif isinstance(obj, dict) and not (is_user_input and key == ':'):
5172 obj = (obj.get(key) if casesense or (key in obj)
5173 else next((v for k, v in obj.items() if _lower(k) == key), None))
5174 else:
5175 if is_user_input:
5176 key = (int_or_none(key) if ':' not in key
5177 else slice(*map(int_or_none, key.split(':'))))
5178 if key == slice(None):
5179 return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth)
5180 if not isinstance(key, (int, slice)):
5181 return None
5182 if not isinstance(obj, (list, tuple, LazyList)):
5183 if not traverse_string:
5184 return None
5185 obj = str(obj)
5186 try:
5187 obj = obj[key]
5188 except IndexError:
5189 return None
5190 return obj
5191
5192 if isinstance(expected_type, type):
5193 type_test = lambda val: val if isinstance(val, expected_type) else None
5194 elif expected_type is not None:
5195 type_test = expected_type
5196 else:
5197 type_test = lambda val: val
5198
5199 for path in path_list:
5200 depth = 0
5201 val = _traverse_obj(obj, path)
5202 if val is not None:
5203 if depth:
5204 for _ in range(depth - 1):
5205 val = itertools.chain.from_iterable(v for v in val if v is not None)
5206 val = [v for v in map(type_test, val) if v is not None]
5207 if val:
5208 return val if get_all else val[0]
5209 else:
5210 val = type_test(val)
5211 if val is not None:
5212 return val
5213 return default
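# Traversal sketch (hypothetical data):
#   d = {'x': [{'y': 1}, {'y': 2}]}
#   traverse_obj(d, ('x', 0, 'y'))          # => 1
#   traverse_obj(d, ('x', ..., 'y'))        # => [1, 2]
#   traverse_obj(d, 'missing', default=-1)  # => -1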
5214
5215
5216 def traverse_dict(dictn, keys, casesense=True):
5217 write_string('DeprecationWarning: yt_dlp.utils.traverse_dict is deprecated '
5218 'and may be removed in a future version. Use yt_dlp.utils.traverse_obj instead')
5219 return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True)
5220
5221
5222 def get_first(obj, keys, **kwargs):
5223 return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False)
5224
5225
5226 def variadic(x, allowed_types=(str, bytes, dict)):
5227 return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,)
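# e.g. variadic('spam')  # => ('spam',) since str is in allowed_types
#      variadic([1, 2])  # => [1, 2] (already a non-excluded iterable)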
5228
5229
5230 def decode_base(value, digits):
5231 # Convert the given base-N string (over the alphabet `digits`) to an integer
5232 table = {char: index for index, char in enumerate(digits)}
5233 result = 0
5234 base = len(digits)
5235 for chr in value:
5236 result *= base
5237 result += table[chr]
5238 return result
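# e.g. decode_base('ff', '0123456789abcdef')  # => 255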
5239
5240
5241 def time_seconds(**kwargs):
5242 t = datetime.datetime.now(datetime.timezone(datetime.timedelta(**kwargs)))
5243 return t.timestamp()
5244
5245
5246 # Create a JSON Web Signature (JWS) with the HS256 algorithm;
5247 # the resulting format is JWS Compact Serialization,
5248 # implemented following JWT https://www.rfc-editor.org/rfc/rfc7519.html
5249 # and JWS https://www.rfc-editor.org/rfc/rfc7515.html
5250 def jwt_encode_hs256(payload_data, key, headers={}):
5251 header_data = {
5252 'alg': 'HS256',
5253 'typ': 'JWT',
5254 }
5255 if headers:
5256 header_data.update(headers)
5257 header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8'))
5258 payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8'))
5259 h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256)
5260 signature_b64 = base64.b64encode(h.digest())
5261 token = header_b64 + b'.' + payload_b64 + b'.' + signature_b64
5262 return token
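# Usage sketch (illustrative key and payload):
#   jwt_encode_hs256({'sub': '123'}, 'secret')  # => b'<header>.<payload>.<signature>'
# Note: this uses standard base64, not the unpadded base64url that RFC 7515 mandates.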
5263
5264
5265 # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256
5266 def jwt_decode_hs256(jwt):
5267 header_b64, payload_b64, signature_b64 = jwt.split('.')
5268 payload_data = json.loads(base64.urlsafe_b64decode(payload_b64))
5269 return payload_data
5270
5271
5272 def supports_terminal_sequences(stream):
5273 if compat_os_name == 'nt':
5274 from .compat import WINDOWS_VT_MODE # Must be imported locally
5275 if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586):
5276 return False
5277 elif not os.getenv('TERM'):
5278 return False
5279 try:
5280 return stream.isatty()
5281 except BaseException:
5282 return False
5283
5284
5285 _terminal_sequences_re = re.compile('\033\\[[^m]+m')
5286
5287
5288 def remove_terminal_sequences(string):
5289 return _terminal_sequences_re.sub('', string)
5290
5291
5292 def number_of_digits(number):
5293 return len('%d' % number)
5294
5295
5296 def join_nonempty(*values, delim='-', from_dict=None):
5297 if from_dict is not None:
5298 values = map(from_dict.get, values)
5299 return delim.join(map(str, filter(None, values)))
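# e.g. join_nonempty('en', None, 'US')                         # => 'en-US'
#      join_nonempty('a', 'b', from_dict={'a': 1, 'b': None})  # => '1'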
5300
5301
5302 def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re):
5303 """
5304 Find the largest format dimensions in terms of video width and, for each thumbnail:
5305 * Modify the URL: Match the width with the provided regex and replace with the former width
5306 * Update dimensions
5307
5308 This function is useful with video services that scale the provided thumbnails on demand
5309 """
5310 _keys = ('width', 'height')
5311 max_dimensions = max(
5312 [tuple(format.get(k) or 0 for k in _keys) for format in formats],
5313 default=(0, 0))
5314 if not max_dimensions[0]:
5315 return thumbnails
5316 return [
5317 merge_dicts(
5318 {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
5319 dict(zip(_keys, max_dimensions)), thumbnail)
5320 for thumbnail in thumbnails
5321 ]
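# Usage sketch (illustrative data), for a thumbnail URL that embeds its width:
#   scale_thumbnails_to_max_format_width(
#       [{'width': 1920, 'height': 1080}], [{'url': 'https://e.com/w640/a.jpg'}], r'(?<=/w)\d+')
#   # => [{'url': 'https://e.com/w1920/a.jpg', 'width': 1920, 'height': 1080}]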
5322
5323
5324 def parse_http_range(range):
5325 """ Parse value of "Range" or "Content-Range" HTTP header into tuple. """
5326 if not range:
5327 return None, None, None
5328 crg = re.search(r'bytes[ =](\d+)-(\d+)?(?:/(\d+))?', range)
5329 if not crg:
5330 return None, None, None
5331 return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3))
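# e.g. parse_http_range('bytes 200-999/8000')  # => (200, 999, 8000)
#      parse_http_range('bytes=500-')          # => (500, None, None)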
5332
5333
5334 class Config:
5335 own_args = None
5336 filename = None
5337 __initialized = False
5338
5339 def __init__(self, parser, label=None):
5340 self._parser, self.label = parser, label
5341 self._loaded_paths, self.configs = set(), []
5342
5343 def init(self, args=None, filename=None):
5344 assert not self.__initialized
5345 directory = ''
5346 if filename:
5347 location = os.path.realpath(filename)
5348 directory = os.path.dirname(location)
5349 if location in self._loaded_paths:
5350 return False
5351 self._loaded_paths.add(location)
5352
5353 self.__initialized = True
5354 self.own_args, self.filename = args, filename
5355 for location in self._parser.parse_args(args)[0].config_locations or []:
5356 location = os.path.join(directory, expand_path(location))
5357 if os.path.isdir(location):
5358 location = os.path.join(location, 'yt-dlp.conf')
5359 if not os.path.exists(location):
5360 self._parser.error(f'config location {location} does not exist')
5361 self.append_config(self.read_file(location), location)
5362 return True
5363
5364 def __str__(self):
5365 label = join_nonempty(
5366 self.label, 'config', f'"{self.filename}"' if self.filename else '',
5367 delim=' ')
5368 return join_nonempty(
5369 self.own_args is not None and f'{label[0].upper()}{label[1:]}: {self.hide_login_info(self.own_args)}',
5370 *(f'\n{c}'.replace('\n', '\n| ')[1:] for c in self.configs),
5371 delim='\n')
5372
5373 @staticmethod
5374 def read_file(filename, default=[]):
5375 try:
5376 optionf = open(filename)
5377 except IOError:
5378 return default # silently skip if file is not present
5379 try:
5380 # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56
5381 contents = optionf.read()
5382 if sys.version_info < (3,):
5383 contents = contents.decode(preferredencoding())
5384 res = compat_shlex_split(contents, comments=True)
5385 finally:
5386 optionf.close()
5387 return res
5388
5389 @staticmethod
5390 def hide_login_info(opts):
5391 PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
5392 eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
5393
5394 def _scrub_eq(o):
5395 m = eqre.match(o)
5396 if m:
5397 return m.group('key') + '=PRIVATE'
5398 else:
5399 return o
5400
5401 opts = list(map(_scrub_eq, opts))
5402 for idx, opt in enumerate(opts):
5403 if opt in PRIVATE_OPTS and idx + 1 < len(opts):
5404 opts[idx + 1] = 'PRIVATE'
5405 return opts
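# e.g. Config.hide_login_info(['-u', 'name', '--password=secret'])
#   # => ['-u', 'PRIVATE', '--password=PRIVATE']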
5406
5407 def append_config(self, *args, label=None):
5408 config = type(self)(self._parser, label)
5409 config._loaded_paths = self._loaded_paths
5410 if config.init(*args):
5411 self.configs.append(config)
5412
5413 @property
5414 def all_args(self):
5415 for config in reversed(self.configs):
5416 yield from config.all_args
5417 yield from self.own_args or []
5418
5419 def parse_args(self):
5420 return self._parser.parse_args(list(self.all_args))
5421
5422
5423 class WebSocketsWrapper():
5424 """Wraps websockets module to use in non-async scopes"""
5425
5426 def __init__(self, url, headers=None):
5427 self.loop = asyncio.events.new_event_loop()
5428 self.conn = compat_websockets.connect(
5429 url, extra_headers=headers, ping_interval=None,
5430 close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf'))
5431 atexit.register(self.__exit__, None, None, None)
5432
5433 def __enter__(self):
5434 self.pool = self.run_with_loop(self.conn.__aenter__(), self.loop)
5435 return self
5436
5437 def send(self, *args):
5438 self.run_with_loop(self.pool.send(*args), self.loop)
5439
5440 def recv(self, *args):
5441 return self.run_with_loop(self.pool.recv(*args), self.loop)
5442
5443 def __exit__(self, type, value, traceback):
5444 try:
5445 return self.run_with_loop(self.conn.__aexit__(type, value, traceback), self.loop)
5446 finally:
5447 self._cancel_all_tasks(self.loop) # cancel leftover tasks while the loop can still run them
5448 self.loop.close()
5449
5450 # taken from https://github.com/python/cpython/blob/3.9/Lib/asyncio/runners.py with modifications
5451 # For contributors: if any new library that uses asyncio needs to run in a non-async scope, move these functions out of this class
5452 @staticmethod
5453 def run_with_loop(main, loop):
5454 if not asyncio.coroutines.iscoroutine(main):
5455 raise ValueError(f'a coroutine was expected, got {main!r}')
5456
5457 try:
5458 return loop.run_until_complete(main)
5459 finally:
5460 loop.run_until_complete(loop.shutdown_asyncgens())
5461 if hasattr(loop, 'shutdown_default_executor'):
5462 loop.run_until_complete(loop.shutdown_default_executor())
5463
5464 @staticmethod
5465 def _cancel_all_tasks(loop):
5466 to_cancel = asyncio.tasks.all_tasks(loop)
5467
5468 if not to_cancel:
5469 return
5470
5471 for task in to_cancel:
5472 task.cancel()
5473
5474 loop.run_until_complete(
5475 asyncio.tasks.gather(*to_cancel, loop=loop, return_exceptions=True))
5476
5477 for task in to_cancel:
5478 if task.cancelled():
5479 continue
5480 if task.exception() is not None:
5481 loop.call_exception_handler({
5482 'message': 'unhandled exception during asyncio.run() shutdown',
5483 'exception': task.exception(),
5484 'task': task,
5485 })
5486
5487
5488 has_websockets = bool(compat_websockets)
5489
5490
5491 def merge_headers(*dicts):
5492 """Merge dicts of http headers case insensitively, prioritizing the latter ones"""
5493 return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}
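# e.g. merge_headers({'user-agent': 'UA1'}, {'User-Agent': 'UA2'})  # => {'User-Agent': 'UA2'}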