jfr.im git - yt-dlp.git/blob - youtube_dl/compat.py
[godtv] Relax _VALID_URL
[yt-dlp.git] / youtube_dl / compat.py
1 from __future__ import unicode_literals
2
3 import binascii
4 import collections
5 import email
6 import getpass
7 import io
8 import optparse
9 import os
10 import re
11 import shlex
12 import shutil
13 import socket
14 import struct
15 import subprocess
16 import sys
17 import itertools
18 import xml.etree.ElementTree
19
20
21 try:
22 import urllib.request as compat_urllib_request
23 except ImportError: # Python 2
24 import urllib2 as compat_urllib_request
25
26 try:
27 import urllib.error as compat_urllib_error
28 except ImportError: # Python 2
29 import urllib2 as compat_urllib_error
30
31 try:
32 import urllib.parse as compat_urllib_parse
33 except ImportError: # Python 2
34 import urllib as compat_urllib_parse
35
36 try:
37 from urllib.parse import urlparse as compat_urllib_parse_urlparse
38 except ImportError: # Python 2
39 from urlparse import urlparse as compat_urllib_parse_urlparse
40
41 try:
42 import urllib.parse as compat_urlparse
43 except ImportError: # Python 2
44 import urlparse as compat_urlparse
45
46 try:
47 import urllib.response as compat_urllib_response
48 except ImportError: # Python 2
49 import urllib as compat_urllib_response
50
51 try:
52 import http.cookiejar as compat_cookiejar
53 except ImportError: # Python 2
54 import cookielib as compat_cookiejar
55
56 try:
57 import http.cookies as compat_cookies
58 except ImportError: # Python 2
59 import Cookie as compat_cookies
60
61 try:
62 import html.entities as compat_html_entities
63 except ImportError: # Python 2
64 import htmlentitydefs as compat_html_entities
65
66 try:
67 import http.client as compat_http_client
68 except ImportError: # Python 2
69 import httplib as compat_http_client
70
71 try:
72 from urllib.error import HTTPError as compat_HTTPError
73 except ImportError: # Python 2
74 from urllib2 import HTTPError as compat_HTTPError
75
76 try:
77 from urllib.request import urlretrieve as compat_urlretrieve
78 except ImportError: # Python 2
79 from urllib import urlretrieve as compat_urlretrieve
80
81 try:
82 from html.parser import HTMLParser as compat_HTMLParser
83 except ImportError: # Python 2
84 from HTMLParser import HTMLParser as compat_HTMLParser
85
86
# Accessor for a "discard everything" subprocess target: the
# subprocess.DEVNULL constant when it can be imported, otherwise a newly
# opened handle on the null device.
try:
    from subprocess import DEVNULL
except ImportError:
    def compat_subprocess_get_DEVNULL():
        """Open os.path.devnull for writing and return the file object."""
        return open(os.path.devnull, 'w')
else:
    def compat_subprocess_get_DEVNULL():
        """Return the subprocess.DEVNULL constant."""
        return DEVNULL
92
93 try:
94 import http.server as compat_http_server
95 except ImportError:
96 import BaseHTTPServer as compat_http_server
97
# Text string type: the `unicode` builtin where it exists, `str` otherwise.
try:
    unicode
except NameError:
    compat_str = str
else:
    compat_str = unicode  # Python 2
102
try:
    from urllib.parse import (
        unquote as compat_urllib_parse_unquote,
        unquote_plus as compat_urllib_parse_unquote_plus,
        unquote_to_bytes as compat_urllib_parse_unquote_to_bytes,
    )
except ImportError:  # Python 2
    # Reuse the stdlib's pattern when the module exposes one; otherwise
    # compile an equivalent pattern matching runs of ASCII characters.
    if hasattr(compat_urllib_parse, '_asciire'):
        _asciire = compat_urllib_parse._asciire
    else:
        _asciire = re.compile('([\x00-\x7f]+)')

    # HACK: The functions below are the unquote_to_bytes, unquote and
    # unquote_plus implementations taken from cpython 3.4.3's stdlib.
    # Python 2's own versions are apparently broken
    # (see https://github.com/rg3/youtube-dl/pull/6244)

    def compat_urllib_parse_unquote_to_bytes(string):
        """Decode %xx escapes and return bytes.

        unquote_to_bytes('abc%20def') -> b'abc def'.
        """
        # Text input is encoded as UTF-8 first. That only matters when it
        # holds unescaped non-ASCII characters, which URIs should not.
        if not string:
            # Attribute access only: fails for falsy non-string objects.
            string.split
            return b''
        if isinstance(string, compat_str):
            string = string.encode('utf-8')
        pieces = string.split(b'%')
        if len(pieces) == 1:
            return string
        out = [pieces[0]]
        for chunk in pieces[1:]:
            try:
                decoded = compat_urllib_parse._hextochr[chunk[:2]]
            except KeyError:
                # Not a known hex pair: keep the percent sign literally.
                out.append(b'%')
                out.append(chunk)
            else:
                out.append(decoded)
                out.append(chunk[2:])
        return b''.join(out)

    def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
        """Return string with %xx escapes replaced by the characters they encode.

        encoding and errors are handed to bytes.decode() for the
        percent-encoded sequences; None selects 'utf-8' / 'replace'.

        unquote('abc%20def') -> 'abc def'.
        """
        if '%' not in string:
            # Attribute access only: fails for non-string-like objects.
            string.split
            return string
        encoding = 'utf-8' if encoding is None else encoding
        errors = 'replace' if errors is None else errors
        # Splitting on a capturing group yields the ASCII runs at the odd
        # indexes and the text between them at the even indexes.
        segments = _asciire.split(string)
        out = [segments[0]]
        for pos in range(1, len(segments), 2):
            raw = compat_urllib_parse_unquote_to_bytes(segments[pos])
            out.append(raw.decode(encoding, errors))
            out.append(segments[pos + 1])
        return ''.join(out)

    def compat_urllib_parse_unquote_plus(string, encoding='utf-8', errors='replace'):
        """Like unquote(), but plus signs are turned into spaces first, as
        needed for HTML form values.

        unquote_plus('%7e/abc+def') -> '~/abc def'
        """
        return compat_urllib_parse_unquote(
            string.replace('+', ' '), encoding, errors)
172
try:
    from urllib.parse import urlencode as compat_urllib_parse_urlencode
except ImportError:  # Python 2
    # Python 2's urlencode chokes on a mixture of byte and unicode strings.
    # Rather than porting the Python 3 implementation and its helpers, the
    # whole query is recursively converted to byte strings before the call.
    def compat_urllib_parse_urlencode(query, doseq=0, encoding='utf-8'):
        """Encode every text string inside query with encoding, then urlencode it."""
        def _to_bytes(item):
            # Dicts, lists and tuples are rebuilt with converted members;
            # tuples stay tuples, lists stay lists.
            if isinstance(item, dict):
                return dict(
                    (_to_bytes(key), _to_bytes(val)) for key, val in item.items())
            if isinstance(item, (list, tuple,)):
                converted = [_to_bytes(member) for member in item]
                return tuple(converted) if isinstance(item, tuple) else converted
            if isinstance(item, compat_str):
                return item.encode(encoding)
            return item

        return compat_urllib_parse.urlencode(_to_bytes(query), doseq=doseq)
198
try:
    from urllib.request import DataHandler as compat_urllib_request_DataHandler
except ImportError:  # Python < 3.4
    # Ported from CPython 98774:1733b3bd46db, Lib/urllib/request.py
    class compat_urllib_request_DataHandler(compat_urllib_request.BaseHandler):
        """urllib handler that serves data: URLs (RFC 2397)."""

        def data_open(self, req):
            """Build a response object from the payload embedded in a data: URL.

            Any POSTed data is ignored. URL syntax:

                dataurl   := "data:" [ mediatype ] [ ";base64" ] "," data
                mediatype := [ type "/" subtype ] *( ";" parameter )
                data      := *urlchar
                parameter := attribute "=" value
            """
            full_url = req.get_full_url()

            _scheme, remainder = full_url.split(':', 1)
            media, payload = remainder.split(',', 1)

            # Even base64 payloads may be percent-quoted, so always unquote.
            payload = compat_urllib_parse_unquote_to_bytes(payload)
            base64_marker = ';base64'
            if media.endswith(base64_marker):
                payload = binascii.a2b_base64(payload)
                media = media[:-len(base64_marker)]

            media = media or 'text/plain;charset=US-ASCII'

            header_text = 'Content-type: %s\nContent-length: %d\n' % (
                media, len(payload))
            headers = email.message_from_string(header_text)

            return compat_urllib_response.addinfourl(
                io.BytesIO(payload), headers, full_url)
232
# Base string type: the `basestring` builtin where it exists, `str` otherwise.
try:
    basestring
except NameError:
    compat_basestring = str
else:
    compat_basestring = basestring  # Python 2
237
# Code point -> text character: the `unichr` builtin where it exists, `chr` otherwise.
try:
    unichr
except NameError:
    compat_chr = chr
else:
    compat_chr = unichr  # Python 2
242
243 try:
244 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
245 except ImportError: # Python 2.6
246 from xml.parsers.expat import ExpatError as compat_xml_parse_error
247
248
etree = xml.etree.ElementTree


class _TreeBuilder(etree.TreeBuilder):
    """TreeBuilder whose doctype() hook does nothing."""

    def doctype(self, name, pubid, system):
        pass


if sys.version_info[0] < 3:
    # python 2.x tries to encode unicode strings with ascii (see the
    # XMLParser._fixtext method)
    try:
        _etree_iter = etree.Element.iter
    except AttributeError:  # Python <=2.6
        def _etree_iter(root):
            """Yield every descendant of root, each parent before its children."""
            for child in root.findall('*'):
                yield child
                for descendant in _etree_iter(child):
                    yield descendant

    # on 2.6 XML doesn't have a parser argument, function copied from CPython
    # 2.7 source
    def _XML(text, parser=None):
        """Feed text to parser (a default one if none is given) and return its result."""
        if not parser:
            parser = etree.XMLParser(target=_TreeBuilder())
        parser.feed(text)
        return parser.close()

    def _element_factory(*args, **kwargs):
        """Create an Element and decode any bytes attribute values as UTF-8."""
        element = etree.Element(*args, **kwargs)
        for key, value in element.items():
            if isinstance(value, bytes):
                element.set(key, value.decode('utf-8'))
        return element

    def compat_etree_fromstring(text):
        """Parse text as XML and return the root, with bytes element text decoded as UTF-8."""
        builder = _TreeBuilder(element_factory=_element_factory)
        root = _XML(text, parser=etree.XMLParser(target=builder))
        for node in _etree_iter(root):
            # None is never a bytes instance, so one check covers both cases.
            if isinstance(node.text, bytes):
                node.text = node.text.decode('utf-8')
        return root
else:
    def compat_etree_fromstring(text):
        """Parse text as XML using a parser that targets _TreeBuilder."""
        parser = etree.XMLParser(target=_TreeBuilder())
        return etree.XML(text, parser=parser)
292
if sys.version_info >= (2, 7):
    def compat_xpath(xpath):
        """Return xpath unchanged."""
        return xpath
else:
    # Here comes the crazy part: In 2.6, if the xpath is a unicode,
    # .//node does not match if a node is a direct child of . !
    def compat_xpath(xpath):
        """Return xpath encoded as ASCII when it is a text string, unchanged otherwise."""
        return xpath.encode('ascii') if isinstance(xpath, compat_str) else xpath
302
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Split the query string qs into a list of decoded (name, value) pairs.

        Fields are separated by '&' or ';'. Blank values are dropped unless
        keep_blank_values is set; a field without '=' raises ValueError when
        strict_parsing is set.
        """
        def _decode(component):
            # '+' means space in form data; the rest is percent-decoded.
            unquoted = compat_urllib_parse_unquote(
                component.replace('+', ' '), encoding=encoding, errors=errors)
            return compat_str(unquoted)

        fields = []
        for amp_part in qs.split('&'):
            fields.extend(amp_part.split(';'))

        result = []
        for field in fields:
            if not field and not strict_parsing:
                continue
            parts = field.split('=', 1)
            if len(parts) != 2:
                if strict_parsing:
                    raise ValueError('bad query field: %r' % (field,))
                # Handle case of a control-name with no equal sign
                if not keep_blank_values:
                    continue
                parts.append('')
            if len(parts[1]) or keep_blank_values:
                name = _decode(parts[0])
                value = _decode(parts[1])
                result.append((name, value))
        return result

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse qs into a dict mapping each name to the list of its values."""
        grouped = {}
        for name, value in _parse_qsl(qs, keep_blank_values, strict_parsing,
                                      encoding=encoding, errors=errors):
            grouped.setdefault(name, []).append(value)
        return grouped
349
try:
    from shlex import quote as compat_shlex_quote
except ImportError:  # Python < 3.3
    def compat_shlex_quote(s):
        """Return s quoted for use as a single shell token.

        A string made only of word characters, '-', '_', '.' and '/' is
        returned unchanged. Anything else (including the empty string) is
        wrapped in single quotes, with each embedded single quote spelled
        as '"'"'.
        """
        # \Z rather than $: `$` also matches just before a trailing newline,
        # which would let a string such as 'abc\n' through unquoted.
        if re.match(r'^[-_\w./]+\Z', s):
            return s
        return "'" + s.replace("'", "'\"'\"'") + "'"
358
359
if sys.version_info < (2, 7, 3):
    # Working around shlex issue with unicode strings on some python 2
    # versions (see http://bugs.python.org/issue1548891)
    def compat_shlex_split(s, comments=False, posix=True):
        """shlex.split() that first encodes a text string s as UTF-8."""
        encoded = s.encode('utf-8') if isinstance(s, compat_str) else s
        return shlex.split(encoded, comments, posix)
else:
    compat_shlex_split = shlex.split
369
370
def compat_ord(c):
    """Return c itself when it is exactly an int, otherwise ord(c)."""
    return c if type(c) is int else ord(c)
376
377
# When os.name reports 'java', the name to use is taken from os._name instead.
if os.name == 'java':
    compat_os_name = os._name
else:
    compat_os_name = os.name
379
380
if sys.version_info >= (3, 0):
    compat_getenv = os.getenv
    compat_expanduser = os.path.expanduser

    def compat_setenv(key, value, env=os.environ):
        """Store value under key in env (os.environ by default)."""
        env[key] = value
else:
    # Environment variables should be decoded with filesystem encoding.
    # Otherwise it will fail if any non-ASCII characters present (see #3854 #3217 #2918)

    def compat_getenv(key, default=None):
        """Return os.getenv(key, default), decoded with the filesystem encoding.

        A falsy result (None or an empty string) is returned as-is.
        """
        from .utils import get_filesystem_encoding
        env = os.getenv(key, default)
        if env:
            env = env.decode(get_filesystem_encoding())
        return env

    def compat_setenv(key, value, env=os.environ):
        """Store value under key in env, encoding text strings with the
        filesystem encoding first."""
        def encode(v):
            from .utils import get_filesystem_encoding
            return v.encode(get_filesystem_encoding()) if isinstance(v, compat_str) else v
        env[encode(key)] = encode(value)

    # HACK: The default implementations of os.path.expanduser from cpython do not decode
    # environment variables with filesystem encoding. We will work around this by
    # providing adjusted implementations.
    # The following are os.path.expanduser implementations from cpython 2.7.8 stdlib
    # for different platforms with correct environment variables decoding.

    if compat_os_name == 'posix':
        def compat_expanduser(path):
            """Expand ~ and ~user constructions.  If user or $HOME is unknown,
            do nothing."""
            if not path.startswith('~'):
                return path
            # i is the end of the "~" / "~user" prefix.
            i = path.find('/', 1)
            if i < 0:
                i = len(path)
            if i == 1:
                # Bare "~": $HOME if set, else the current user's pwd entry.
                if 'HOME' not in os.environ:
                    import pwd
                    userhome = pwd.getpwuid(os.getuid()).pw_dir
                else:
                    userhome = compat_getenv('HOME')
            else:
                # "~user": look the named user up in the password database.
                import pwd
                try:
                    pwent = pwd.getpwnam(path[1:i])
                except KeyError:
                    return path
                userhome = pwent.pw_dir
            userhome = userhome.rstrip('/')
            return (userhome + path[i:]) or '/'
    elif compat_os_name == 'nt' or compat_os_name == 'ce':
        def compat_expanduser(path):
            """Expand ~ and ~user constructs.

            If user or $HOME is unknown, do nothing."""
            if path[:1] != '~':
                return path
            # i is the end of the "~" / "~user" prefix.
            i, n = 1, len(path)
            while i < n and path[i] not in '/\\':
                i = i + 1

            if 'HOME' in os.environ:
                userhome = compat_getenv('HOME')
            elif 'USERPROFILE' in os.environ:
                userhome = compat_getenv('USERPROFILE')
            elif 'HOMEPATH' not in os.environ:
                return path
            else:
                # compat_getenv returns None for an unset variable (it never
                # raises KeyError), so fall back to '' explicitly instead of
                # passing None on to os.path.join.
                drive = compat_getenv('HOMEDRIVE') or ''
                userhome = os.path.join(drive, compat_getenv('HOMEPATH'))

            if i != 1:  # ~user
                userhome = os.path.join(os.path.dirname(userhome), path[1:i])

            return userhome + path[i:]
    else:
        compat_expanduser = os.path.expanduser
464
465
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print s; an assert checks that it is a compat_str instance."""
        assert isinstance(s, compat_str)
        print(s)
else:
    def compat_print(s):
        """Print s encoded with the preferred encoding, using XML character
        references for characters that cannot be encoded."""
        from .utils import preferredencoding
        encoded = s.encode(preferredencoding(), 'xmlcharrefreplace')
        print(encoded)
474
475
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    # Python 2 on win32 only: a text prompt is encoded before the call.
    def compat_getpass(prompt, *args, **kwargs):
        """getpass.getpass() that encodes a text prompt with the preferred encoding."""
        if isinstance(prompt, compat_str):
            from .utils import preferredencoding
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
484
# Line-reading prompt function: `raw_input` where that builtin exists, `input` otherwise.
try:
    raw_input
except NameError:  # Python 3
    compat_input = input
else:
    compat_input = raw_input
489
# Python < 2.6.5 require kwargs to be bytes
def _testfunc(x):
    pass


# Probe: does this interpreter accept the dict below as ** keyword arguments?
try:
    _testfunc(**{'x': 0})
except TypeError:
    def compat_kwargs(kwargs):
        """Return a copy of kwargs with every key converted via bytes()."""
        return dict((bytes(key), value) for key, value in kwargs.items())
else:
    def compat_kwargs(kwargs):
        """Return kwargs unchanged."""
        return kwargs
500
501
if sys.version_info >= (2, 7):
    compat_socket_create_connection = socket.create_connection
else:
    def compat_socket_create_connection(address, timeout, source_address=None):
        """Connect to address, a (host, port) pair, and return the socket.

        Each result of getaddrinfo() is tried in turn. The first socket
        that connects is returned; if none does, the last socket.error is
        raised. source_address, when given, is bound before connecting.
        """
        host, port = address
        last_error = None
        candidates = socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM)
        for family, kind, protocol, _canonname, sockaddr in candidates:
            conn = None
            try:
                conn = socket.socket(family, kind, protocol)
                conn.settimeout(timeout)
                if source_address:
                    conn.bind(source_address)
                conn.connect(sockaddr)
                return conn
            except socket.error as exc:
                # Remember the failure and release the half-open socket.
                last_error = exc
                if conn is not None:
                    conn.close()
        if last_error is None:
            raise socket.error('getaddrinfo returns an empty list')
        raise last_error
526
527
# Fix https://github.com/rg3/youtube-dl/issues/4223
# See http://bugs.python.org/issue9161 for what is broken
def workaround_optparse_bug9161():
    """Patch optparse.OptionGroup.add_option if it rejects this module's strings.

    A throwaway option is added to a scratch group. Only when that raises
    TypeError is add_option replaced by a wrapper that ASCII-encodes every
    text-string argument before delegating to the real method.
    """
    probe_parser = optparse.OptionParser()
    probe_group = optparse.OptionGroup(probe_parser, 'foo')
    try:
        probe_group.add_option('-t')
        return
    except TypeError:
        original_add_option = optparse.OptionGroup.add_option

    def _to_ascii(value):
        if isinstance(value, compat_str):
            return value.encode('ascii', 'replace')
        return value

    def _compat_add_option(self, *args, **kwargs):
        encoded_args = [_to_ascii(arg) for arg in args]
        encoded_kwargs = dict(
            (name, _to_ascii(val)) for name, val in kwargs.items())
        return original_add_option(self, *encoded_args, **encoded_kwargs)

    optparse.OptionGroup.add_option = _compat_add_option
547
if hasattr(shutil, 'get_terminal_size'):  # Python >= 3.3
    compat_get_terminal_size = shutil.get_terminal_size
else:
    _terminal_size = collections.namedtuple('terminal_size', ['columns', 'lines'])

    def _terminal_dimension_from_env(name):
        """Return the environment variable name as an int, or None when it is
        unset, empty or not a valid integer."""
        value = compat_getenv(name)
        if not value:
            return None
        try:
            return int(value)
        except ValueError:
            # A malformed value is treated like a missing one rather than
            # aborting the size lookup.
            return None

    def compat_get_terminal_size(fallback=(80, 24)):
        """Return the terminal size as a (columns, lines) named tuple.

        COLUMNS and LINES from the environment are used when they hold
        positive integers. Any dimension still missing is taken from the
        output of `stty size`, or from fallback when that command cannot
        be run or parsed.
        """
        columns = _terminal_dimension_from_env('COLUMNS')
        lines = _terminal_dimension_from_env('LINES')

        if columns is None or lines is None or columns <= 0 or lines <= 0:
            try:
                sp = subprocess.Popen(
                    ['stty', 'size'],
                    stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                out, _ = sp.communicate()
                # stty prints "<lines> <columns>".
                _lines, _columns = map(int, out.split())
            except Exception:
                # Best effort: any failure here simply selects the fallback.
                _columns, _lines = _terminal_size(*fallback)

            if columns is None or columns <= 0:
                columns = _columns
            if lines is None or lines <= 0:
                lines = _lines
        return _terminal_size(columns, lines)
580
# Probe whether itertools.count accepts the start/step keyword arguments.
try:
    itertools.count(start=0, step=1)
except TypeError:  # Python 2.6
    def compat_itertools_count(start=0, step=1):
        """Yield start, start + step, start + 2 * step, ... without end."""
        current = start
        while True:
            yield current
            current += step
else:
    compat_itertools_count = itertools.count
590
591 if sys.version_info >= (3, 0):
592 from tokenize import tokenize as compat_tokenize_tokenize
593 else:
594 from tokenize import generate_tokens as compat_tokenize_tokenize
595
596
# Probe whether struct accepts this module's string literals as a format spec.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument
    # See https://bugs.python.org/issue19099
    def compat_struct_pack(spec, *args):
        """struct.pack() that first encodes a text-string spec as ASCII."""
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def compat_struct_unpack(spec, *args):
        """struct.unpack() that first encodes a text-string spec as ASCII."""
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    compat_struct_pack = struct.pack
    compat_struct_unpack = struct.unpack
614
615
# Names exported by this module (what `from ... import *` provides).
__all__ = [
    'compat_HTMLParser',
    'compat_HTTPError',
    'compat_basestring',
    'compat_chr',
    'compat_cookiejar',
    'compat_cookies',
    'compat_etree_fromstring',
    'compat_expanduser',
    'compat_get_terminal_size',
    'compat_getenv',
    'compat_getpass',
    'compat_html_entities',
    'compat_http_client',
    'compat_http_server',
    'compat_input',
    'compat_itertools_count',
    'compat_kwargs',
    'compat_ord',
    'compat_os_name',
    'compat_parse_qs',
    'compat_print',
    'compat_setenv',
    'compat_shlex_quote',
    'compat_shlex_split',
    'compat_socket_create_connection',
    'compat_str',
    'compat_struct_pack',
    'compat_struct_unpack',
    'compat_subprocess_get_DEVNULL',
    'compat_tokenize_tokenize',
    'compat_urllib_error',
    'compat_urllib_parse',
    'compat_urllib_parse_unquote',
    'compat_urllib_parse_unquote_plus',
    'compat_urllib_parse_unquote_to_bytes',
    'compat_urllib_parse_urlencode',
    'compat_urllib_parse_urlparse',
    'compat_urllib_request',
    'compat_urllib_request_DataHandler',
    'compat_urllib_response',
    'compat_urlparse',
    'compat_urlretrieve',
    'compat_xml_parse_error',
    'compat_xpath',
    'workaround_optparse_bug9161',
]