]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
Credit @dinesh for rte.ie (#4015)
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
912b38b4 6import calendar
676eb3f2 7import codecs
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96
PH
10import datetime
11import email.utils
f45c185f 12import errno
d77c3dfd 13import gzip
b7ab0590 14import itertools
03f9daab 15import io
f4bfd65f 16import json
d77c3dfd 17import locale
02dbf93f 18import math
d77c3dfd 19import os
4eb7f1d1 20import pipes
c496ca96 21import platform
d77c3dfd 22import re
13ebea79 23import ssl
c496ca96 24import socket
b53466e1 25import struct
1c088fa8 26import subprocess
d77c3dfd 27import sys
181c8655 28import tempfile
01951dda 29import traceback
bcf89ce6 30import xml.etree.ElementTree
d77c3dfd 31import zlib
d77c3dfd 32
8c25f81b
PH
33from .compat import (
34 compat_chr,
35 compat_getenv,
36 compat_html_entities,
8c25f81b
PH
37 compat_parse_qs,
38 compat_str,
39 compat_urllib_error,
40 compat_urllib_parse,
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
43 compat_urlparse,
7d4111ed 44 shlex_quote,
8c25f81b 45)
4644ac55
S
46
47
468e2e92
FV
# This is not clearly defined otherwise: derive the type of compiled
# pattern objects from an actual compiled (empty) pattern.
compiled_regex_type = type(re.compile(''))

# Default HTTP headers; YoutubeDLHandler.http_request adds each of them to
# outgoing requests.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 58
5f6a1245 59
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the lookup
    fails, or it names an encoding that cannot actually be used to encode
    text, 'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec; the locale may report a name Python does not know
        'TEST'.encode(pref)
    except Exception:
        # Intentionally broad (locale.Error, LookupError, TypeError, ...), but
        # unlike a bare except it lets KeyboardInterrupt/SystemExit through.
        pref = 'UTF-8'

    return pref
d77c3dfd 73
f4bfd65f 74
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    base_name = os.path.basename(fn)
    dir_name = os.path.dirname(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        # os.path.basename/dirname return bytes objects here, but
        # NamedTemporaryFile will fail if the filename contains non ascii
        # characters unless we use a unicode object
        fs_encoding = get_filesystem_encoding()
        base_name = base_name.decode(fs_encoding)
        dir_name = dir_name.decode(fs_encoding)

    tmp_kwargs = {
        'suffix': '.tmp',
        'prefix': base_name + '.',
        'dir': dir_name,
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        tmp_kwargs['mode'] = 'wb'
    else:
        tmp_kwargs['mode'] = 'w'
        tmp_kwargs['encoding'] = 'utf-8'

    tmp_file = tempfile.NamedTemporaryFile(**tmp_kwargs)

    try:
        with tmp_file:
            json.dump(obj, tmp_file)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tmp_file.name, fn)
    except:
        # Whatever went wrong, do not leave the temporary file behind; the
        # original error is re-raised unchanged.
        try:
            os.remove(tmp_file.name)
        except OSError:
            pass
        raise
127
128
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # Only allow characters that are safe to splice into the expression
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        candidates = (
            el for el in node.findall(xpath) if el.attrib.get(key) == val)
        return next(candidates, None)
147
d7e66d39
JMF
148# On python2.6 the xml.etree.ElementTree.Element methods don't support
149# the namespace parameter
5f6a1245
JW
150
151
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path to '{uri}tag', using ns_map (prefix -> uri)."""
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            # No namespace prefix; keep the step as it is
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([expand(step) for step in path.split('/')])
162
d77c3dfd 163
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath below node.

    If there is no such element or it has no text, return None, or raise
    ExtractorError when fatal is set (name, if given, is used in the message).
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None and found.text is not None:
        return found.text
    if not fatal:
        return None
    if name is None:
        name = xpath
    raise ExtractorError('Could not find XML element %s' % name)
176
177
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the passed HTML document"""
    return get_element_by_attribute(attribute="id", value=id, html=html)
181
12ea2f30 182
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    found = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s*>
         (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if found is None:
        return None

    content = found.group('content')
    # Content that starts with a quote loses its first and last character
    if content[:1] in ('"', "'"):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 204
9e6dd238
FV
205
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    # Convenience for sanitizing descriptions etc.
    if html is None:
        return html

    # Source newlines carry no meaning in HTML; only markup produces breaks
    text = html.replace('\n', ' ')
    substitutions = (
        # <br> and paragraph boundaries become newlines
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip all remaining html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
221
222
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it retries with the
    characters that are forbidden on win32 replaced by '#' in the final path
    component. If that fails too (or changes nothing), the exception is
    raised like the standard open() function would.

    A filename of '-' means standard output (its binary buffer if available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Renaming cannot help with a permission problem
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars. Only the
        # last component is rewritten: the directory part must already exist
        # for open() to succeed, and its separators must not be touched.
        head, tail = os.path.split(filename)
        alt_filename = os.path.join(
            head, re.sub('[/<>:"\\|\\\\?\\*]', '#', tail))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
256
257
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Not a parseable RFC 2822 date
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 265
5f6a1245 266
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped entirely
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (
                char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    cleaned = ''.join([replace_insane(char) for char in s])
    if is_id:
        # IDs are kept as close to the original as possible
        return cleaned

    # Collapse runs of underscores and trim them from both ends
    cleaned = re.sub(r'_{2,}', '_', cleaned).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and cleaned.startswith('-_'):
        cleaned = cleaned[2:]
    return cleaned or '_'
d77c3dfd 298
5f6a1245 299
def orderedSet(iterable):
    """ Return a list of the items of iterable without duplicates, in first-seen order """
    unique = []
    for item in iterable:
        # List membership (not a set) so unhashable items work as well
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 307
912b38b4 308
4e408e47
PH
309def _htmlentity_transform(entity):
310 """Transforms an HTML entity to a character."""
311 # Known non-numeric HTML entity
312 if entity in compat_html_entities.name2codepoint:
313 return compat_chr(compat_html_entities.name2codepoint[entity])
314
315 mobj = re.match(r'#(x?[0-9]+)', entity)
316 if mobj is not None:
317 numstr = mobj.group(1)
28e614de 318 if numstr.startswith('x'):
4e408e47 319 base = 16
28e614de 320 numstr = '0%s' % numstr
4e408e47
PH
321 else:
322 base = 10
323 return compat_chr(int(numstr, base))
324
325 # Unknown entity in name, return its literal representation
28e614de 326 return ('&%s;' % entity)
4e408e47
PH
327
328
def unescapeHTML(s):
    """Replace the HTML entities ('&...;') in s; None is passed through."""
    if s is None:
        return None
    assert type(s) == compat_str

    def _replace_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', _replace_entity, s)
d77c3dfd 336
8bf48f23
PH
337
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up have Unicode APIs
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = (
        sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)
    if unicode_windows and not for_subprocess:
        # Pass the unicode string through untouched
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
364
f07b74fc
PH
365
def encodeArgument(s):
    """Encode a command line argument the way encodeFilename does for subprocesses."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
373
374
8271226a
PH
def decodeOption(optval):
    """Return optval as text, decoding bytes with the preferred encoding; None is passed through."""
    if optval is not None:
        if isinstance(optval, bytes):
            optval = optval.decode(preferredencoding())

        assert isinstance(optval, compat_str)
    return optval
1c256f70 383
5f6a1245 384
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The thresholds are inclusive, so exactly one minute is '1:00' and
    exactly one hour is '1:00:00' (not '60' and '60:00').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
392
a0ddb8a2
PH
393
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Create an HTTPS handler for the running Python version.

    opts_no_check_certificate disables certificate verification when true.
    Any further keyword arguments are passed on to the handler class.
    """
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        # We are the client and verify servers, which is Purpose.SERVER_AUTH;
        # CLIENT_AUTH would build a server-side context that does not
        # verify the peer's certificate.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname has to be switched off first, otherwise
            # setting CERT_NONE raises ValueError
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Try TLSv1 first and fall back to SSLv23
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
ea6d901e 434
732ea2f0 435
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Errors of these types in the exception being handled are never bugs
        expected_types = (
            compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_types:
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            update_cmd = (
                'type youtube-dl -U to update'
                if ytdl_is_updateable()
                else 'see https://yt-dl.org/update on how to update')
            msg += ''.join([
                '; please report this issue on https://yt-dl.org/bug .',
                ' Make sure you are using the latest version; %s.' % update_cmd,
                ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
            ])
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none."""
        if self.traceback is not None:
            return ''.join(traceback.format_tb(self.traceback))
        return None
01951dda 469
1c256f70 470
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected extraction error for a URL that is not supported."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
476
477
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Extraction error for a regular expression that did not match"""
481
482
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception if they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
495
496
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
504
505
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        Exception.__init__(self, msg)
        # Keep the message available as an attribute as well
        self.msg = msg
d77c3dfd 515
5f6a1245 516
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
d77c3dfd
FV
520
521
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
529
530
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        Exception.__init__(self, downloaded, expected)
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 545
5f6a1245 546
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, first as a raw stream, then with a zlib header."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response, also when its constructor takes no code."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to req and resolve the Youtubedl-* pseudo headers."""
        for h, v in std_headers.items():
            # Request stores header names capitalized ('User-agent'), so the
            # lookup must use the same form; otherwise the default would
            # always overwrite a header set by the caller.
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp, replaced by a decompressed copy if it is gzip or deflate encoded."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No amount of trimming helped; report the first error
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets exactly the same treatment
    https_request = http_request
    https_response = http_response
bf50b038 632
5de90176 633
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # Optional fractional seconds followed by 'Z' or a +HH:MM / -HHMM offset
    tz_match = re.search(
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    utc_offset = datetime.timedelta()
    if tz_match:
        # Cut the matched suffix off; strptime below only knows the plain form
        date_str = date_str[:-len(tz_match.group(0))]
        if tz_match.group('sign'):
            direction = 1 if tz_match.group('sign') == '+' else -1
            utc_offset = datetime.timedelta(
                hours=direction * int(tz_match.group('hours')),
                minutes=direction * int(tz_match.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    parsed = datetime.datetime.strptime(date_str, date_format)
    return calendar.timegm((parsed - utc_offset).timetuple())
657
658
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    date_str is tried against a list of known formats and finally as an
    RFC 2822 date. day_first selects how ambiguous numeric dates such as
    '8/7/2009' are read (day/month if true, month/day otherwise).
    Returns None if date_str is None or cannot be parsed.
    """

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        # Ordinal day suffixes: 1st, 2nd, 3rd, 4th
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %drd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions.extend([
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # Not this format; keep trying the remaining ones
            pass
    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
718
5f6a1245 719
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url; default_ext if url is None or nothing plausible is found."""
    if url is None:
        return default_ext
    # Ignore the query string, then take what follows the last dot
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if not re.match(r'^[A-Za-z0-9]+$', candidate):
        return default_ext
    return candidate
73e79f2a 728
5f6a1245 729
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the extension of filename with '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join([stem, sub_lang, sub_format])
d4051a8e 732
5f6a1245 733
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    relative = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if relative is None:
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()

    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount
    unit = relative.group('unit')
    # A bad approximation? timedelta has no months or years, so these
    # are counted as 30 and 365 days
    days_per_unit = {'month': 30, 'year': 365}
    if unit in days_per_unit:
        amount *= days_per_unit[unit]
        unit = 'day'
    return today + datetime.timedelta(**{unit + 's': amount})
5f6a1245
JW
761
762
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        # Not in the expected format; hand it back unchanged
        return date_str
    year, month, day = parts.groups()
    return '%s-%s-%s' % (year, month, day)
771
5f6a1245 772
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str.

        A missing bound defaults to the earliest/latest representable date.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(start=day, end=day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Strings are converted first; date objects are used directly
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date and date <= self.end

    def __str__(self):
        return ' - '.join([self.start.isoformat(), self.end.isoformat()])
c496ca96
PH
802
803
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Decode byte strings with the system's preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
812
813
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps file descriptors 1 and 2 to the ids passed to GetStdHandle
    # (the Win32 STD_OUTPUT_HANDLE / STD_ERROR_HANDLE values)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for invalid handles, for non-character files, and when
        # GetConsoleMode fails on the handle
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        # Let the caller write the string the regular way
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call, stopping in front of the
        # next non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # written on its own with a length of 2
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever was actually written
            s = s[written.value:]
    return True
884
885
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    If encoding is given it is used whenever s has to be converted to
    bytes; characters that cannot be encoded are dropped.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    # Python 2 lies about mode of sys.stderr
    expects_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if expects_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream on top of a binary one: encode ourselves and write
        # to the underlying buffer
        target_encoding = (
            encoding or getattr(out, 'encoding', None) or preferredencoding())
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
906
907
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the byte values of bs as a list of ints."""
    if not bs:
        return []
    # Python 3 bytes already yield ints; otherwise convert each character
    return list(bs) if isinstance(bs[0], int) else list(map(ord, bs))
915
c257baff 916
def intlist_to_bytes(xs):
    """Pack the ints in xs, one unsigned byte each, into a byte string."""
    if xs:
        byte_format = '%dB' % len(xs)
        return struct_pack(byte_format, *xs)
    return b''
c38b1e77
PH
921
922
c1c9a79c
PH
# Cross-platform file locking: each branch defines _lock_file(f, exclusive)
# and _unlock_file(f) for the current platform.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes layout of the structure passed to LockFileEx/UnlockFileEx
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high part of the byte count passed to both calls
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raise OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object because _unlock_file needs it again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags is 0x2 for an exclusive lock and 0x0 otherwise
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raise OSError if UnlockFileEx fails."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock f with flock: LOCK_EX if exclusive, LOCK_SH otherwise."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock lock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
986
987
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened in __init__; entering takes the lock via _lock_file
    (shared for mode 'r', exclusive for 'a' and 'w'), leaving releases it
    via _unlock_file and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.mode = mode
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        needs_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, needs_exclusive)
        except IOError:
            # Do not leak the open file when the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1017
1018
4644ac55
S
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1022
1023
def shell_quote(args):
    """Return args joined into a single shell-escaped command line string.

    Arguments given as bytes are decoded with the filesystem encoding first.
    Quoting goes through shlex_quote (already imported by this module and
    used by args_to_str) instead of the undocumented pipes.quote, whose
    module is deprecated.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1033
1034
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the element that ends the
    run (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        return
1042
1043
9d4660ca
PH
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-serialized and appended to url as a '#'-separated,
    urlencoded '__youtubedl_smuggle' field. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + '#' + fragment
9d4660ca
PH
1050
1051
def unsmuggle_url(smug_url, default=None):
    """ Split a URL built by smuggle_url into (url, data).

    URLs without a '#__youtubedl_smuggle' marker are returned unchanged
    together with default. """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        fields = compat_parse_qs(sdata)
        return url, json.loads(fields['__youtubedl_smuggle'][0])
    return smug_url, default
02dbf93f
PH
1059
1060
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count with a binary (1024-based) unit suffix.

    bytes may be a number, a numeric string, or None. Returns 'N/A' for
    None, otherwise a string such as '1.50KiB' with two decimals.

    The unit index is clamped to the suffix table: positive values below
    1/1024 are shown in 'B' (a negative index used to select 'YiB') and
    values beyond the YiB range are shown in 'YiB' (used to raise
    IndexError).
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        exponent = min(max(exponent, 0), len(suffixes) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
f53c966a 1073
1c088fa8 1074
be64b5b0
PH
def parse_filesize(s):
    """Parse a size string such as '1.5 MiB' into a number of bytes.

    The number may use ',' or '.' as decimal separator and must be followed
    by a known unit. Returns an int, or None if s is None or does not start
    with a recognized number/unit pair.
    """
    if s is None:
        return None

    # 'XiB' and the lower-case-prefix form 'xB' are 1024-based, 'XB' and
    # 'Xb' are 1000-based. The lower-case forms are of course incorrect and
    # unofficial, but we support those too
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        _UNIT_TABLE[prefix + 'iB'] = binary
        _UNIT_TABLE[prefix + 'B'] = decimal
        _UNIT_TABLE[prefix.lower() + 'B'] = binary
        _UNIT_TABLE[prefix + 'b'] = decimal

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if not m:
        return None

    amount = float(m.group('num').replace(',', '.'))
    return int(amount * _UNIT_TABLE[m.group('unit')])
be64b5b0
PH
1127
1128
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    A non-empty COLUMNS environment variable takes precedence; otherwise
    the second field printed by `stty size` is used.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort: a missing stty or unparsable output means "unknown".
        # Unlike a bare except, this lets KeyboardInterrupt and SystemExit
        # propagate.
        pass
    return None
caefb1de
PH
1143
1144
def month_by_name(name):
    """ Return the number (1-12) of a month by (locale-independently)
    English name, or None for an unknown name """

    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
18258362
JMF
1155
1156
def fix_xml_ampersands(xml_str):
    """Replace every '&' by '&amp;' in XML, except where it already starts
    one of the five predefined entities or a short character reference"""
    stray_ampersand = (
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return re.sub(stray_ampersand, '&amp;', xml_str)
e3946f98
PH
1163
1164
def setproctitle(title):
    """Try to set the process name to title through libc's prctl.

    Does nothing when libc.so.6 cannot be loaded or has no prctl symbol.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initializing the buffer from the bytes makes it one byte larger than
    # the title, so the string handed to prctl is NUL-terminated (a buffer
    # of exactly len(title_bytes) bytes has no terminator)
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME on Linux
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1178
1179
def remove_start(s, start):
    """Return s without its leading start, or s itself if it does not
    begin with start."""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1184
1185
2b9faf55
PH
def remove_end(s, end):
    """Return s without its trailing end, or s itself if it does not
    finish with end.

    An empty end leaves s untouched; without the guard, s[:-0] would
    evaluate to the empty string.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1190
1191
def url_basename(url):
    """Return the last '/'-separated segment of the path of url."""
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
aa94a6d3
PH
1195
1196
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is reported as HEAD."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
1200
1201
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Return int(v) * invscale // scale, or default if v is None or ''.

    When get_attr is given and v is not None, the attribute of that name is
    read from v first (a missing attribute counts as None).
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '' or v is None:
        return default
    return int(v) * invscale // scale
1209
9572013d 1210
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default if v is None."""
    if v is None:
        return default
    return compat_str(v)
1213
9732d77e
PH
1214
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: ',', '.' and '+' characters
    are dropped before converting; None is passed through """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1221
1222
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default if v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1225
1226
608d11f5
PH
def parse_duration(s):
    """Parse a duration such as '1:02:03', '3 min' or '12.5s' into seconds.

    Returns None if s is None or is not in a recognized format. Inputs that
    consist only of minutes or only of hours are returned as floats; other
    inputs give an int, or a float when a fractional second is present.
    """
    if s is None:
        return None

    m = re.match(
        r'''(?ix)T?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if not m:
        return None
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    total = 0
    for field, seconds_per_unit in (
            ('secs', 1), ('mins', 60), ('hours', 60 * 60)):
        if m.group(field):
            total += int(m.group(field)) * seconds_per_unit
    if m.group('ms'):
        # The 'ms' group includes its leading dot, e.g. '.5'
        total += float(m.group('ms'))
    return total
91d7d0b3
JMF
1261
1262
def prepend_extension(filename, ext):
    """Insert ext before the existing extension of filename,
    e.g. ('video.mp4', 'temp') gives 'video.temp.mp4'."""
    stem, real_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, real_ext)
d70ad093
PH
1266
1267
def check_executable(exe, args=[]):
    """ Checks if the given binary can be started (e.g. is installed
    somewhere in PATH), and returns its name, or False otherwise.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1276
1277
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.
    The combined stdout/stderr output of running exe with args is handed
    to detect_exe_version together with version_re and unrecognized """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = process.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1291
1292
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Return the first group of version_re found in output, or
    unrecognized if the pattern does not match.

    Without version_re, the token following the word 'version' is used.
    """
    assert isinstance(output, compat_str)
    pattern = (
        r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re)
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1302
1303
class PagedList(object):
    """Base class for paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1308
9c44d242
PH
1309
class OnDemandPagedList(PagedList):
    """Paged list that calls pagefunc(pagenum) only for the pages needed.

    Pages are expected to hold pagesize entries each; a shorter page is
    taken to be the last one.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        size = self._pagesize
        collected = []
        for pagenum in itertools.count(start // size):
            firstid = pagenum * size
            nextfirstid = firstid + size
            if start >= nextfirstid:
                continue

            entries = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry inside this page
            if firstid <= start < nextfirstid:
                startv = start % size
            else:
                startv = 0

            # Offset just past the last wanted entry, when the slice ends
            # inside this page
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % size) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                entries = entries[startv:endv]
            collected.extend(entries)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            # Likewise, if we got the whole page, but the next page is not
            # interesting, break out early as well.
            if len(entries) + startv < size or end == nextfirstid:
                break
        return collected
81c2f20b
PH
1351
1352
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list whose number of pages (pagecount) is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list.

        pagefunc is only called for page numbers below pagecount, even when
        end lies beyond the last page.
        """
        res = []
        start_page = start // self._pagesize
        if end is None:
            end_page = self._pagecount
        else:
            # Never ask for pages past the known page count
            end_page = min(self._pagecount, end // self._pagesize + 1)
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page has leading entries to drop
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1380
1381
def uppercase_escape(s):
    """Replace every \\UXXXXXXXX escape (8 hex digits) in s by the
    character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
b53466e1 1388
d05cfe06
S
1389
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    py2_unicode = sys.version_info < (3, 0) and isinstance(s, unicode)
    if py2_unicode:
        s = s.encode('utf-8')
    # The second argument lists the characters that are left unquoted
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1395
1396
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # Scheme and netloc are kept as parsed; the other components are escaped
    escaped = parts._replace(
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment)
    )
    return escaped.geturl()
1406
# struct_pack / struct_unpack: struct.pack / struct.unpack that also accept
# a text format string on interpreters where struct rejects one
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
62e609ab
PH
1423
1424
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file, one per line, and close batch_fd.

    Lines may be bytes (decoded as UTF-8) or text. A leading byte order
    mark is removed, surrounding whitespace is stripped, and empty lines
    as well as lines starting with '#', ';' or ']' are skipped.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # '\ufeff' is the BOM after correct UTF-8 decoding; '\xef\xbb\xbf'
        # is how its three raw bytes look when each was decoded as one
        # character. str.strip() removes neither.
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1439
1440
def urlencode_postdata(*args, **kargs):
    """urlencode the given arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1443
1444
0990305d
PH
# etree_iter(node): iterate over the elements of an ElementTree node
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1449
1450
bcf89ce6
PH
def parse_xml(s):
    """Parse the text string s as XML and return the root element.

    The parser is built on a TreeBuilder whose doctype handler does nothing.
    """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None and not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')
    return tree
e68301af
PH
1466
1467
a1a530b0
PH
# Age limit associated with each US rating label
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """Return the age limit for s as an int, or None if it is unknown.

    s is either one or two digits with an optional trailing '+' (e.g.
    '18+') or a key of US_RATINGS.
    """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
146c80e2
S
1482
1483
def strip_jsonp(code):
    """Return the argument of a JSONP call such as 'callback({...});'.

    A trailing ';' and trailing '//' comments are dropped too; text that
    does not look like such a call is returned unchanged.
    """
    jsonp_call = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_call, r'\1', code)
478c2c61
PH
1487
1488
e05f6939
PH
def js_to_json(code):
    """Rewrite a JavaScript object literal towards valid JSON.

    Bare identifiers and single-quoted strings become double-quoted
    strings (true, false, null and double-quoted strings are kept), and a
    comma directly before a closing ']' is removed.
    """
    # How escapes inside a single-quoted string translate once the string
    # is wrapped in double quotes
    requote = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"", lambda m: requote[m.group(0)], token[1:-1])
        return '"%s"' % token

    quoted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), quoted)
1512
1513
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values:
    the returned function maps an id to its index in quality_ids, or to -1
    if it is not listed """
    def q(qid):
        return quality_ids.index(qid) if qid in quality_ids else -1
    return q
1522
acd69589
PH
1523
1524DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1525
a020a0dc
PH
1526
def limit_length(s, length):
    """ Add ellipses to overly long strings: a string longer than length
    is cut so that, with '...' appended, it is length characters long.
    None is passed through """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
1535
1536
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
48844745
PH
1539
1540
def is_outdated_version(version, limit, assume_new=True):
    """Return whether version is older than limit.

    When version is empty or either string cannot be parsed by
    version_tuple, the answer is "not assume_new".
    """
    undecided = not assume_new
    if not version:
        return undecided
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return undecided
732ea2f0
PH
1548
1549
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U, i.e. whether this
    module was loaded by a zipimporter or sys has a 'frozen' attribute """
    from zipimport import zipimporter

    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
7d4111ed
PH
1555
1556
def args_to_str(args):
    """Get a short string representation for a subprocess command."""
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
2ccd1b10
PH
1560
1561
def urlhandle_detect_ext(url_handle):
    """Return the part of the response's Content-Type header that follows
    the first '/' (e.g. 'mp4' for 'video/mp4')."""
    try:
        headers = url_handle.headers
    except AttributeError:  # Python < 3
        content_type = url_handle.info().getheader('Content-Type')
    else:
        content_type = headers['Content-Type']

    return content_type.split("/")[1]
05900629
PH
1570
1571
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set by the user, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit