]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
Improve and test ffmpeg version detection
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
912b38b4 6import calendar
676eb3f2 7import codecs
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96
PH
10import datetime
11import email.utils
f45c185f 12import errno
d77c3dfd 13import gzip
b7ab0590 14import itertools
03f9daab 15import io
f4bfd65f 16import json
d77c3dfd 17import locale
02dbf93f 18import math
d77c3dfd 19import os
4eb7f1d1 20import pipes
c496ca96 21import platform
d77c3dfd 22import re
13ebea79 23import ssl
c496ca96 24import socket
b53466e1 25import struct
1c088fa8 26import subprocess
d77c3dfd 27import sys
181c8655 28import tempfile
01951dda 29import traceback
bcf89ce6 30import xml.etree.ElementTree
d77c3dfd 31import zlib
d77c3dfd 32
8c25f81b
PH
33from .compat import (
34 compat_chr,
35 compat_getenv,
36 compat_html_entities,
8c25f81b
PH
37 compat_parse_qs,
38 compat_str,
39 compat_urllib_error,
40 compat_urllib_parse,
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
43 compat_urlparse,
7d4111ed 44 shlex_quote,
8c25f81b 45)
4644ac55
S
46
47
468e2e92
FV
# The class of compiled regular expressions has no dedicated public name
# here, so it is taken from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Headers that YoutubeDLHandler.http_request adds to outgoing requests.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 58
5f6a1245 59
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails or names a codec that cannot encode a test string,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify that the reported codec is actually usable
        'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (any lookup/encode failure means "fall back"),
        # but KeyboardInterrupt and SystemExit must not be swallowed.
        pref = 'UTF-8'

    return pref
d77c3dfd 73
f4bfd65f 74
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)

    if sys.version_info >= (3, 0) or sys.platform == 'win32':
        path_basename = os.path.basename
        path_dirname = os.path.dirname
    else:
        fs_encoding = get_filesystem_encoding()

        # os.path.basename/dirname return bytes objects here, but
        # NamedTemporaryFile fails on non-ASCII names unless it is handed
        # unicode objects, so decode the results.
        def path_basename(path):
            return os.path.basename(path).decode(fs_encoding)

        def path_dirname(path):
            return os.path.dirname(path).decode(fs_encoding)

    # The temporary file is created next to the target so that the final
    # rename stays within one directory.
    tmp_kwargs = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        tmp_kwargs['mode'] = 'wb'
    else:
        tmp_kwargs['mode'] = 'w'
        tmp_kwargs['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**tmp_kwargs)

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except:
        # Whatever went wrong, do not leave the temporary file behind;
        # the original error is always re-raised.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
127
128
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        return next(
            (el for el in node.findall(xpath) if el.attrib.get(key) == val),
            None)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression, so only a
        # conservative character set is accepted.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        return node.find(xpath + "[@%s='%s']" % (key, val))
147
d7e66d39
JMF
148# On python2.6 the xml.etree.ElementTree.Element methods don't support
149# the namespace parameter
5f6a1245
JW
150
151
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path to '{uri}tag' using ns_map.

    Steps without a prefix are kept unchanged. A prefix missing from
    ns_map raises KeyError.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        ns, tag = pieces
        return '{%s}%s' % (ns_map[ns], tag)

    return '/'.join([expand(step) for step in path.split('/')])
162
d77c3dfd 163
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element under node matching xpath.

    If no element matches, or the match has no text, None is returned;
    with fatal set an ExtractorError is raised instead, naming the element
    by name (or by the xpath itself when name is None).
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None and found.text is not None:
        return found.text

    if not fatal:
        return None
    label = xpath if name is None else name
    raise ExtractorError('Could not find XML element %s' % label)
176
177
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An ID lookup is simply an attribute lookup on "id".
    return get_element_by_attribute(attribute='id', value=id, html=html)
181
12ea2f30 182
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # Verbose pattern: an opening tag that carries attribute=value (quoted
    # or not) among any other attributes, the lazily matched content, and
    # the closing tag of the same name.
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    m = re.search(pattern, html)
    if not m:
        return None

    content = m.group('content')
    # Content that opens with a quote loses its first and last character.
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 204
9e6dd238
FV
205
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Line breaks in the markup are dropped, <br> tags and paragraph
    boundaries become newlines, all remaining tags are removed and HTML
    entities are decoded. None is passed through unchanged, matching
    unescapeHTML.
    """
    if html is None:
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
9e6dd238
FV
217
218
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The special filename '-' selects standard output (its binary buffer
    where available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be cured by renaming
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # os.path.join takes the components as separate arguments, so the
        # sanitized parts have to be unpacked.
        # NOTE(review): the directory part is sanitized as well, so nested
        # directories in filename get their separators replaced - confirm
        # whether only the basename should be altered.
        alt_filename = os.path.join(*(
            re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
            for path_part in os.path.split(filename)
        ))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
252
253
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Not a parseable date string
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 261
5f6a1245 262
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped entirely
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace():
                return '_'
            if code > 127:
                return '_'
        return char

    result = ''.join([replace_insane(char) for char in s])
    if is_id:
        # IDs are only mapped character by character, never tidied up
        return result

    # Collapse runs of underscores and trim them from both ends
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
d77c3dfd 294
5f6a1245 295
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Membership is tested on a list (not a set), so unhashable elements
    # work too; the first occurrence of each element is kept.
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 303
912b38b4 304
4e408e47
PH
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities and decimal or
    hexadecimal numeric references are decoded; anything else (including
    numeric references outside the valid code point range) is returned in
    its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hex references need the full hex digit set (e.g. #x2F), and the whole
    # entity must be numeric, so the pattern is anchored at the end.
    mobj = re.match(
        r'#(?:[xX](?P<hex>[0-9a-fA-F]+)|(?P<dec>[0-9]+))\Z', entity)
    if mobj is not None:
        if mobj.group('hex') is not None:
            codepoint = int(mobj.group('hex'), 16)
        else:
            codepoint = int(mobj.group('dec'), 10)
        try:
            return compat_chr(codepoint)
        except (ValueError, OverflowError):
            # Not a valid code point; fall through to the literal form
            pass

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
4e408e47
PH
323
324
def unescapeHTML(s):
    """Replace every '&...;' sequence in s via _htmlentity_transform.

    None is passed through unchanged; any other input must be a compat_str.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def decode_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', decode_entity, s)
d77c3dfd 332
8bf48f23
PH
333
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Whether the result is meant for a subprocess call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up offer Unicode APIs.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = (
        sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)

    if unicode_windows and not for_subprocess:
        # Pass the unicode string directly to use the Unicode APIs
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
360
f07b74fc
PH
361
def encodeArgument(s):
    """Encode s via encodeFilename in its for-subprocess mode."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # TODO: once all post processors are fixed, turn this branch into
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
369
370
8271226a
PH
def decodeOption(optval):
    """Return optval as a compat_str, decoding bytes with the preferred encoding.

    None is returned unchanged.
    """
    if optval is None:
        return optval

    decoded = optval
    if isinstance(decoded, bytes):
        decoded = decoded.decode(preferredencoding())

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 379
5f6a1245 380
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that fits is used: durations of an hour or more get
    the hour field, durations of a minute or more get the minute field.
    """
    # The comparisons are inclusive so that exactly one hour reads
    # '1:00:00' (not '60:00') and exactly one minute '1:00' (not '60').
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
388
a0ddb8a2
PH
389
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Create an HTTPS handler suited to the running Python version.

    opts_no_check_certificate -- if true, server certificates are not verified
    **kwargs                  -- passed on to the handler constructor
    """
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        # SERVER_AUTH is the purpose for client-side sockets that
        # authenticate servers. CLIENT_AUTH is meant for server-side sockets
        # and yields a context that does not verify the peer at all.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname has to be switched off first; setting
            # CERT_NONE while it is enabled raises ValueError.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Try TLSv1 first and fall back to SSLv23 on an SSL error
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
ea6d901e 430
732ea2f0 431
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is prepended to it.
        """

        # Errors raised while one of these exceptions is being handled are
        # always treated as expected.
        expected_origins = (
            compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_origins:
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors get instructions for reporting the problem
            update_cmd = (
                'type youtube-dl -U to update'
                if ytdl_is_updateable()
                else 'see https://yt-dl.org/update on how to update')
            msg += ''.join([
                '; please report this issue on https://yt-dl.org/bug .',
                ' Make sure you are using the latest version; %s.' % update_cmd,
                ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
            ])
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if there is none."""
        if self.traceback is None:
            return None
        lines = traceback.format_tb(self.traceback)
        return ''.join(lines)
01951dda 465
1c256f70 466
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
470
471
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept so that handlers can inspect the underlying failure
        self.exc_info = exc_info
d77c3dfd
FV
484
485
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
493
494
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # The message is stored on the instance under .msg
        self.msg = msg
d77c3dfd 504
5f6a1245 505
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
509
510
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
d77c3dfd
FV
518
519
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Sizes in bytes: what was actually received ...
    downloaded = None
    # ... and what had been announced
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 534
5f6a1245 535
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a zlib-wrapped one second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response, setting .code by hand if the constructor does not take it."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to req and process the Youtubedl-* pseudo headers."""
        for h, v in std_headers.items():
            # urllib stores header names in capitalized form ('User-agent'),
            # so the lookup must use that form as well; otherwise headers
            # already set on the request would always be overwritten.
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate encoded body transparently decompressed."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with progressively more trailing bytes cut off.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets exactly the same treatment
    https_request = http_request
    https_response = http_response
bf50b038 621
5de90176 622
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # Trailing part: optional fractional seconds followed by either 'Z' or
    # a numeric UTC offset.
    tz_match = re.search(
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)

    offset = datetime.timedelta()
    if tz_match:
        # Cut the matched suffix off before parsing the date proper
        date_str = date_str[:-len(tz_match.group(0))]
        if tz_match.group('sign'):
            direction = 1 if tz_match.group('sign') == '+' else -1
            offset = datetime.timedelta(
                hours=direction * int(tz_match.group('hours')),
                minutes=direction * int(tz_match.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    utc_time = datetime.datetime.strptime(date_str, date_format) - offset
    return calendar.timegm(utc_time.timetuple())
646
647
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)

    candidate_formats = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    # Slash-separated date-times are ambiguous; day_first picks the reading
    candidate_formats.append(
        '%d/%m/%Y %H:%M:%S' if day_first else '%m/%d/%Y %H:%M:%S')

    upload_date = None
    # Every format is tried; the last one that parses determines the result
    for fmt in candidate_formats:
        try:
            upload_date = datetime.datetime.strptime(date_str, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Last resort: the RFC 2822 style parser
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
704
5f6a1245 705
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url, falling back to default_ext.

    The guess is whatever follows the last '.' before any '?' and is only
    accepted if it is purely alphanumeric.
    """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    guess = without_query.rpartition('.')[2]
    return guess if re.match(r'^[A-Za-z0-9]+$', guess) else default_ext
73e79f2a 714
5f6a1245 715
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
d4051a8e 718
5f6a1245 719
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD,
    one of now|today|yesterday, or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Raises ValueError if date_str matches none of these forms.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' is not a valid escape in an ordinary string literal
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # timedelta has no months or years, so they are approximated
        # as 30 and 365 days respectively
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
5f6a1245
JW
747
748
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        # Anything that is not eight digits is handed back untouched
        return date_str
    return '-'.join(parts.groups())
757
5f6a1245 758
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str

        A bound left as None is open: the earliest / latest representable
        date is used for it. Raises ValueError if start lies after end.
        """
        self.start = (
            date_from_str(start) if start is not None
            else datetime.datetime.min.date())
        self.end = (
            date_from_str(end) if end is not None
            else datetime.datetime.max.date())
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Strings are converted first; date objects are compared directly
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
788
789
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Decode if the platform module handed back a byte string
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
798
799
b58ddb32
PH
800def _windows_write_string(s, out):
801 """ Returns True if the string was written using special methods,
802 False if it has yet to be written out."""
803 # Adapted from http://stackoverflow.com/a/3259271/35070
804
805 import ctypes
806 import ctypes.wintypes
807
808 WIN_OUTPUT_IDS = {
809 1: -11,
810 2: -12,
811 }
812
a383a98a
PH
813 try:
814 fileno = out.fileno()
815 except AttributeError:
816 # If the output stream doesn't have a fileno, it's virtual
817 return False
b58ddb32
PH
818 if fileno not in WIN_OUTPUT_IDS:
819 return False
820
e2f89ec7 821 GetStdHandle = ctypes.WINFUNCTYPE(
b58ddb32 822 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
6ac4e806 823 (b"GetStdHandle", ctypes.windll.kernel32))
b58ddb32
PH
824 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
825
e2f89ec7 826 WriteConsoleW = ctypes.WINFUNCTYPE(
b58ddb32
PH
827 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
828 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
6ac4e806 829 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
b58ddb32
PH
830 written = ctypes.wintypes.DWORD(0)
831
6ac4e806 832 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
b58ddb32
PH
833 FILE_TYPE_CHAR = 0x0002
834 FILE_TYPE_REMOTE = 0x8000
e2f89ec7 835 GetConsoleMode = ctypes.WINFUNCTYPE(
b58ddb32
PH
836 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
837 ctypes.POINTER(ctypes.wintypes.DWORD))(
6ac4e806 838 (b"GetConsoleMode", ctypes.windll.kernel32))
b58ddb32
PH
839 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
840
841 def not_a_console(handle):
842 if handle == INVALID_HANDLE_VALUE or handle is None:
843 return True
844 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
845 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
846
847 if not_a_console(h):
848 return False
849
d1b9c912
PH
850 def next_nonbmp_pos(s):
851 try:
852 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
853 except StopIteration:
854 return len(s)
855
856 while s:
857 count = min(next_nonbmp_pos(s), 1024)
858
b58ddb32 859 ret = WriteConsoleW(
d1b9c912 860 h, s, count if count else 2, ctypes.byref(written), None)
b58ddb32
PH
861 if ret == 0:
862 raise OSError('Failed to write string')
d1b9c912
PH
863 if not count: # We just wrote a non-BMP character
864 assert written.value == 2
865 s = s[1:]
866 else:
867 assert written.value > 0
868 s = s[written.value:]
b58ddb32
PH
869 return True
870
871
def write_string(s, out=None, encoding=None):
    """Write the compat_str s to out (sys.stderr by default) and flush.

    encoding, if given, is used wherever the string has to be encoded to
    bytes.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # On Windows try the console-specific writer first
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    wants_bytes = (
        'b' in getattr(out, 'mode', '') or
        sys.version_info[0] < 3)  # Python 2 lies about mode of sys.stderr
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream on top of a binary one: encode manually and write
        # to the underlying buffer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
892
893
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the elements of the byte string bs as a list of integers."""
    if not bs:
        return []
    # Indexing yields ints on Python 3 and one-character strings on Python 2
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    return [ord(char) for char in bs]
901
c257baff 902
def intlist_to_bytes(xs):
    """Pack the integers in xs into a byte string, one unsigned byte each."""
    if xs:
        fmt = '%dB' % len(xs)
        return struct_pack(fmt, *xs)
    return b''
c38b1e77
PH
907
908
c1c9a79c
PH
909# Cross-platform file locking
910if sys.platform == 'win32':
911 import ctypes.wintypes
912 import msvcrt
913
914 class OVERLAPPED(ctypes.Structure):
915 _fields_ = [
916 ('Internal', ctypes.wintypes.LPVOID),
917 ('InternalHigh', ctypes.wintypes.LPVOID),
918 ('Offset', ctypes.wintypes.DWORD),
919 ('OffsetHigh', ctypes.wintypes.DWORD),
920 ('hEvent', ctypes.wintypes.HANDLE),
921 ]
922
923 kernel32 = ctypes.windll.kernel32
924 LockFileEx = kernel32.LockFileEx
925 LockFileEx.argtypes = [
926 ctypes.wintypes.HANDLE, # hFile
927 ctypes.wintypes.DWORD, # dwFlags
928 ctypes.wintypes.DWORD, # dwReserved
929 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
930 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
931 ctypes.POINTER(OVERLAPPED) # Overlapped
932 ]
933 LockFileEx.restype = ctypes.wintypes.BOOL
934 UnlockFileEx = kernel32.UnlockFileEx
935 UnlockFileEx.argtypes = [
936 ctypes.wintypes.HANDLE, # hFile
937 ctypes.wintypes.DWORD, # dwReserved
938 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
939 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
940 ctypes.POINTER(OVERLAPPED) # Overlapped
941 ]
942 UnlockFileEx.restype = ctypes.wintypes.BOOL
943 whole_low = 0xffffffff
944 whole_high = 0x7fffffff
945
946 def _lock_file(f, exclusive):
947 overlapped = OVERLAPPED()
948 overlapped.Offset = 0
949 overlapped.OffsetHigh = 0
950 overlapped.hEvent = 0
951 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
952 handle = msvcrt.get_osfhandle(f.fileno())
953 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
954 whole_low, whole_high, f._lock_file_overlapped_p):
955 raise OSError('Locking file failed: %r' % ctypes.FormatError())
956
957 def _unlock_file(f):
958 assert f._lock_file_overlapped_p
959 handle = msvcrt.get_osfhandle(f.fileno())
960 if not UnlockFileEx(handle, 0,
961 whole_low, whole_high, f._lock_file_overlapped_p):
962 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
963
964else:
965 import fcntl
966
967 def _lock_file(f, exclusive):
2582bebe 968 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
c1c9a79c
PH
969
970 def _unlock_file(f):
2582bebe 971 fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
972
973
class locked_file(object):
    """Context manager around a file that stays locked while it is in use.

    The file is opened with io.open() on construction. Entering the context
    takes a lock on it via _lock_file (shared for mode 'r', exclusive for
    'a' and 'w'); leaving the context unlocks and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows implementation of _lock_file raises OSError, which
            # is unrelated to IOError on Python 2; catch both so the file is
            # always closed when the lock cannot be taken.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1003
1004
4644ac55
S
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1008
1009
def shell_quote(args):
    """Return the arguments quoted for a shell and joined by single spaces.

    Byte-string arguments are decoded with the filesystem encoding first.
    Quoting goes through shlex_quote (the helper args_to_str already uses)
    rather than the undocumented pipes.quote, whose module no longer exists
    in recent Python versions.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1019
1020
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the first element for which
    pred(element) is false before stopping """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1028
1029
9d4660ca
PH
def smuggle_url(url, data):
    """ Attach extra data to a URL for internal use.

    The data is JSON-encoded, urlencoded under the '__youtubedl_smuggle' key
    and appended to the URL after a '#'. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return url + '#' + compat_urllib_parse.urlencode(payload)
9d4660ca
PH
1036
1037
def unsmuggle_url(smug_url, default=None):
    """ Split a URL into (url, data), decoding a '#__youtubedl_smuggle'
    payload if there is one.

    Without such a payload the URL is returned unchanged together with
    `default`. """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        fields = compat_parse_qs(sdata)
        return url, json.loads(fields['__youtubedl_smuggle'][0])
    return smug_url, default
02dbf93f
PH
1045
1046
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as a string with a binary unit suffix.

    `bytes` may be None (giving 'N/A'), a number, or a numeric string.
    (The parameter name shadows the builtin; it is kept so that existing
    keyword callers continue to work.)

    The value is scaled by the largest power of 1024 that fits and printed
    with two decimals, e.g. '1.50KiB'. The exponent is clamped to the
    available suffixes, so tiny positive values are reported in 'B' and
    values beyond the last unit are reported in 'YiB' instead of picking a
    wrong suffix or raising IndexError.

    Raises ValueError for negative values (math.log domain error).
    """
    if bytes is None:
        return 'N/A'
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    # float() accepts numbers as well as byte/unicode numeric strings
    value = float(bytes)
    if value == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(value, 1024.0))
    # A negative exponent would index the suffix list from the end, and a
    # too large one would run past it.
    exponent = min(max(exponent, 0), len(suffixes) - 1)
    converted = value / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
f53c966a 1059
1c088fa8 1060
be64b5b0
PH
def parse_filesize(s):
    """Parse a size such as '1.5 MiB' or '5,2GB' into a number of bytes.

    The string must start with a decimal number ('.' or ',' as decimal
    separator) followed by optional whitespace and a unit. Returns an int,
    or None when `s` is None or does not match.
    """
    if s is None:
        return None

    # Unit -> multiplier. For every prefix letter X:
    #   XiB and xB (lower-case prefix) are binary (powers of 1024),
    #   XB and Xb are decimal (powers of 1000).
    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        _UNIT_TABLE[prefix + 'iB'] = 1024 ** power
        _UNIT_TABLE[prefix + 'B'] = 1000 ** power
        _UNIT_TABLE[prefix.lower() + 'B'] = 1024 ** power
        _UNIT_TABLE[prefix + 'b'] = 1000 ** power

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re
    m = re.match(pattern, s)
    if m is None:
        return None

    number = float(m.group('num').replace(',', '.'))
    return int(number * _UNIT_TABLE[m.group('unit')])
be64b5b0
PH
1113
1114
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    A non-empty COLUMNS environment variable takes precedence; otherwise the
    second field of the output of `stty size` is used. Any failure to run or
    parse `stty` is treated as "unknown".
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only. Unlike a bare except, this lets
        # KeyboardInterrupt and SystemExit propagate.
        pass
    return None
caefb1de
PH
1129
1130
def month_by_name(name):
    """ Return the 1-based number of a month given its full English name
    (independent of the locale), or None if the name is not recognized """

    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
18258362
JMF
1141
1142
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an entity by '&amp;' in XML.

    The named entities amp, lt, gt, apos and quot as well as numeric
    character references of up to four digits are left untouched.
    """
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
e3946f98
PH
1149
1150
def setproctitle(title):
    """Try to set the process name through prctl() from libc.so.6.

    `title` must be a text string. This is best effort: nothing happens if
    the library cannot be loaded or has no prctl function.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes (instead of from a size equal
    # to their length) makes ctypes allocate one extra byte, so the string
    # handed to prctl is NUL-terminated.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1164
1165
def remove_start(s, start):
    """Return s without the prefix `start`; s is returned as is when it does
    not begin with that prefix."""
    if not s.startswith(start):
        return s
    prefix_len = len(start)
    return s[prefix_len:]
29eb5174
PH
1170
1171
2b9faf55
PH
def remove_end(s, end):
    """Return s without the suffix `end`; s is returned as is when it does
    not finish with that suffix (or when `end` is empty)."""
    # An empty suffix must be special-cased: every string ends with '' and
    # s[:-0] is the empty string, not s.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1176
1177
def url_basename(url):
    """Return the last segment of the URL's path, ignoring leading and
    trailing slashes."""
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip('/').split('/')
    return segments[-1]
aa94a6d3
PH
1181
1182
class HEADRequest(compat_urllib_request.Request):
    """A request whose get_method() always reports "HEAD"."""

    def get_method(self):
        return "HEAD"
7217e148
PH
1186
1187
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, scaled as int(v) * invscale // scale.

    When get_attr is given and v is not None, the attribute of that name is
    read from v first (missing attributes count as None). `default` is
    returned if the resulting value is None or the empty string.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1195
9572013d 1196
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted with compat_str, or `default` when v is None."""
    if v is None:
        return default
    return compat_str(v)
1199
9732d77e
PH
1200
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: ',', '.' and '+' characters
    are dropped before the conversion. None is passed through. """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1207
1208
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or `default` when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1211
1212
608d11f5
PH
def parse_duration(s):
    """Parse a duration string and return its length in seconds.

    Accepted forms include a (possibly fractional) number of minutes or
    hours ('3 min', '1.5 hours') and [[hours:]minutes:]seconds with an
    optional fractional part and optional unit words ('1:02:03', '2h 5m 7s',
    '12.5'). A leading 'T' is allowed and matching is case-insensitive.

    Returns None if `s` is None or is not recognized; otherwise an int, or
    a float when minutes/hours-only input or fractional seconds are used.
    """
    if s is None:
        return None

    match = re.match(
        r'''(?ix)T?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if match is None:
        return None

    parts = match.groupdict()
    # Stand-alone "N minutes" / "N hours" inputs may be fractional
    if parts['only_mins']:
        return float_or_none(parts['only_mins'], invscale=60)
    if parts['only_hours']:
        return float_or_none(parts['only_hours'], invscale=60 * 60)

    total = 0
    for group, seconds_per_unit in (('secs', 1), ('mins', 60), ('hours', 60 * 60)):
        if parts[group]:
            total += int(parts[group]) * seconds_per_unit
    if parts['ms']:
        # 'ms' holds the fractional part including the leading dot
        total += float(parts['ms'])
    return total
91d7d0b3
JMF
1247
1248
def prepend_extension(filename, ext):
    """Insert `ext` in front of the file's real extension,
    e.g. ('a.mp4', 'temp') -> 'a.temp.mp4'."""
    stem, real_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, real_ext)
d70ad093
PH
1252
1253
def check_executable(exe, args=[]):
    """ Check whether the given binary can be started (e.g. because it is
    somewhere in PATH); return its name if so and False otherwise.
    args can be a list of arguments for a short output (like -version) """
    cmd = [exe] + args
    try:
        proc = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1262
1263
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Run the executable with `args` and return the version found in its
    combined stdout/stderr output (see detect_exe_version),
    or False if the executable could not be started """
    cmd = [exe] + args
    try:
        proc = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out = proc.communicate()[0]
    except OSError:
        return False
    if isinstance(out, bytes):
        # The pipe delivers a byte string; the detection works on text
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1277
1278
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output.

    `output` must be a text string. `version_re` is searched in it (by
    default the word 'version' followed by a version-like token) and its
    first group is returned; `unrecognized` is returned if nothing matches.
    """
    assert isinstance(output, compat_str)
    if version_re is None:
        pattern = r'version\s+([-0-9._a-zA-Z]+)'
    else:
        pattern = version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1288
1289
class PagedList(object):
    """Base class for paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1294
9c44d242
PH
1295
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages one by one, only as far as needed.

    pagefunc(pagenum) must return an iterable with the entries of the
    (0-based) page; pagesize is the number of entries of a full page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list (end=None: all)."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            # Index range [firstid, nextfirstid) covered by this page
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            if firstid <= start < nextfirstid:
                startv = start % self._pagesize
            else:
                startv = 0

            # Offset just past the last wanted entry, if the slice ends here
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % self._pagesize) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1337
1338
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list for which the number of pages is known beforehand.

    pagefunc(pagenum) must return an iterable with the entries of the
    (0-based) page; pagecount is the number of pages and pagesize the
    number of entries per page.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list (end=None: all)."""
        res = []
        start_page = start // self._pagesize
        if end is None:
            end_page = self._pagecount
        else:
            end_page = end // self._pagesize + 1
        # Entries to drop at the beginning of the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Entries still wanted; None means "no upper limit"
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the slice
                    res.extend(page[:only_more])
                    break
            res.extend(page)
        return res
1366
1367
def uppercase_escape(s):
    """Replace every \\UXXXXXXXX escape (8 hex digits) in s by the character
    it denotes; the rest of the string is left untouched."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        # The decoder returns (decoded_text, consumed_length)
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
b53466e1 1374
d05cfe06
S
1375
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 text strings are UTF-8 encoded before being quoted
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, unicode)
    if needs_encoding:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1381
1382
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Only the path, params, query and fragment components are escaped."""
    parts = compat_urllib_parse_urlparse(url)
    path = escape_rfc3986(parts.path)
    params = escape_rfc3986(parts.params)
    query = escape_rfc3986(parts.query)
    fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1392
# struct_pack / struct_unpack: struct.pack / struct.unpack that also accept a
# text format string on interpreters whose struct module insists on bytes.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1409
1410
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file and close the file object.

    batch_fd is iterated line by line; byte-string lines are decoded as
    UTF-8 (with replacement of invalid data). A leading byte order mark and
    surrounding whitespace are removed. Empty lines and lines starting with
    '#', ';' or ']' are skipped. Returns the list of remaining URLs.
    """
    # A UTF-8 BOM shows up as '\ufeff' when the data was decoded as UTF-8
    # and as the three characters '\xef\xbb\xbf' when it was decoded with a
    # single-byte encoding. str.strip() removes neither, so both forms have
    # to be dropped explicitly.
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1425
1426
def urlencode_postdata(*args, **kargs):
    """urlencode the given arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1429
1430
0990305d
PH
# etree_iter(node): iterate over an element tree; falls back to findall()
# where Element has no iter() method.
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1435
1436
bcf89ce6
PH
def parse_xml(s):
    """Parse the XML document in the text string s and return its root
    element. Doctype declarations are ignored."""
    etree = xml.etree.ElementTree

    class TreeBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=TreeBuilder())
    data = s.encode('utf-8')
    # The parser argument is only passed on Python 2.7 and newer
    if sys.version_info >= (2, 7):
        tree = etree.XML(data, parser=parser)
    else:
        tree = etree.XML(data)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            text = node.text
            if text is not None and not isinstance(text, compat_str):
                node.text = text.decode('utf-8')
    return tree
e68301af
PH
1452
1453
a1a530b0
PH
# Age limit associated with each US rating by parse_age_limit
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """Return the age limit described by s as an int.

    s is either one or two digits optionally followed by '+' (e.g. '18+')
    or a key of US_RATINGS. Returns None for None or unrecognized input.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
146c80e2
S
1468
1469
def strip_jsonp(code):
    """Strip a JSONP wrapper 'callback(...);' (optionally followed by //
    comments) and return the payload between the parentheses; input that
    does not look like JSONP is returned unchanged."""
    jsonp_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_re, r'\1', code)
478c2c61
PH
1473
1474
e05f6939
PH
def js_to_json(code):
    """Convert a simple JavaScript object literal into JSON text.

    Single-quoted strings are turned into double-quoted ones, bare
    identifiers (other than true/false/null) are quoted, and a trailing
    comma right before a closing ']' is removed.
    """
    # Rewrites needed inside a formerly single-quoted string
    requote = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        # JSON keywords and double-quoted strings are valid as they are
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            inner = token[1:-1]
            token = re.sub(
                r"\\\\|\\'|\"", lambda mobj: requote[mobj.group(0)], inner)
        return '"%s"' % token

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', r'\1', res)
1498
1499
478c2c61
PH
def qualities(quality_ids):
    """ Return a function mapping a quality id to its position in
    quality_ids, or to -1 for ids that are not listed """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
1508
acd69589
PH
1509
# %-style template built from the 'title', 'id' and 'ext' fields
# (presumably the default output filename template - verify against callers)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1511
a020a0dc
PH
1512
def limit_length(s, length):
    """ Shorten strings longer than `length` so that they end in '...';
    shorter strings and None are returned unchanged """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    keep = length - len(ELLIPSES)
    return s[:keep] + ELLIPSES
48844745
PH
1521
1522
def version_tuple(v):
    """Split a version string on '-' and '.' and return its parts as a
    tuple of ints. Raises ValueError for non-numeric parts."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
48844745
PH
1525
1526
def is_outdated_version(version, limit, assume_new=True):
    """Return whether `version` is older than `limit`.

    When the version is empty or one of the versions cannot be parsed, the
    answer is the opposite of `assume_new`.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
732ea2f0
PH
1534
1535
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U, i.e. whether this
    module was loaded by a zipimporter or the interpreter is frozen """
    from zipimport import zipimporter

    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
7d4111ed
PH
1541
1542
1543def args_to_str(args):
1544 # Get a short string representation for a subprocess command
1545 return ' '.join(shlex_quote(a) for a in args)