]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
Merge branch 'master' of github.com:rg3/youtube-dl
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
912b38b4 6import calendar
676eb3f2 7import codecs
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96
PH
10import datetime
11import email.utils
f45c185f 12import errno
d77c3dfd 13import gzip
b7ab0590 14import itertools
03f9daab 15import io
f4bfd65f 16import json
d77c3dfd 17import locale
02dbf93f 18import math
d77c3dfd 19import os
4eb7f1d1 20import pipes
c496ca96 21import platform
d77c3dfd 22import re
13ebea79 23import ssl
c496ca96 24import socket
b53466e1 25import struct
1c088fa8 26import subprocess
d77c3dfd 27import sys
181c8655 28import tempfile
01951dda 29import traceback
bcf89ce6 30import xml.etree.ElementTree
d77c3dfd 31import zlib
d77c3dfd 32
8c25f81b
PH
33from .compat import (
34 compat_chr,
35 compat_getenv,
36 compat_html_entities,
8c25f81b
PH
37 compat_parse_qs,
38 compat_str,
39 compat_urllib_error,
40 compat_urllib_parse,
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
43 compat_urlparse,
7d4111ed 44 shlex_quote,
8c25f81b 45)
4644ac55
S
46
47
468e2e92
FV
# The compiled-pattern type has no stable public name across the supported
# Python versions, so derive it from a throwaway pattern.
compiled_regex_type = type(re.compile(''))

# Headers that YoutubeDLHandler.http_request adds to a request when the
# request does not already carry them.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 58
5f6a1245 59
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or the reported codec cannot encode a test string,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec; an unknown or broken encoding name raises here.
        'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (locale.Error, LookupError, TypeError, ...), but
        # not a bare except: KeyboardInterrupt/SystemExit must propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 73
f4bfd65f 74
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is first written to a temporary file created next to fn and
    then renamed over fn. If anything fails, the temporary file is removed
    (best effort) and the original exception is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(f).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**args)

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except:
        # Bare except on purpose: the temporary file is cleaned up for every
        # kind of failure (including interrupts) and the error is re-raised.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
127
128
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression below, so only a
        # conservative character set is accepted.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        candidates = (el for el in node.findall(xpath) if el.attrib.get(key) == val)
        return next(candidates, None)
147
d7e66d39
JMF
148# On python2.6 the xml.etree.ElementTree.Element methods don't support
149# the namespace parameter
5f6a1245
JW
150
151
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path into '{namespace}tag' using ns_map.

    Steps without a prefix are kept unchanged; an unknown prefix raises
    KeyError.
    """
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([expand(step) for step in path.split('/')])
162
d77c3dfd 163
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath below node.

    When the element is missing or has no text, return None, or raise
    ExtractorError if fatal is set (name, if given, labels the element in
    the error message).
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None and found.text is not None:
        return found.text
    if not fatal:
        return None
    label = xpath if name is None else name
    raise ExtractorError('Could not find XML element %s' % label)
176
177
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the passed HTML document"""
    return get_element_by_attribute(attribute="id", value=id, html=html)
181
12ea2f30 182
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document

    Returns None when no tag carries attribute=value; otherwise the tag's
    inner content with HTML entities unescaped.
    """

    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s*>
         (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    found = re.search(pattern, html)
    if found is None:
        return None

    content = found.group('content')
    # Content that starts with a quote has its first and last character dropped.
    if content[:1] in ('"', "'"):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 204
9e6dd238
FV
205
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Line breaks in the markup are flattened to spaces, <br> tags and
    paragraph boundaries become newlines, remaining tags are stripped and
    HTML entities are unescaped. None is passed through unchanged, matching
    unescapeHTML, so optional fields can be cleaned without a guard.
    """
    if html is None:
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
9e6dd238
FV
217
218
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it replaces
    characters that are forbidden on win32 in the file's base name and
    tries again; if that also fails (or nothing could be changed) the
    error is raised, like the standard open() function would.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be fixed by renaming.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # Only the base name is rewritten: the directory part still contains
        # path separators (and possibly a drive colon) that must survive.
        head, tail = os.path.split(filename)
        alt_tail = re.sub('[/<>:"\\|\\\\?\\*]', '#', tail)
        if alt_tail == tail:
            raise
        else:
            alt_filename = os.path.join(head, alt_tail)
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
252
253
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 261
5f6a1245 262
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped entirely.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    result = ''.join([replace_insane(char) for char in s])
    if is_id:
        # IDs are returned as mapped, without any collapsing or trimming.
        return result

    # Collapse runs of underscores and trim them from both ends.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result or '_'
d77c3dfd 294
5f6a1245 295
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list holding the first occurrence of each element, in input
    order. Elements are compared with ==, so they need not be hashable.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 303
912b38b4 304
4e408e47
PH
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities are looked up in
    compat_html_entities.name2codepoint; numeric references may be decimal
    ('#38') or hexadecimal ('#x26', '#X26'). Anything else, including a
    numeric reference outside the valid code point range, is returned as
    its literal '&...;' representation.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Anchored so that a malformed reference such as '#12ab' is not half-decoded;
    # hex references accept the full 0-9a-f digit range.
    mobj = re.match(r'#(?:[xX](?P<hex>[0-9a-fA-F]+)|(?P<dec>[0-9]+))$', entity)
    if mobj is not None:
        hexstr = mobj.group('hex')
        if hexstr is not None:
            codepoint = int(hexstr, 16)
        else:
            codepoint = int(mobj.group('dec'), 10)
        try:
            return compat_chr(codepoint)
        except (ValueError, OverflowError):
            # Not a representable code point: fall through to the literal.
            pass

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
4e408e47
PH
323
324
def unescapeHTML(s):
    """Replace every '&...;' sequence in s via _htmlentity_transform.

    None is passed through; any other input must be exactly a compat_str.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def expand(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', expand, s)
d77c3dfd 332
8bf48f23
PH
333
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Set when the name is going to be passed to a subprocess

    Returns s unchanged where unicode file names can be used directly,
    otherwise s encoded to bytes (characters that cannot be encoded are
    dropped).
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up accept unicode names through the Unicode APIs.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
360
f07b74fc
PH
361
def encodeArgument(s):
    """Encode a command-line argument via encodeFilename(..., for_subprocess=True)."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
369
370
8271226a
PH
def decodeOption(optval):
    """Return optval as a compat_str, decoding bytes with the preferred encoding.

    None is passed through unchanged.
    """
    if optval is None:
        return optval

    decoded = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval
    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 379
5f6a1245 380
4539dd30
PH
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The longer forms are only used when secs is strictly greater than
    3600 resp. 60.
    """
    if secs > 3600:
        hours = secs // 3600
        minutes = (secs % 3600) // 60
        seconds = secs % 60
        return '%d:%02d:%02d' % (hours, minutes, seconds)
    if secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    return '%d' % secs
388
a0ddb8a2
PH
389
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Create an HTTPS handler suited to the running Python version.

    opts_no_check_certificate: if true, server certificates are not verified.
    kwargs: passed through to the handler's constructor.
    """
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        # SERVER_AUTH is the purpose for client-side sockets that authenticate
        # servers; CLIENT_AUTH would build a server-side context that performs
        # no certificate or hostname verification.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname has to be switched off first, otherwise setting
            # verify_mode to CERT_NONE raises ValueError.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Try TLSv1 first and fall back to SSLv23 if that fails.
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
ea6d901e 430
732ea2f0 431
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is prefixed to it.
        """

        # Errors raised while one of these exceptions is being handled are
        # always treated as expected.
        always_expected = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in always_expected:
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors get bug-report and update instructions appended.
            if ytdl_is_updateable():
                update_cmd = 'type youtube-dl -U to update'
            else:
                update_cmd = 'see https://yt-dl.org/update on how to update'
            msg += ''.join((
                '; please report this issue on https://yt-dl.org/bug .',
                ' Make sure you are using the latest version; %s.' % update_cmd,
                ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
            ))
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if there is none."""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
01951dda 465
1c256f70 466
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected error for an unsupported URL; the URL is kept in the url attribute."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
472
473
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
477
478
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept for callers that want to report the underlying exception.
        self.exc_info = exc_info
d77c3dfd
FV
491
492
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
500
501
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Initialise the base class too, so str(e) and e.args carry the
        # message instead of being empty.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd 511
5f6a1245 512
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
516
517
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
d77c3dfd
FV
525
526
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Give the exception a readable message; without the base-class call
        # str(e) would be empty.
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 541
5f6a1245 542
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress data as a raw deflate stream, falling back to zlib-wrapped data."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying code, whether or not the class accepts it."""
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        # Constructor without a code argument: set the attribute afterwards.
        wrapped = addinfourl(stream, headers, url)
        wrapped.code = code
        return wrapped

    def http_request(self, req):
        """Add the standard headers and apply the Youtubedl-* pseudo headers."""
        for header, value in std_headers.items():
            if header not in req.headers:
                req.add_header(header, value)

        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']

        if 'Youtubedl-user-agent' in req.headers:
            # Drop any existing value first so the override is re-inserted
            # as a fresh entry.
            req.headers.pop('User-agent', None)
            req.headers['User-agent'] = req.headers.pop('Youtubedl-user-agent')

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate encoded body replaced by the decoded one."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off.
                for trim in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-trim]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets exactly the same treatment.
    https_request = http_request
    https_response = http_response
bf50b038 628
5de90176 629
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date

    delimiter is the separator between the date and the time part. A
    trailing 'Z' or '+HH:MM' / '-HHMM' style offset (optionally preceded by
    fractional seconds) is honoured; None is passed through.
    """

    if date_str is None:
        return None

    tz_match = re.search(
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)

    offset = datetime.timedelta()
    if tz_match:
        # Cut the matched suffix off before handing the rest to strptime.
        date_str = date_str[:-len(tz_match.group(0))]
        if tz_match.group('sign'):
            factor = 1 if tz_match.group('sign') == '+' else -1
            offset = datetime.timedelta(
                hours=factor * int(tz_match.group('hours')),
                minutes=factor * int(tz_match.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    utc_time = datetime.datetime.strptime(date_str, date_format) - offset
    return calendar.timegm(utc_time.timetuple())
653
654
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    day_first selects how an ambiguous 'NN/NN/YYYY HH:MM:SS' date is read
    (day/month when true, month/day otherwise). Returns None for None input
    or when no known format matches.
    """

    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    format_expressions.append(
        '%d/%m/%Y %H:%M:%S' if day_first else '%m/%d/%Y %H:%M:%S')

    upload_date = None
    # Every format is tried; if several match, the last one wins.
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Last resort: RFC 2822 style dates.
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
711
5f6a1245 712
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url, ignoring any query string.

    Returns default_ext when url is None or the text after the last dot is
    not purely alphanumeric.
    """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
73e79f2a 721
5f6a1245 722
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the last extension of filename with '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
d4051a8e 725
5f6a1245 726
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    'now', 'today' and 'yesterday' are accepted as well.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    relative = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if relative is None:
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()

    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount
    unit = relative.group('unit')
    # timedelta has no months or years; approximate them in days.
    days_per_unit = {'month': 30, 'year': 365}
    if unit in days_per_unit:
        amount *= days_per_unit[unit]
        unit = 'day'
    return today + datetime.timedelta(**{unit + 's': amount})
5f6a1245
JW
754
755
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged.
    """
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
764
5f6a1245 765
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str

        A missing bound defaults to the earliest resp. latest representable
        date. Raises ValueError if start is after end.
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed like the bounds.
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
795
796
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
805
806
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps file descriptors 1 and 2 to the identifiers passed to GetStdHandle.
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        is_char_device = (GetFileType(handle) & ~FILE_TYPE_REMOTE) == FILE_TYPE_CHAR
        return (not is_char_device
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(text):
        # Index of the first character above U+FFFF, or len(text) if none.
        for index, char in enumerate(text):
            if ord(char) > 0xffff:
                return index
        return len(text)

    while s:
        # Write at most 1024 characters at a time, stopping before any
        # non-BMP character; such a character is then written on its own
        # with a length of 2.
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
877
878
def write_string(s, out=None, encoding=None):
    """Write the compat_str s to out (default: sys.stderr) and flush it.

    encoding, if given, is used wherever the text has to be encoded to
    bytes before writing.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream on top of a binary buffer: encode and write to the buffer.
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
899
900
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the byte values of bs as a list of ints."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    # Indexing yields one-character strings here; convert each one.
    return list(map(ord, bs))
908
c257baff 909
def intlist_to_bytes(xs):
    """Pack the integers in xs into a byte string, one unsigned byte each."""
    if xs:
        return struct_pack('%dB' % len(xs), *xs)
    return b''
c38b1e77
PH
914
915
c1c9a79c
PH
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx through ctypes on win32, fcntl.flock
# everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes layout of the structure passed as the last argument of
    # LockFileEx / UnlockFileEx
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high DWORDs of the byte-range length handed to both calls
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the file object f with LockFileEx.

        Raises OSError when the call reports failure.
        """
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can pass the same
        # structure again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags: 0x2 when an exclusive lock is requested, 0x0 otherwise
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release a lock previously taken on f by _lock_file.

        Raises OSError when the call reports failure.
        """
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock f with flock: LOCK_EX when exclusive, else LOCK_SH."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock lock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
979
980
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened in __init__ with io.open; entering the context locks
    it (shared for mode 'r', exclusive for 'a' and 'w') and leaving the
    context unlocks and closes it.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is distinct from
            # IOError on Python 2; close the file in both cases so that a
            # failed lock does not leak the open handle.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Always close the file, even when unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1010
1011
4644ac55
S
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1015
1016
def shell_quote(args):
    """Return args as a single shell-quoted, space-separated string.

    Byte-string arguments are decoded with the filesystem encoding first.
    Quoting goes through shlex_quote (the helper this module already imports
    from .compat and uses in args_to_str) instead of the deprecated
    pipes.quote.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1026
1027
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """Yield items of seq up to and including the first one failing pred.

    Like itertools.takewhile, except that the element that stops the
    iteration is yielded as well.
    """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1035
1036
9d4660ca
PH
def smuggle_url(url, data):
    """Append data, JSON-encoded, to url as a '#' fragment for internal use."""
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return url + '#' + compat_urllib_parse.urlencode(payload)
9d4660ca
PH
1043
1044
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    When the URL carries no smuggled data, (smug_url, default) is returned.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
        return url, json.loads(jsond)
    return smug_url, default
02dbf93f
PH
1052
1053
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as a string with a binary unit suffix.

    bytes may be None (yields 'N/A'), a number, or a numeric string.
    The unit is clamped to the range B..YiB, so values below one byte are
    reported in B and values beyond the largest suffix are reported in YiB
    instead of indexing outside the suffix list. Negative values keep
    their sign.
    """
    if bytes is None:
        return 'N/A'
    # float() accepts numbers as well as byte/text strings on Python 2 and 3
    size = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if size == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(abs(size), 1024.0))
        # A negative exponent (0 < size < 1) would index the list from the
        # end; a too large one would run off it.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = size / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
f53c966a 1066
1c088fa8 1067
be64b5b0
PH
def parse_filesize(s):
    """Parse a size such as '5 KiB' or '1,5MB' at the start of s into bytes.

    Returns None when s is None or does not start with a number followed
    by a known unit. A comma in the number is treated as a decimal point.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
    }
    for power, prefix in enumerate('KMGTPEZY', 1):
        _UNIT_TABLE[prefix + 'iB'] = 1024 ** power
        _UNIT_TABLE[prefix + 'B'] = 1000 ** power
        _UNIT_TABLE[prefix.lower() + 'B'] = 1024 ** power
        _UNIT_TABLE[prefix + 'b'] = 1000 ** power

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if m is None:
        return None

    number = float(m.group('num').replace(',', '.'))
    return int(number * _UNIT_TABLE[m.group('unit')])
be64b5b0
PH
1120
1121
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable takes precedence; otherwise the width
    is read from the output of `stty size`.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort: a missing stty or unparsable output means "unknown".
        # Catching Exception rather than everything lets KeyboardInterrupt
        # and SystemExit propagate.
        return None
caefb1de
PH
1136
1137
def month_by_name(name):
    """Return the 1-based number of the month with the given English name.

    The lookup is locale-independent and case-sensitive; None is returned
    for anything that is not a full English month name.
    """
    ENGLISH_NAMES = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December')
    if name in ENGLISH_NAMES:
        return ENGLISH_NAMES.index(name) + 1
    return None
18258362
JMF
1148
1149
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an entity by '&amp;'.

    The five predefined XML entities and numeric character references of
    up to four digits are left untouched.
    """
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
e3946f98
PH
1156
1157
def setproctitle(title):
    """Set the process name to title via prctl, where glibc is available.

    Does nothing when libc.so.6 cannot be loaded or has no prctl symbol.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Initialising the buffer from the bytes allocates len + 1 bytes, so the
    # string handed to prctl is always NUL-terminated. Sizing the buffer to
    # exactly len(title_bytes) left no room for the terminator.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1171
1172
def remove_start(s, start):
    """Return s without the leading start, if s begins with it."""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1177
1178
2b9faf55
PH
def remove_end(s, end):
    """Return s without the trailing end, if s finishes with it.

    An empty end leaves s unchanged: every string ends with '', and
    s[:-0] would otherwise evaluate to the empty string.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1183
1184
def url_basename(url):
    """Return the last non-empty path segment of url."""
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
aa94a6d3
PH
1188
1189
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
1193
1194
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, or return default for None and ''.

    When get_attr is given, the attribute of that name is read from v first
    (a missing attribute counts as None). The integer is multiplied by
    invscale and then floor-divided by scale.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '' or v is None:
        return default
    return int(v) * invscale // scale
1202
9572013d 1203
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1206
9732d77e
PH
1207
def str_to_int(int_str):
    """Parse an int from a string, ignoring ',', '.' and '+' characters.

    A more relaxed version of int_or_none; None is passed through.
    """
    if int_str is None:
        return None
    return int(re.sub(r'[,\.\+]', '', int_str))
608d11f5
PH
1214
1215
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1218
1219
608d11f5
PH
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Accepts forms such as '1:02:03', '2m 10s', '3.5', '5 mins' or
    '1.5 hours' (case-insensitive, optional leading 'T'). Returns None for
    None or for text that does not match.
    """
    if s is None:
        return None

    m = re.match(
        r'''(?ix)T?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if not m:
        return None

    parts = m.groupdict()
    # A lone "N minutes" / "N hours" value may be fractional
    if parts['only_mins']:
        return float_or_none(parts['only_mins'], invscale=60)
    if parts['only_hours']:
        return float_or_none(parts['only_hours'], invscale=60 * 60)

    res = 0
    for group, factor in (('secs', 1), ('mins', 60), ('hours', 60 * 60)):
        if parts[group]:
            res += int(parts[group]) * factor
    if parts['ms']:
        # The 'ms' group holds the fractional part including its dot
        res += float(parts['ms'])
    return res
91d7d0b3
JMF
1254
1255
def prepend_extension(filename, ext):
    """Insert ext before the real extension of filename (a.mp4 -> a.ext.mp4)."""
    stem, real_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, real_ext)
d70ad093
PH
1259
1260
def check_executable(exe, args=[]):
    """Return exe if running it works, or False if it cannot be started.

    args can be a list of arguments that make the binary produce a short
    output (like -version).
    """
    command = [exe] + args
    try:
        proc = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1269
1270
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """Run exe with args and return the version found in its output.

    Returns False when the executable cannot be started. stderr is merged
    into stdout, and the combined output is handed to detect_exe_version.
    """
    try:
        proc = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out = proc.communicate()[0]
    except OSError:
        return False
    if isinstance(out, bytes):
        # Drop anything that is not plain ASCII
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1284
1285
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of a program.

    version_re must contain one group capturing the version; it defaults
    to matching 'version <something>'. When nothing matches, the value of
    unrecognized is returned.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    m = re.search(pattern, output)
    if not m:
        return unrecognized
    return m.group(1)
1295
1296
class PagedList(object):
    """Base class for paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests: it fetches every item
        all_items = self.getslice()
        return len(all_items)
1301
9c44d242
PH
1302
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages one by one until a short page is seen.

    pagefunc(pagenum) must return an iterable with the items of the given
    0-based page; pagesize is the number of items on a full page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the items with indices in [start, end) as a list.

        With end=None, pages are fetched until one is not full.
        """
        res = []
        # Start at the page containing index `start` and keep going until
        # one of the break conditions below is met
        for pagenum in itertools.count(start // self._pagesize):
            # Global index of the first item of this page / of the next page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            # Skip pages that end before the requested start
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted item inside this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Exclusive end offset inside this page, when the slice ends
            # here; None means "up to the end of the page"
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1344
1345
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list whose total number of pages is known up front.

    pagefunc(pagenum) must return an iterable with the items of the given
    0-based page.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the items with indices in [start, end) as a list."""
        first_page = start // self._pagesize
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start
        # Items to drop from the front of the first fetched page
        to_skip = start - first_page * self._pagesize

        collected = []
        for pagenum in range(first_page, stop_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested slice
                    collected.extend(page[:remaining])
                    break
                remaining -= len(page)
            collected.extend(page)
        return collected
1373
1374
def uppercase_escape(s):
    """Replace every \\UXXXXXXXX escape sequence in s by its character."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def _expand(m):
        return decode_escape(m.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
b53466e1 1381
d05cfe06
S
1382
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    is_py2_text = sys.version_info < (3, 0) and isinstance(s, unicode)
    if is_py2_text:
        # Python 2's quote() wants a UTF-8 byte string
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1388
1389
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # Scheme and netloc are left as they are
    path = escape_rfc3986(parts.path)
    params = escape_rfc3986(parts.params)
    query = escape_rfc3986(parts.query)
    fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1399
# Define struct_pack / struct_unpack so that callers can always pass a text
# format string. The probe below uses a text literal (this module enables
# unicode_literals) to find out whether struct accepts it directly.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        # Encode a text format spec to bytes before delegating
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        # Encode a text format spec to bytes before delegating
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # struct handles text specs itself; use it unwrapped
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1416
1417
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file and close it.

    batch_fd is an iterable of lines (bytes or text) with a close() method.
    Byte lines are decoded as UTF-8. Returns the stripped lines as a list,
    leaving out empty lines and lines starting with '#', ';' or ']'.
    """
    # A UTF-8 BOM shows up as U+FEFF once the data has been decoded as
    # UTF-8, and as the three characters below when the bytes were decoded
    # with a single-byte encoding. Strip either form.
    BOMS = ('\ufeff', '\xef\xbb\xbf')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1432
1433
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode and return ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1436
1437
0990305d
PH
# etree_iter(node) iterates over an element tree; Element.iter is used
# where it exists.
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1442
1443
bcf89ce6
PH
def parse_xml(s):
    """Parse the XML text s into an ElementTree element, ignoring doctypes.

    On Python 2, byte-string text nodes are decoded as UTF-8.
    """
    class _DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=_DoctypeIgnoringBuilder())
    # The parser keyword is only passed on Python >= 2.7
    extra = {}
    if sys.version_info >= (2, 7):
        extra = {'parser': parser}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **extra)

    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return tree
e68301af
PH
1459
1460
a1a530b0
PH
# Age limits used for US rating labels
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """Parse an age limit such as '18' or '16+' into an int.

    Strings that are not a one- or two-digit number are looked up in
    US_RATINGS; None is returned for None and for unknown labels.
    """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
146c80e2
S
1475
1476
def strip_jsonp(code):
    """Return the payload of a JSONP call such as 'callback({...});'.

    Text that is not wrapped in a function call is returned unchanged.
    """
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_wrapper, r'\1', code)
478c2c61
PH
1480
1481
e05f6939
PH
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    Single-quoted strings and bare identifiers are turned into
    double-quoted strings (true, false and null are kept), and a comma
    directly before a closing bracket ']' is removed.
    """
    # How escapes inside a single-quoted string map to a double-quoted one
    requoted = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null') or v.startswith('"'):
            return v
        if v.startswith("'"):
            v = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: requoted[esc.group(0)],
                v[1:-1])
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), res)
1505
1506
478c2c61
PH
def qualities(quality_ids):
    """Return a function mapping a quality id to its rank in quality_ids.

    The returned function yields the index of the id within quality_ids,
    or -1 for ids that are not listed.
    """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1515
acd69589
PH
1516
# Default output template: a %-style format string with the named fields
# title, id and ext (presumably used for output file names; confirm at the
# call sites).
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1518
a020a0dc
PH
1519
def limit_length(s, length):
    """Shorten s to at most length characters, ending it with '...'.

    None and strings that already fit are returned unchanged.
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
1528
1529
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
48844745
PH
1532
1533
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    When version is empty or one of the two cannot be parsed, the answer
    is "not outdated" if assume_new is true and "outdated" otherwise.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
732ea2f0
PH
1541
1542
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when this module was loaded by a zipimporter or when sys has a
    # 'frozen' attribute
    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
7d4111ed
PH
1548
1549
def args_to_str(args):
    """Return a short, shell-quoted string for a subprocess command list."""
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)