]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
release 2014.12.11
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
912b38b4 6import calendar
676eb3f2 7import codecs
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96
PH
10import datetime
11import email.utils
f45c185f 12import errno
d77c3dfd 13import gzip
b7ab0590 14import itertools
03f9daab 15import io
f4bfd65f 16import json
d77c3dfd 17import locale
02dbf93f 18import math
d77c3dfd 19import os
4eb7f1d1 20import pipes
c496ca96 21import platform
d77c3dfd 22import re
13ebea79 23import ssl
c496ca96 24import socket
b53466e1 25import struct
1c088fa8 26import subprocess
d77c3dfd 27import sys
181c8655 28import tempfile
01951dda 29import traceback
bcf89ce6 30import xml.etree.ElementTree
d77c3dfd 31import zlib
d77c3dfd 32
8c25f81b
PH
33from .compat import (
34 compat_chr,
35 compat_getenv,
36 compat_html_entities,
8c25f81b
PH
37 compat_parse_qs,
38 compat_str,
39 compat_urllib_error,
40 compat_urllib_parse,
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
43 compat_urlparse,
7d4111ed 44 shlex_quote,
8c25f81b 45)
4644ac55
S
46
47
468e2e92
FV
# The re module does not export a public name for its compiled-pattern
# type, so derive it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Default HTTP headers; YoutubeDLHandler adds them to outgoing requests.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 58
5f6a1245 59
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or the reported encoding cannot actually encode text,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported codec really exists and is usable
        'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (locale.Error, LookupError, TypeError, ...),
        # but unlike a bare except this lets KeyboardInterrupt and
        # SystemExit propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 73
f4bfd65f 74
def write_json_file(obj, fn):
    """ Serialize obj as JSON into the file fn, atomically if possible.

    The data is first written to a temporary file next to fn, which is then
    renamed onto fn. If anything fails, the temporary file is removed and
    the exception is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        fs_encoding = get_filesystem_encoding()

        # os.path.basename / os.path.dirname return bytes objects here, but
        # NamedTemporaryFile fails on non-ASCII names unless it is handed
        # unicode objects, so decode their results.
        def path_basename(f):
            return os.path.basename(f).decode(fs_encoding)

        def path_dirname(f):
            return os.path.dirname(f).decode(fs_encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    tmp_kwargs = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # Python 2.x json.dump wants a byte stream, Python 3.x a text stream
    if sys.version_info < (3, 0):
        tmp_kwargs['mode'] = 'wb'
    else:
        tmp_kwargs['mode'] = 'w'
        tmp_kwargs['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**tmp_kwargs)

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # os.rename refuses to overwrite on Windows (WindowsError or
            # FileExistsError), so drop any existing target first.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except:
        # Best-effort cleanup of the temporary file, then propagate
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
127
128
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath[@key=val], or None """
        # key and val are interpolated into the expression, so only a
        # conservative character set is accepted.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        return node.find(xpath + "[@%s='%s']" % (key, val))
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath[@key=val], or None """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        candidates = (el for el in node.findall(xpath)
                      if el.attrib.get(key) == val)
        return next(candidates, None)
147
d7e66d39
JMF
148# On python2.6 the xml.etree.ElementTree.Element methods don't support
149# the namespace parameter
5f6a1245
JW
150
151
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path to '{uri}tag' using ns_map.

    Steps without a prefix are kept unchanged. An unknown prefix raises
    KeyError.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([expand(step) for step in path.split('/')])
162
d77c3dfd 163
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element under node matching xpath.

    If nothing matches, return None, or raise ExtractorError when fatal is
    set (name, defaulting to the xpath, is used in the error message).
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None:
        return found.text
    if not fatal:
        return None
    name = xpath if name is None else name
    raise ExtractorError('Could not find XML element %s' % name)
176
177
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the given HTML document"""
    return get_element_by_attribute('id', id, html)
181
12ea2f30 182
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag carrying attribute=value in the given HTML document.

    Returns None when no such tag is found; otherwise the tag's inner text
    with HTML entities unescaped.
    """

    found = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if found is None:
        return None

    content = found.group('content')
    # Drop one surrounding character on each side if the content opens
    # with a quote
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 204
9e6dd238
FV
205
def clean_html(html):
    """Turn an HTML snippet into a readable plain-text string"""
    # Literal newlines carry no meaning in HTML; <br> and paragraph
    # boundaries become the real line breaks.
    text = html.replace('\n', ' ')
    for pattern in (r'\s*<\s*br\s*/?\s*>\s*', r'<\s*/\s*p\s*>\s*<\s*p[^>]*>'):
        text = re.sub(pattern, '\n', text)
    # Remove every remaining tag
    text = re.sub('<.*?>', '', text)
    # Decode HTML entities and trim surrounding whitespace
    return unescapeHTML(text).strip()
9e6dd238
FV
217
218
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails (for any reason
    other than EACCES), the characters that are forbidden in win32 file
    names are replaced with '#' in the last path component and the open is
    retried once with that name. If the name needed no change, or the retry
    fails too, the exception propagates like with the standard open()
    function.

    The special filename '-' returns the (binary, where available) standard
    output stream.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars. Only the
        # final component is rewritten so that directory separators (and a
        # drive prefix) in the leading part stay intact.
        head, tail = os.path.split(filename)
        alt_tail = re.sub('[/<>:"\\|\\\\?\\*]', '#', tail)
        if alt_tail == tail:
            # Nothing to sanitize: re-raise the original error
            raise
        alt_filename = os.path.join(head, alt_tail)
        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
d77c3dfd
FV
252
253
def timeconvert(timestr):
    """Convert an RFC 2822 time string into a system timestamp (None if unparsable)"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 261
5f6a1245 262
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.

    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be
    kept if possible (no underscore collapsing / stripping is done then).
    """
    def _clean(ch):
        code = ord(ch)
        if ch == '?' or code < 32 or code == 127:
            return ''
        if ch == '"':
            return '' if restricted else '\''
        if ch == ':':
            return '_-' if restricted else ' -'
        if ch in '\\/|*<>':
            return '_'
        if restricted and (
                ch in '!&\'()[]{}$;`^,#' or ch.isspace() or code > 127):
            return '_'
        return ch

    sanitized = ''.join(_clean(ch) for ch in s)
    if not is_id:
        # Collapse runs of underscores and drop them from both ends
        sanitized = re.sub(r'_{2,}', '_', sanitized).strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and sanitized.startswith('-_'):
            sanitized = sanitized[2:]
        if not sanitized:
            sanitized = '_'
    return sanitized
d77c3dfd 294
5f6a1245 295
def orderedSet(iterable):
    """ Return a list of the items of iterable with duplicates removed, keeping first occurrences in order """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 303
912b38b4 304
4e408e47
PH
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities are looked up in
    compat_html_entities.name2codepoint; numeric references may be decimal
    ('#38') or hexadecimal ('#x26', '#X2f'). Anything that cannot be
    decoded is returned in its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # The whole entity has to be a numeric reference; hex references accept
    # the digits a-f in either case.
    mobj = re.match(r'#([xX][0-9a-fA-F]+|[0-9]+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in 'xX':
            base = 16
            numstr = numstr[1:]
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside the valid range: keep the literal text
            pass

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
4e408e47
PH
323
324
def unescapeHTML(s):
    """Replace every '&...;' entity in s with its character; None is passed through."""
    if s is None:
        return None
    assert type(s) == compat_str

    def _decode(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', _decode, s)
d77c3dfd 332
8bf48f23
PH
333
def encodeFilename(s, for_subprocess=False):
    """
    Encode a file name for use with the file system or subprocess APIs.

    @param s The name of the file
    @param for_subprocess Whether the result is meant for a subprocess call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up: '' strings can be passed directly to use the
    # Unicode APIs. (Detecting Windows NT 4 is tricky because 'major >= 4'
    # would match the Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
360
f07b74fc
PH
361
def encodeArgument(s):
    """Encode a command-line argument for a subprocess call."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
369
370
8271226a
PH
def decodeOption(optval):
    """Return the option value as text, decoding bytes with the preferred encoding; None is passed through."""
    if optval is None:
        return None
    text = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval

    assert isinstance(text, compat_str)
    return text
1c256f70 379
5f6a1245 380
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'S', 'M:SS' or 'H:MM:SS'.

    Durations of exactly one minute / one hour switch to the longer form
    ('1:00', '1:00:00') instead of being shown as '60' / '60:00'.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
388
a0ddb8a2
PH
389
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build an HTTPS handler suited to the running Python version.

    opts_no_check_certificate: if true, server certificates are not
        verified (on the Python versions where an SSL context is used).
    kwargs: passed on to the handler constructor.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Try TLSv1 first and fall back to SSLv23 if that fails
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
        # We are the client authenticating a server, which is what
        # Purpose.SERVER_AUTH selects; Purpose.CLIENT_AUTH would create a
        # server-side context that does not verify the peer at all.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
        if opts_no_check_certificate:
            # check_hostname has to be switched off first; setting
            # CERT_NONE while it is enabled raises ValueError.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
ea6d901e 429
732ea2f0 430
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is prepended to it.
        """

        # Errors raised while one of these exceptions is being handled are
        # always treated as expected.
        expected_types = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_types:
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            update_cmd = (
                'type youtube-dl -U to update'
                if ytdl_is_updateable()
                else 'see https://yt-dl.org/update on how to update')
            msg += ''.join([
                '; please report this issue on https://yt-dl.org/bug .',
                ' Make sure you are using the latest version; %s.' % update_cmd,
                ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
            ])
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback as a string, or None if there is none."""
        tb = self.traceback
        return None if tb is None else ''.join(traceback.format_tb(tb))
01951dda 464
1c256f70 465
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression did not match"""
469
470
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
483
484
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
492
493
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to signal an
    error in the postprocessing task.
    """

    def __init__(self, msg):
        # The message is kept on the instance as .msg
        self.msg = msg
d77c3dfd 503
5f6a1245 504
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
d77c3dfd
FV
508
509
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
517
518
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded file
    is smaller than what the server announced first, which indicates that
    the connection was probably interrupted.
    """
    # Class-level defaults; both values are in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 533
5f6a1245 534
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request. Likewise, a
    "Youtubedl-User-Agent" header replaces the User-Agent and is removed.

    Headers already set on a request take precedence over the standard
    headers.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a zlib-wrapped one second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Create an addinfourl response carrying a status code, with or without constructor support for it."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers and process the Youtubedl-* pseudo headers of req."""
        for h, v in std_headers.items():
            # Request.add_header stores names capitalized ('User-agent'),
            # see http://bugs.python.org/issue2275, so the lookup has to
            # use the same form or headers set by the caller would be
            # overwritten.
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate Content-encoding decoded."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with up to 1023 trailing bytes cut off
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same treatment
    https_request = http_request
    https_response = http_response
bf50b038 620
5de90176 621
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date.

    delimiter is the separator between the date and the time part. A
    trailing 'Z' or '+HH:MM' / '-HHMM' style offset (optionally preceded by
    fractional seconds) is honoured; None is passed through.
    """

    if date_str is None:
        return None

    offset = datetime.timedelta()
    tz_match = re.search(
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if tz_match:
        # Cut the fraction / timezone suffix off before parsing
        date_str = date_str[:-len(tz_match.group(0))]
        if tz_match.group('sign'):
            factor = 1 if tz_match.group('sign') == '+' else -1
            offset = datetime.timedelta(
                hours=factor * int(tz_match.group('hours')),
                minutes=factor * int(tz_match.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    utc_time = datetime.datetime.strptime(date_str, date_format) - offset
    return calendar.timegm(utc_time.timetuple())
645
646
bf50b038
JMF
def unified_strdate(date_str):
    """Return the date as a string in the format YYYYMMDD, or None if it cannot be parsed"""

    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    cleaned = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', cleaned)

    known_formats = (
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%Y/%m/%d %H:%M:%S',
        '%d/%m/%Y %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    )

    upload_date = None
    # Every format is tried; the last one that parses determines the result
    for fmt in known_formats:
        try:
            upload_date = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Fall back to the RFC 2822 parser
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
694
5f6a1245 695
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url.

    The guess is whatever follows the last '.' before the query string; it
    is returned only if purely alphanumeric, otherwise (or for a None url)
    default_ext is returned.
    """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
73e79f2a 704
5f6a1245 705
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
d4051a8e 708
5f6a1245 709
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD,
    one of the words now, today or yesterday, or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError if the string matches none of these formats."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' inside a normal literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword names (days, weeks)
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
5f6a1245
JW
737
738
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format;
    any other string is returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
747
5f6a1245 748
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates (both inclusive)"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str.

        A missing start / end leaves the range open on that side. Raises
        ValueError if start lies after end.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a datetime.date or a date string) is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date and date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
778
779
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
788
789
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the Windows WriteConsoleW API. It is only used
    when out has a file descriptor of 1 or 2 whose standard handle refers
    to a console; in every other case nothing is written and False is
    returned. Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps file descriptors to the ids passed to GetStdHandle
    # (presumably STD_OUTPUT_HANDLE and STD_ERROR_HANDLE - see the Win32
    # GetStdHandle documentation)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    # NOTE(review): GetFileType is declared with a DWORD argument but is
    # called with a handle below - confirm this is safe on 64-bit Windows
    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A handle counts as a console only if it is valid, is a character
        # device (ignoring the remote flag) and GetConsoleMode succeeds
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters that stop before the next
    # non-BMP character; such a character is written on its own with a
    # length of 2.
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
860
861
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    If encoding is not given, a suitable encoding is chosen for streams
    that take bytes.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # On Windows try the console API first, unless an encoding was requested
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    # Python 2 lies about mode of sys.stderr, so always write bytes there
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream on top of a binary buffer: encode ourselves
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
882
883
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    # Indexing bytes gives ints on Python 3, one-character strings on Python 2
    return list(bs) if isinstance(bs[0], int) else [ord(c) for c in bs]
891
c257baff 892
def intlist_to_bytes(xs):
    """Pack a sequence of integer byte values into a byte string."""
    if xs:
        return struct_pack('%dB' % len(xs), *xs)
    return b''
c38b1e77
PH
897
898
c1c9a79c
PH
899# Cross-platform file locking
if sys.platform == 'win32':
    # Windows: lock through the kernel32 LockFileEx / UnlockFileEx API
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure handed to LockFileEx / UnlockFileEx; Offset/OffsetHigh
        # are set to 0 by _lock_file below
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low / high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file passes the same
        # structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # Flag 0x2 is presumably LOCKFILE_EXCLUSIVE_LOCK - see the Win32
        # LockFileEx documentation
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file on f; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Everywhere else: flock() based locking
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared flock() lock on f."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock() lock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
962
963
class locked_file(object):
    """Context manager giving access to a file while holding a lock on it.

    The file is opened in the constructor; the lock is taken on entering
    the ``with`` block (shared for mode 'r', exclusive for 'a' and 'w')
    and released, together with the file, on leaving it.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is not a subclass
            # of IOError on Python 2; catch both so that the file never
            # stays open after a failed lock attempt.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Always close the file, even if unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
993
994
4644ac55
S
def get_filesystem_encoding():
    """Return the filesystem encoding, or 'utf-8' if Python reports none."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
998
999
def shell_quote(args):
    """Return args as a single shell-escaped command line string.

    Byte-string arguments (e.g. filenames produced by 'encodeFilename')
    are first decoded with the filesystem encoding.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # Use the same quoting helper as args_to_str instead of the
        # undocumented (and deprecated) pipes.quote
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1009
1010
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also emit the first element e for
    which pred(e) is false before stopping """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1018
1019
9d4660ca
PH
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-serialized and appended to url as a URL-encoded
    '__youtubedl_smuggle' fragment. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + '#' + fragment
9d4660ca
PH
1026
1027
def unsmuggle_url(smug_url, default=None):
    """ Split a URL into (url, smuggled data).

    Returns (smug_url, default) when the URL carries no
    '#__youtubedl_smuggle' fragment. """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, fragment = smug_url.rpartition('#')
        smuggled = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return url, json.loads(smuggled)
    return smug_url, default
02dbf93f
PH
1035
1036
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as a string with a binary (1024-based) suffix.

    bytes may be a number, a numeric string, or None (which gives 'N/A').
    The result has two decimals, e.g. '1.50KiB'. Negative values are not
    supported (math.log raises ValueError for them).
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the known suffixes: values below 1 byte would otherwise
        # give a negative index (wrapping around to 'YiB') and values of
        # 1024 YiB or more would raise IndexError.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
f53c966a 1049
1c088fa8 1050
be64b5b0
PH
def parse_filesize(s):
    """Parse a human-readable file size such as '1.5 MiB' into bytes.

    Returns an int, or None if s is None or does not start with a number
    followed by a known unit. ',' is accepted as decimal separator.
    """
    if s is None:
        return None

    # 'XiB' and the lower-case-prefix 'xB' forms are 1024-based, 'XB' and
    # 'Xb' are 1000-based.
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary = 1024 ** power
        decimal = 1000 ** power
        unit_table[prefix + 'iB'] = binary
        unit_table[prefix + 'B'] = decimal
        unit_table[prefix.lower() + 'B'] = binary
        unit_table[prefix + 'b'] = decimal

    units_re = '|'.join(re.escape(unit) for unit in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if m is None:
        return None

    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
be64b5b0
PH
1103
1104
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable takes precedence; otherwise the
    output of 'stty size' is used (best effort).
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Malformed COLUMNS value; fall back to asking stty

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only: stty may be missing (OSError) or print nothing
        # usable (IndexError/ValueError). Unlike a bare except, this lets
        # KeyboardInterrupt and SystemExit propagate.
        return None
caefb1de
PH
1119
1120
def month_by_name(name):
    """ Return the number (1-12) of a month given its English name,
    independently of the locale; None if the name is unknown """

    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    for number, month in enumerate(ENGLISH_NAMES, 1):
        if month == name:
            return number
    return None
18258362
JMF
1131
1132
def fix_xml_ampersands(xml_str):
    """Replace bare '&' characters in XML by '&amp;'.

    Ampersands that already start one of the predefined entities or a
    numeric character reference are left untouched.
    """
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
e3946f98
PH
1139
1140
def setproctitle(title):
    """Set the name of the current process via libc's prctl (best effort).

    title must be a text string. Silently does nothing if libc.so.6
    cannot be loaded or does not provide prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Initialise the buffer from the bytes rather than from a size: ctypes
    # then allocates one extra byte and NUL-terminates the string. A buffer
    # of exactly len(title_bytes) bytes holds no terminator, so prctl could
    # read past its end.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1154
1155
def remove_start(s, start):
    """Return s without its leading start, if it begins with it."""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1160
1161
2b9faf55
PH
def remove_end(s, end):
    """Return s without its trailing end, if it finishes with it."""
    if s.endswith(end):
        # Not s[:-len(end)]: for an empty end that would be s[:0], i.e. ''
        return s[:len(s) - len(end)]
    return s
1166
1167
def url_basename(url):
    """Return the last component of the path of url, ignoring any
    leading or trailing slashes of the path."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
aa94a6d3
PH
1171
1172
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always reported as HEAD."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
1176
1177
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, or return default if there is no value.

    If get_attr is given, the value is read from that attribute of v
    (a missing attribute counts as no value). None and '' count as no
    value. The result is int(v) * invscale // scale.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1185
9572013d 1186
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default if v is None."""
    if v is None:
        return default
    return compat_str(v)
1189
9732d77e
PH
1190
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: ',', '.' and '+'
    characters are dropped before conversion """
    if int_str is not None:
        digits = re.sub(r'[,\.\+]', '', int_str)
        return int(digits)
    return None
608d11f5
PH
1197
1198
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default if v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1201
1202
608d11f5
PH
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Accepts forms such as '1:02:03', '3h11m53s', '12.5s', '3 min' or
    '2 hours' (optionally prefixed with 'T'). Returns None if s is None
    or is not recognized. The result is a float when fractional seconds,
    or only minutes / only hours, are given, and an int otherwise.
    """
    if s is None:
        return None

    m = re.match(
        r'''(?ix)T?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if not m:
        return None

    # A lone (possibly fractional) count of minutes or hours
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    total = 0
    for group, unit_seconds in (('secs', 1), ('mins', 60), ('hours', 60 * 60)):
        if m.group(group):
            total += int(m.group(group)) * unit_seconds
    if m.group('ms'):
        # The group includes the leading dot, e.g. '.05'
        total += float(m.group('ms'))
    return total
91d7d0b3
JMF
1237
1238
def prepend_extension(filename, ext):
    """Insert ext before the real extension: ('a.mp4', 'temp') -> 'a.temp.mp4'."""
    stem, real_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, real_ext)
d70ad093
PH
1242
1243
def check_executable(exe, args=[]):
    """ Checks if the given binary can be run (e.g. is installed somewhere
    in PATH); returns its name if so and False otherwise.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1252
1253
95807118
PH
def get_exe_version(exe, args=['--version'],
                    version_re=r'version\s+([-0-9._a-zA-Z]+)',
                    unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.

    exe is run with args and version_re is searched in the first line of
    its combined stdout/stderr output; group 1 of the match is returned.
    If nothing matches, unrecognized is returned instead. """
    # NOTE: the '-' must come first in the character class of the default
    # version_re; written as '_-a' it forms a range that leaves out most
    # lower-case letters and truncates versions such as '2.4.3-beta'.
    try:
        out, err = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
    m = re.search(version_re, firstline)
    if m:
        return m.group(1)
    return unrecognized
1271
1272
class PagedList(object):
    """Base class of paged lists; relies on a getslice() method that the
    class itself does not define."""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1277
9c44d242
PH
1278
class OnDemandPagedList(PagedList):
    """Paged list whose pages are fetched one by one as they are needed.

    pagefunc(pagenum) is called with a zero-based page number and must
    return an iterable with the entries of that page; pagesize is the
    number of entries of a full page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices start (inclusive) to end
        (exclusive) as a list; end=None means up to the last entry."""
        res = []
        # Start at the page containing index `start`
        for pagenum in itertools.count(start // self._pagesize):
            # Global indices covered by this page: [firstid, nextfirstid)
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of `start` within this page (0 unless it lies in it)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset of `end` within this page, or None if the slice
            # extends beyond this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1320
1321
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list for which the total number of pages is known up front.

    pagefunc(pagenum) is called with a zero-based page number and must
    return an iterable with the entries of that page.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices start (inclusive) to end
        (exclusive) as a list; end=None means up to the last page."""
        first_page = start // self._pagesize
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start
        # Entries to drop from the beginning of the first fetched page
        offset = start - first_page * self._pagesize

        collected = []
        for pagenum in range(first_page, stop_page):
            entries = list(self._pagefunc(pagenum))
            if offset:
                entries = entries[offset:]
                offset = None
            if remaining is not None:
                if len(entries) >= remaining:
                    # This page completes the requested slice
                    collected.extend(entries[:remaining])
                    break
                remaining -= len(entries)
            collected.extend(entries)
        return collected
1349
1350
def uppercase_escape(s):
    """Replace every escape sequence made of a backslash, an upper-case U
    and eight hex digits in s by the character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def _decode_match(m):
        return decode(m.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _decode_match, s)
b53466e1 1357
d05cfe06
S
1358
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    running_python2 = sys.version_info < (3, 0)
    if running_python2 and isinstance(s, unicode):
        s = s.encode('utf-8')
    # Characters that are passed through unquoted
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
d05cfe06
S
1364
1365
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Only the path, params, query and fragment components are escaped."""
    parts = compat_urllib_parse_urlparse(url)
    path = escape_rfc3986(parts.path)
    params = escape_rfc3986(parts.params)
    query = escape_rfc3986(parts.query)
    fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1375
# Pick struct_pack/struct_unpack implementations that accept a text format
# specification on every supported Python version
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1392
1393
def read_batch_urls(batch_fd):
    """Read the URLs listed, one per line, in the file object batch_fd.

    Byte lines are decoded as UTF-8. A leading byte order mark and
    surrounding whitespace are removed; empty lines and lines starting
    with '#', ';' or ']' are skipped. batch_fd is closed afterwards.
    """
    # A UTF-8 BOM is '\ufeff' once decoded properly, and '\xef\xbb\xbf' if
    # its raw bytes were decoded as Latin-1. str.strip() removes neither.
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1408
1409
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments (passed on to urlencode unchanged) and
    return the result as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1412
1413
0990305d
PH
# etree_iter(node) iterates over the elements of an ElementTree node
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1418
1419
bcf89ce6
PH
def parse_xml(s):
    """Parse the XML document in the text string s and return its root
    element; doctype declarations are ignored."""
    etree = xml.etree.ElementTree

    class _DoctypeIgnoringBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=_DoctypeIgnoringBuilder())
    # The parser is only passed on for Python 2.7 and newer
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    tree = etree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x: make sure all texts are text strings
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return tree
e68301af
PH
1435
1436
a1a530b0
PH
# Age limit associated with each US rating
US_RATINGS = {
    'G': 0, 'PG': 10, 'PG-13': 13, 'R': 16, 'NC': 18,
}


def parse_age_limit(s):
    """Return the age limit denoted by s as an int, or None if unknown.

    s is either a one or two digit age, optionally followed by '+'
    (e.g. '18+'), or one of the keys of US_RATINGS.
    """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
146c80e2
S
1451
1452
def strip_jsonp(code):
    """Return the payload of a JSONP response, i.e. what is enclosed in
    callback( ... ); code that does not look like JSONP is returned
    unchanged."""
    jsonp_wrapper = re.compile(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$')
    return jsonp_wrapper.sub(r'\1', code)
478c2c61
PH
1456
1457
e05f6939
PH
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    Single-quoted strings are turned into double-quoted ones, bare
    identifiers (other than true, false and null) are quoted, and a
    trailing comma before a closing ']' is removed.
    """
    # Rewrites applied inside single-quoted strings
    requote = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token.startswith("'"):
            inner = re.sub(
                r"\\\\|\\'|\"", lambda esc: requote[esc.group(0)],
                token[1:-1])
            return '"%s"' % inner
        if token.startswith('"') or token in ('true', 'false', 'null'):
            return token
        # A bare identifier
        return '"%s"' % token

    quoted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), quoted)
1481
1482
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values:
    the returned function maps an id to its index in quality_ids,
    or to -1 if it is not listed """
    def quality_of(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return quality_of
1491
acd69589
PH
1492
# Default output template, written with %-style named fields:
# title, id and ext
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1494
a020a0dc
PH
1495
def limit_length(s, length):
    """ Add ellipses to overly long strings: strings longer than length
    are cut so that the result, '...' included, is length characters """
    ELLIPSES = '...'
    if s is None or len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
1504
1505
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
48844745
PH
1508
1509
def is_outdated_version(version, limit, assume_new=True):
    """Return whether version is older than limit.

    If version is empty or one of the versions cannot be parsed, the
    answer is 'not outdated' when assume_new is true and 'outdated'
    otherwise.
    """
    undecided = not assume_new
    if not version:
        return undecided
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return undecided
    return outdated
732ea2f0
PH
1517
1518
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U, i.e. if this module
    was loaded by a zipimporter or sys has a 'frozen' attribute """
    from zipimport import zipimporter

    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
7d4111ed
PH
1524
1525
def args_to_str(args):
    """Get a short string representation for a subprocess command."""
    quoted = []
    for arg in args:
        quoted.append(shlex_quote(arg))
    return ' '.join(quoted)