]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[YoutubeDL] Set format_id for video+audio (Closes #3634)
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
912b38b4 6import calendar
676eb3f2 7import codecs
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96
PH
10import datetime
11import email.utils
f45c185f 12import errno
be4a824d 13import functools
d77c3dfd 14import gzip
b7ab0590 15import itertools
03f9daab 16import io
f4bfd65f 17import json
d77c3dfd 18import locale
02dbf93f 19import math
d77c3dfd 20import os
4eb7f1d1 21import pipes
c496ca96 22import platform
d77c3dfd 23import re
13ebea79 24import ssl
c496ca96 25import socket
b53466e1 26import struct
1c088fa8 27import subprocess
d77c3dfd 28import sys
181c8655 29import tempfile
01951dda 30import traceback
bcf89ce6 31import xml.etree.ElementTree
d77c3dfd 32import zlib
d77c3dfd 33
8c25f81b
PH
34from .compat import (
35 compat_chr,
36 compat_getenv,
37 compat_html_entities,
be4a824d 38 compat_http_client,
8c25f81b 39 compat_parse_qs,
be4a824d 40 compat_socket_create_connection,
8c25f81b
PH
41 compat_str,
42 compat_urllib_error,
43 compat_urllib_parse,
44 compat_urllib_parse_urlparse,
45 compat_urllib_request,
46 compat_urlparse,
7d4111ed 47 shlex_quote,
8c25f81b 48)
4644ac55
S
49
50
468e2e92
FV
# The class of compiled patterns is not exposed under a public name,
# so take it from an actual compiled pattern
compiled_regex_type = type(re.compile(''))

# Headers added to outgoing requests that do not already carry them
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 61
5f6a1245 62
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    Falls back to 'UTF-8' when the locale cannot be queried or names a
    codec that cannot encode text.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec; an unknown encoding name raises LookupError here
        'TEST'.encode(pref)
    except Exception:
        # Not a bare except: KeyboardInterrupt/SystemExit must propagate
        pref = 'UTF-8'

    return pref
d77c3dfd 76
f4bfd65f 77
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The data is first written to a temporary file next to fn and then
    renamed over fn. If anything fails, the temporary file is removed and
    the exception is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()

        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        def path_basename(f):
            return os.path.basename(f).decode(encoding)

        # the same for os.path.dirname
        def path_dirname(f):
            return os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**args)

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except BaseException:
        # Clean up the temporary file on any failure (including
        # interruption), then let the original exception propagate
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
130
131
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are spliced into the expression, so only a
        # restricted character set is accepted for each
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        candidates = (el for el in node.findall(xpath) if el.attrib.get(key) == val)
        return next(candidates, None)
150
d7e66d39
JMF
151# On python2.6 the xml.etree.ElementTree.Element methods don't support
152# the namespace parameter
5f6a1245
JW
153
154
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand every 'prefix:tag' step of path to '{uri}tag', looking the
    prefix up in ns_map. Steps without a prefix are kept unchanged."""
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([expand(step) for step in path.split('/')])
165
d77c3dfd 166
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath below node.

    If there is no such element or it has no text, return None, or raise
    ExtractorError (mentioning name, or the xpath if name is None) when
    fatal is set.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None and found.text is not None:
        return found.text
    if not fatal:
        return None
    if name is None:
        name = xpath
    raise ExtractorError('Could not find XML element %s' % name)
179
180
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An ID lookup is an attribute lookup on the "id" attribute
    return get_element_by_attribute(attribute="id", value=id, html=html)
184
12ea2f30 185
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # Verbose pattern: tag name, optional other attributes, the wanted
    # attribute, optional other attributes, then the content up to the
    # matching closing tag
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    found = re.search(pattern, html)
    if not found:
        return None

    content = found.group('content')
    # Content that starts with a quote has its first and last character dropped
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 207
9e6dd238
FV
208
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Source newlines carry no meaning; <br> and paragraph breaks do
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
224
225
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it replaces the
    characters that are forbidden in win32 file names in the last path
    component with '#' and tries once more; if that fails as well, the
    exception is raised, like the standard open() function would.
    Permission errors (EACCES) are re-raised immediately.

    A filename of '-' selects standard output (its binary buffer when
    available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars. Only the
        # final component is rewritten so that the directory part (and its
        # separators) keeps pointing at the same location.
        head, tail = os.path.split(filename)
        alt_filename = os.path.join(head, re.sub(r'[/<>:"|\\?*]', '#', tail))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
259
260
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Not parseable as an RFC 2822 date
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 268
5f6a1245 269
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            # Shell-special characters, whitespace and non-ASCII
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join([replace_insane(char) for char in s])
    if is_id:
        # IDs are kept as close to the input as possible
        return result

    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
d77c3dfd 303
5f6a1245 304
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # List membership (not a set) keeps this usable for unhashable elements
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 312
912b38b4 313
4e408e47
PH
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities and numeric
    references ('#123', '#x1F') are decoded; anything else is returned in
    its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hexadecimal references may contain a-f digits; the pattern is
    # anchored so that trailing garbage is not silently ignored
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside the valid range: keep the literal text
            pass

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
4e408e47
PH
332
333
def unescapeHTML(s):
    """Replace every '&...;' sequence in s by the result of
    _htmlentity_transform. None is passed through."""
    if s is None:
        return None
    assert type(s) == compat_str

    def _decode(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', _decode, s)
d77c3dfd 341
8bf48f23
PH
342
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
369
f07b74fc
PH
370
def encodeArgument(s):
    """Encode s via encodeFilename(s, True), i.e. for use in a subprocess call."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
378
379
8271226a
PH
def decodeOption(optval):
    """Return optval as text; bytes are decoded with the preferred
    encoding and None is passed through."""
    if optval is None:
        return optval

    decoded = optval
    if isinstance(decoded, bytes):
        decoded = decoded.decode(preferredencoding())

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 388
5f6a1245 389
4539dd30
PH
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that fits is used; exactly one hour is '1:00:00'
    and exactly one minute is '1:00'.
    """
    # Inclusive bounds, so 3600 is not rendered as '60:00' nor 60 as '60'
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
397
a0ddb8a2 398
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler, using whichever SSL context API
    the running Python offers. Certificate checks are turned off when
    params['nocheckcertificate'] is set."""
    skip_verification = params.get('nocheckcertificate', False)

    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 422
732ea2f0 423
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Errors raised while handling one of these exceptions count as expected
        expected_origins = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_origins:
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors get bug-reporting instructions appended
            if ytdl_is_updateable():
                update_cmd = 'type youtube-dl -U to update'
            else:
                update_cmd = 'see https://yt-dl.org/update on how to update'
            msg += ''.join((
                '; please report this issue on https://yt-dl.org/bug .',
                ' Make sure you are using the latest version; %s.' % update_cmd,
                ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
            ))
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if there is none."""
        tb = self.traceback
        return None if tb is None else ''.join(traceback.format_tb(tb))
01951dda 457
1c256f70 458
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected extractor error for an unsupported URL; the URL is kept in .url"""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
464
465
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
469
470
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept so that the original failure can still be inspected
        self.exc_info = exc_info
d77c3dfd
FV
483
484
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
492
493
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Initialise the base class as well, so that str(exc) and
        # exc.args carry the message instead of being empty
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd 503
5f6a1245 504
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
508
509
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
d77c3dfd
FV
517
518
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Give the exception a readable message instead of an empty one
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 533
5f6a1245 534
c5a59d93 535def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
be4a824d
PH
536 hc = http_class(*args, **kwargs)
537 source_address = ydl_handler._params.get('source_address')
538 if source_address is not None:
539 sa = (source_address, 0)
540 if hasattr(hc, 'source_address'): # Python 2.7+
541 hc.source_address = sa
542 else: # Python 2.6
543 def _hc_connect(self, *args, **kwargs):
544 sock = compat_socket_create_connection(
545 (self.host, self.port), self.timeout, sa)
546 if is_https:
d7932313
PH
547 self.sock = ssl.wrap_socket(
548 sock, self.key_file, self.cert_file,
549 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
550 else:
551 self.sock = sock
552 hc.connect = functools.partial(_hc_connect, hc)
553
554 return hc
555
556
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        # Connections are created through the module helper so that the
        # handler's params are taken into account
        connection_factory = functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False)
        return self.do_open(connection_factory, req)

    @staticmethod
    def deflate(data):
        # Try a raw deflate stream first, then one with a zlib header
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Not every addinfourl accepts the status code in its constructor
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        wrapped = compat_urllib_request.addinfourl(stream, headers, url)
        wrapped.code = code
        return wrapped

    def http_request(self, req):
        for name, value in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if name.capitalize() in req.headers:
                continue
            req.add_header(name, value)
        if 'Youtubedl-no-compression' in req.headers:
            # The marker header is internal: drop it together with Accept-encoding
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for cut in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-cut]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No amount of trimming helped: report the first failure
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same request/response processing
    https_request = http_request
    https_response = http_response
bf50b038 648
5de90176 649
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that opens its connections through
    _create_http_connection, using https_conn_class (HTTPSConnection
    when none is given)."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        connection_factory = functools.partial(
            _create_http_connection, self, self._https_conn_class, True)
        return self.do_open(connection_factory, req)
660
661
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # Optional fractional seconds followed by 'Z' or a +hh:mm / -hhmm offset
    suffix = re.search(
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    offset = datetime.timedelta()
    if suffix:
        date_str = date_str[:-len(suffix.group(0))]
        if suffix.group('sign'):
            sign = 1 if suffix.group('sign') == '+' else -1
            offset = datetime.timedelta(
                hours=sign * int(suffix.group('hours')),
                minutes=sign * int(suffix.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    # Subtracting the offset turns the local time into UTC
    utc = datetime.datetime.strptime(date_str, date_format) - offset
    return calendar.timegm(utc.timetuple())
685
686
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)

    unambiguous_formats = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    # Purely numeric dates are ambiguous; day_first picks the reading
    if day_first:
        ambiguous_formats = [
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ]
    else:
        ambiguous_formats = [
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ]

    upload_date = None
    # Every format is tried; the last one that parses determines the result
    for expression in unambiguous_formats + ambiguous_formats:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is not None:
        return upload_date

    # Fall back to the e-mail style date parser
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
747
5f6a1245 748
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url: the text after the last dot of
    the part before '?'. default_ext is returned when url is None or the
    guess is not purely alphanumeric."""
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    guess = without_query.rpartition('.')[2]
    return guess if re.match(r'^[A-Za-z0-9]+$', guess) else default_ext
73e79f2a 757
5f6a1245 758
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the extension of filename with '<sub_lang>.<sub_format>'."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
d4051a8e 761
5f6a1245 762
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD,
    'yesterday' or (now|today)[+-][0-9](day|week|month|year)(s)?

    Raises ValueError if date_str matches none of these formats.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword names ('days', 'weeks')
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
5f6a1245
JW
790
791
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        # Not in YYYYMMDD form: hand the input back untouched
        return date_str
    return '-'.join(parts.groups())
800
5f6a1245 801
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a missing bound leaves that side open"""
        if start is None:
            self.start = datetime.datetime.min.date()
        else:
            self.start = date_from_str(start)
        if end is None:
            self.end = datetime.datetime.max.date()
        else:
            self.end = date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed first
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
831
832
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # A byte string is decoded with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
841
842
b58ddb32
PH
843def _windows_write_string(s, out):
844 """ Returns True if the string was written using special methods,
845 False if it has yet to be written out."""
846 # Adapted from http://stackoverflow.com/a/3259271/35070
847
848 import ctypes
849 import ctypes.wintypes
850
851 WIN_OUTPUT_IDS = {
852 1: -11,
853 2: -12,
854 }
855
a383a98a
PH
856 try:
857 fileno = out.fileno()
858 except AttributeError:
859 # If the output stream doesn't have a fileno, it's virtual
860 return False
aa42e873
PH
861 except io.UnsupportedOperation:
862 # Some strange Windows pseudo files?
863 return False
b58ddb32
PH
864 if fileno not in WIN_OUTPUT_IDS:
865 return False
866
e2f89ec7 867 GetStdHandle = ctypes.WINFUNCTYPE(
b58ddb32 868 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
6ac4e806 869 (b"GetStdHandle", ctypes.windll.kernel32))
b58ddb32
PH
870 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
871
e2f89ec7 872 WriteConsoleW = ctypes.WINFUNCTYPE(
b58ddb32
PH
873 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
874 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
6ac4e806 875 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
b58ddb32
PH
876 written = ctypes.wintypes.DWORD(0)
877
6ac4e806 878 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
b58ddb32
PH
879 FILE_TYPE_CHAR = 0x0002
880 FILE_TYPE_REMOTE = 0x8000
e2f89ec7 881 GetConsoleMode = ctypes.WINFUNCTYPE(
b58ddb32
PH
882 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
883 ctypes.POINTER(ctypes.wintypes.DWORD))(
6ac4e806 884 (b"GetConsoleMode", ctypes.windll.kernel32))
b58ddb32
PH
885 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
886
887 def not_a_console(handle):
888 if handle == INVALID_HANDLE_VALUE or handle is None:
889 return True
890 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
891 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
892
893 if not_a_console(h):
894 return False
895
d1b9c912
PH
896 def next_nonbmp_pos(s):
897 try:
898 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
899 except StopIteration:
900 return len(s)
901
902 while s:
903 count = min(next_nonbmp_pos(s), 1024)
904
b58ddb32 905 ret = WriteConsoleW(
d1b9c912 906 h, s, count if count else 2, ctypes.byref(written), None)
b58ddb32
PH
907 if ret == 0:
908 raise OSError('Failed to write string')
d1b9c912
PH
909 if not count: # We just wrote a non-BMP character
910 assert written.value == 2
911 s = s[1:]
912 else:
913 assert written.value > 0
914 s = s[written.value:]
b58ddb32
PH
915 return True
916
917
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    Text is encoded (dropping unencodable characters) when the stream is
    binary, on Python 2, or when the stream has a .buffer.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    binary_stream = 'b' in getattr(out, 'mode', '')
    if binary_stream or sys.version_info[0] < 3:  # Python 2 lies about mode of sys.stderr
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream on top of a binary buffer: write the bytes directly
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
938
939
48ea9cea
PH
def bytes_to_intlist(bs):
    """ Convert a byte string into a list of integer byte values """
    if not bs:
        return []
    # Indexing bytes yields ints on Python 3 but 1-char strings on Python 2
    if isinstance(bs[0], int):
        return list(bs)
    return [ord(char) for char in bs]
947
c257baff 948
def intlist_to_bytes(xs):
    """ Pack a sequence of integer byte values into a byte string """
    if xs:
        # One unsigned byte ('B') per element
        return struct_pack('%dB' % len(xs), *xs)
    return b''
c38b1e77
PH
953
954
c1c9a79c
PH
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # ctypes description of the structure handed to (Un)LockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32

    LockFileEx = kernel32.LockFileEx
    LockFileEx.restype = ctypes.wintypes.BOOL
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]

    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]

    # Low/high halves of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """ Lock the open file f; raises OSError when LockFileEx fails """
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so _unlock_file can pass the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(
            handle, flags, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """ Release a lock taken by _lock_file; raises OSError on failure """
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(
            handle, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """ Take an exclusive or shared flock() lock on the open file f """
        operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.flock(f, operation)

    def _unlock_file(f):
        """ Release the flock() lock held on the open file f """
        fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
1018
1019
class locked_file(object):
    """ Context manager around a file that is locked while the block runs.

    The file is opened in __init__ with mode 'r', 'a' or 'w'. Entering the
    context locks it (shared for 'r', exclusive otherwise); leaving it
    unlocks and closes the file. """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except EnvironmentError:
            # The win32 _lock_file raises OSError, which is a different type
            # from IOError on Python 2. EnvironmentError covers both, so the
            # file is always closed when locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1049
1050
4644ac55
S
def get_filesystem_encoding():
    """ Return the filesystem encoding, falling back to 'utf-8' if unknown """
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1054
1055
def shell_quote(args):
    """ Return the arguments joined into one shell-escaped command line.

    Byte-string arguments are decoded with the filesystem encoding first. """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # shlex_quote (already used by args_to_str) replaces the undocumented
        # pipes.quote from the deprecated pipes module
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1065
1066
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the last evaluated element
        (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1074
1075
9d4660ca
PH
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The data travels JSON-encoded in the URL fragment
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return url + '#' + sdata
9d4660ca
PH
1082
1083
def unsmuggle_url(smug_url, default=None):
    """ Split a URL made by smuggle_url into (url, data).

    URLs without smuggled data are returned as (smug_url, default). """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
        return url, json.loads(jsond)
    return smug_url, default
02dbf93f
PH
1091
1092
02dbf93f
PH
def format_bytes(bytes):
    """ Format a byte count as a string with a binary unit suffix.

    Returns 'N/A' for None. A str argument is converted with float() first.
    The unit is clamped to the B..YiB range, so tiny fractional values are
    shown in B and values beyond the table are shown in YiB. """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Without clamping, values below 1/1024 give a negative index (which
        # wraps around to 'YiB') and huge values raise IndexError
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
f53c966a 1105
1c088fa8 1106
be64b5b0
PH
def parse_filesize(s):
    """ Parse a size such as '1.5 MiB' or '3,2GB' into a number of bytes.

    Returns None when s is None or does not start with a number followed by
    a known unit. A comma in the number is read as a decimal separator. """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for exponent, prefix in enumerate('KMGTPEZY', 1):
        _UNIT_TABLE[prefix + 'iB'] = 1024 ** exponent
        _UNIT_TABLE[prefix + 'B'] = 1000 ** exponent
        _UNIT_TABLE[prefix.lower() + 'B'] = 1024 ** exponent
        _UNIT_TABLE[prefix + 'b'] = 1000 ** exponent

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    size_re = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re
    m = re.match(size_re, s)
    if m is None:
        return None

    number = float(m.group('num').replace(',', '.'))
    return int(number * _UNIT_TABLE[m.group('unit')])
be64b5b0
PH
1159
1160
def get_term_width():
    """ Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable takes precedence; otherwise the output
    of `stty size` is used. """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            # Malformed COLUMNS value: fall back to asking the terminal
            pass

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, no tty, unexpected output), but do
        # not swallow KeyboardInterrupt/SystemExit like a bare except would
        pass
    return None
caefb1de
PH
1175
1176
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    if name in ENGLISH_NAMES:
        return ENGLISH_NAMES.index(name) + 1
    # Unknown names yield None
    return None
18258362
JMF
1187
1188
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an XML entity by '&amp;'"""
    # Predefined entities and short numeric character references are kept
    bare_ampersand_re = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand_re.sub('&amp;', xml_str)
e3946f98
PH
1195
1196
def setproctitle(title):
    """ Set the process name via prctl() from libc.so.6.

    Does nothing when libc.so.6 cannot be loaded or has no prctl. """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes ctypes allocate one extra
    # byte for the terminating NUL. Sizing it as len(title_bytes) left the
    # string unterminated, so prctl could read past the end of the buffer.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1210
1211
def remove_start(s, start):
    """ Return s without the prefix start, if s begins with it """
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1216
1217
2b9faf55
PH
def remove_end(s, end):
    """ Return s without the suffix end, if s ends with it """
    # An empty suffix must be a no-op: s[:-0] would be s[:0], i.e. ''
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1222
1223
def url_basename(url):
    """ Return the last segment of the URL's path """
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip('/').split('/')
    return segments[-1]
aa94a6d3
PH
1227
1228
class HEADRequest(compat_urllib_request.Request):
    """ Request whose HTTP method is always HEAD """

    def get_method(self):
        return 'HEAD'
7217e148
PH
1232
1233
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to int, multiply by invscale and floor-divide by scale.

    If get_attr is given, the attribute of that name is read from v first.
    None and the empty string yield default. """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    return int(v) * invscale // scale
1241
9572013d 1242
40a90862
JMF
def str_or_none(v, default=None):
    """ Convert v to a text string; None yields default """
    if v is None:
        return default
    return compat_str(v)
1245
9732d77e
PH
1246
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and plus signs before converting
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1253
1254
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to float, multiply by invscale and divide by scale.

    None yields default. """
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1257
1258
def parse_duration(s):
    """ Parse a duration such as '1:02:03', '3 min' or 'PT1H2M3S' into seconds.

    Returns None when s is not a string or is not in a recognized format. """
    text_type = basestring if sys.version_info < (3, 0) else compat_str
    if not isinstance(s, text_type):
        return None

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if m is None:
        return None

    # A lone (possibly fractional) number of minutes or hours
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    total = 0
    for group_name, unit_seconds in (('secs', 1), ('mins', 60), ('hours', 60 * 60)):
        value = m.group(group_name)
        if value:
            total += int(value) * unit_seconds
    # The 'ms' group holds the fractional part of the seconds, dot included
    fraction = m.group('ms')
    if fraction:
        total += float(fraction)
    return total
91d7d0b3
JMF
1293
1294
def prepend_extension(filename, ext):
    """ Insert ext in front of the filename's extension (a.mp4 -> a.ext.mp4) """
    parts = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(parts[0], ext, parts[1])
d70ad093
PH
1298
1299
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        # The binary could not be started
        return False
    return exe
b7ab0590
PH
1308
1309
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    command = [exe] + args
    try:
        # stderr is merged into stdout so either stream can carry the version
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = process.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1323
1324
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Extract a version string from the output of an executable.

    version_re must contain one group capturing the version. The value of
    unrecognized is returned when the pattern is not found. """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    m = re.search(pattern, output)
    return m.group(1) if m else unrecognized
1334
1335
class PagedList(object):
    """ Base class for paged lists; subclasses provide getslice() """

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1340
9c44d242
PH
1341
class OnDemandPagedList(PagedList):
    """ Paged list that fetches pages one by one until a short page is seen.

    pagefunc(pagenum) must return an iterable with the entries of that page
    (pages are numbered from 0); pagesize is the size of a full page. """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry inside this page
            startv = 0
            if firstid <= start < nextfirstid:
                startv = start % self._pagesize

            # Offset just past the last wanted entry, if the slice ends here
            endv = None
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % self._pagesize) + 1

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            page_not_full = len(page_results) + startv < self._pagesize

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if page_not_full or end == nextfirstid:
                break
        return res
81c2f20b
PH
1383
1384
9c44d242
PH
class InAdvancePagedList(PagedList):
    """ Paged list whose number of pages is known beforehand.

    pagefunc(pagenum) must return an iterable with the entries of that page
    (pages are numbered from 0). """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        if end is None:
            end_page = self._pagecount
            only_more = None
        else:
            end_page = end // self._pagesize + 1
            only_more = end - start
        # Entries to drop from the front of the first fetched page
        skip_elems = start - start_page * self._pagesize
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) >= only_more:
                    # This page completes the requested slice
                    res.extend(page[:only_more])
                    break
                only_more -= len(page)
            res.extend(page)
        return res
1412
1413
def uppercase_escape(s):
    """ Replace \\UXXXXXXXX escape sequences in s by their characters """
    decode = codecs.getdecoder('unicode_escape')

    def expand(m):
        return decode(m.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
b53466e1 1420
d05cfe06
S
1421
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, unicode)
    if needs_encoding:
        s = s.encode('utf-8')
    # Characters in this set are passed through unquoted
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
d05cfe06
S
1427
1428
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # Only these components are escaped; the rest is kept as parsed
    escaped = dict(
        (component, escape_rfc3986(getattr(url_parsed, component)))
        for component in ('path', 'params', 'query', 'fragment'))
    return url_parsed._replace(**escaped).geturl()
1438
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def _accept_text_spec(struct_func):
        """ Wrap struct_func so that a text format spec is encoded first """
        def wrapper(spec, *args):
            if isinstance(spec, compat_str):
                spec = spec.encode('ascii')
            return struct_func(spec, *args)
        return wrapper

    struct_pack = _accept_text_spec(struct.pack)
    struct_unpack = _accept_text_spec(struct.unpack)
else:
    # struct accepts text format specs, so use it directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1455
1456
def read_batch_urls(batch_fd):
    """ Read the URLs of a batch file, one per line, and close the file.

    Byte lines are decoded as UTF-8. A leading byte order mark and
    surrounding whitespace are removed. Empty lines and lines starting
    with '#', ';' or ']' are skipped. """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # '\ufeff' is a correctly decoded BOM; '\xef\xbb\xbf' is how the
        # UTF-8 BOM bytes look when they were decoded as latin-1. strip()
        # removes neither, so both have to be dropped explicitly.
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1471
1472
def urlencode_postdata(*args, **kargs):
    """ Like urlencode, but return ASCII bytes usable as POST data """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1475
1476
0990305d
PH
# Element.iter is missing on Python <=2.6; emulate it with findall there
etree_iter = getattr(xml.etree.ElementTree.Element, 'iter', None)
if etree_iter is None:
    def etree_iter(n):
        return n.findall('.//*')
1481
1482
bcf89ce6
PH
def parse_xml(s):
    """ Parse the XML text s (doctype declarations are ignored) and return
    its root element """
    etree = xml.etree.ElementTree

    class DoctypeIgnoringBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=DoctypeIgnoringBuilder())
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    tree = etree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return tree
e68301af
PH
1498
1499
a1a530b0
PH
# Numeric value for each US rating label (used by parse_age_limit)
US_RATINGS = dict([
    ('G', 0),
    ('PG', 10),
    ('PG-13', 13),
    ('R', 16),
    ('NC', 18),
])
fac55558
PH
1507
1508
146c80e2
S
def parse_age_limit(s):
    """ Parse an age limit such as '18' or '16+', or a US rating label.

    Returns None for None and for unrecognized values. """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
146c80e2
S
1514
1515
def strip_jsonp(code):
    """ Strip the callback wrapper of a JSONP response, leaving the payload """
    jsonp_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_re, r'\1', code)
478c2c61
PH
1519
1520
e05f6939
PH
def js_to_json(code):
    """ Convert a JavaScript object literal into JSON text.

    Single-quoted strings and bare identifiers become double-quoted
    strings, and a trailing comma before a closing ']' is removed. """
    # How escapes inside a single-quoted string map to a double-quoted one
    requoted = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def requote(m):
        return requoted[m.group(0)]

    def fix_kv(m):
        token = m.group(0)
        # JSON literals and double-quoted strings are already valid
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(r"\\\\|\\'|\"", requote, token[1:-1])
        return '"%s"' % token

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), res)
1544
1545
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality; unknown ids rank lowest
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1554
acd69589
PH
1555
# Default output template: %-style mapping keys 'title', 'id' and 'ext'
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1557
a020a0dc
PH
1558
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    # Leave room for the ellipses so the result is length characters long
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
1567
1568
def version_tuple(v):
    """ Split a version string on '.' and '-' into a tuple of ints """
    return tuple(map(int, re.split(r'[-.]', v)))
48844745
PH
1571
1572
def is_outdated_version(version, limit, assume_new=True):
    """ Return whether version is older than limit.

    A missing or unparsable version counts as outdated only when assume_new
    is false. """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
732ea2f0
PH
1580
1581
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when this module was loaded from a zip file or from a frozen build
    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    return hasattr(sys, 'frozen')
7d4111ed
PH
1587
1588
def args_to_str(args):
    """ Get a short string representation for a subprocess command """
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
2ccd1b10
PH
1592
1593
def urlhandle_detect_ext(url_handle):
    """ Guess a file extension from the headers of an opened URL handle.

    The filename of an attachment Content-Disposition header is preferred;
    otherwise the subtype of the Content-Type header is returned. """
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    # Drop MIME parameters first, otherwise 'video/mp4; charset=utf-8'
    # would yield the bogus extension 'mp4; charset=utf-8'
    mime_type = getheader('Content-Type').split(';')[0].strip()
    return mime_type.split("/")[1]
05900629
PH
1610
1611
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
1620
1621
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # The UTF-32 marks come first because the UTF-16 marks are their prefixes
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    payload, encoding = first_bytes, 'utf-8'
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            payload, encoding = first_bytes[len(bom):], enc
            break
    text = payload.decode(encoding, 'replace')

    # The match object (or None) is returned as is
    return re.match(r'^\s*<', text)
a055469f
PH
1640
1641
def determine_protocol(info_dict):
    """ Return the protocol of a format: its 'protocol' entry if set,
    otherwise a guess based on its 'url' """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    # Prefix checks, so e.g. rtmpe:// is reported as rtmp
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    # Manifest formats are recognized by file extension
    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
1662
1663
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Width of the widest cell in every column
    max_lens = []
    for col in zip(*table):
        max_lens.append(max(len(compat_str(v)) for v in col))
    # All columns but the last are left-aligned and padded one extra space
    padded_cols = ['%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]]
    format_str = ' '.join(padded_cols) + '%s'
    lines = [format_str % tuple(row) for row in table]
    return '\n'.join(lines)