]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
README: Recommend using flake8 instead of pyflake and pep8 separately
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
912b38b4 6import calendar
676eb3f2 7import codecs
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96
PH
10import datetime
11import email.utils
f45c185f 12import errno
be4a824d 13import functools
d77c3dfd 14import gzip
b7ab0590 15import itertools
03f9daab 16import io
f4bfd65f 17import json
d77c3dfd 18import locale
02dbf93f 19import math
d77c3dfd 20import os
4eb7f1d1 21import pipes
c496ca96 22import platform
d77c3dfd 23import re
13ebea79 24import ssl
c496ca96 25import socket
b53466e1 26import struct
1c088fa8 27import subprocess
d77c3dfd 28import sys
181c8655 29import tempfile
01951dda 30import traceback
bcf89ce6 31import xml.etree.ElementTree
d77c3dfd 32import zlib
d77c3dfd 33
8c25f81b
PH
34from .compat import (
35 compat_chr,
36 compat_getenv,
37 compat_html_entities,
be4a824d 38 compat_http_client,
8c25f81b 39 compat_parse_qs,
be4a824d 40 compat_socket_create_connection,
8c25f81b
PH
41 compat_str,
42 compat_urllib_error,
43 compat_urllib_parse,
44 compat_urllib_parse_urlparse,
45 compat_urllib_request,
46 compat_urlparse,
7d4111ed 47 shlex_quote,
8c25f81b 48)
4644ac55
S
49
50
468e2e92
FV
# This is not clearly defined otherwise: the type of a compiled regular
# expression object, obtained by compiling an empty pattern.
compiled_regex_type = type(re.compile(''))

# Default HTTP headers. YoutubeDLHandler.http_request adds each entry to an
# outgoing request unless the request already carries that header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 61
5f6a1245 62
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: when the locale
    lookup fails, or names a codec that cannot actually be used for
    encoding, 'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable name raises here.
        'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (any failure means "fall back"), but unlike a
        # bare except it lets KeyboardInterrupt / SystemExit propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 76
f4bfd65f 77
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is first written to a temporary file created next to fn and
    then renamed over fn. If anything fails, the temporary file is removed
    (best effort) and the original exception is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()

        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        def path_basename(f):
            return os.path.basename(f).decode(encoding)

        # the same for os.path.dirname
        def path_dirname(f):
            return os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**args)

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except BaseException:
        # Cleanup only: drop the temporary file, then propagate whatever
        # went wrong (including interrupts) unchanged.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
130
131
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        candidates = (el for el in node.findall(xpath) if el.attrib.get(key) == val)
        return next(candidates, None)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are spliced into the expression, so restrict them to
        # characters that cannot break out of the predicate.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
150
d7e66d39
JMF
151# On python2.6 the xml.etree.ElementTree.Element methods don't support
152# the namespace parameter
5f6a1245
JW
153
154
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path into '{namespace}tag' form.

    Each '/'-separated step containing a single ':' has its prefix looked
    up in ns_map; steps without ':' are kept unchanged.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([expand(step) for step in path.split('/')])
165
d77c3dfd 166
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath under node.

    Returns None when no element matches or the element has no text,
    unless fatal is set, in which case ExtractorError is raised (the
    message mentions name, or xpath when name is not given).
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None and found.text is not None:
        return found.text
    if not fatal:
        return None
    if name is None:
        name = xpath
    raise ExtractorError('Could not find XML element %s' % name)
179
180
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals the given ID in the passed HTML document"""
    attribute = "id"
    return get_element_by_attribute(attribute, id, html)
184
12ea2f30 185
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document

    Returns None when no tag carries attribute=value. The content is
    HTML-unescaped before being returned.
    """

    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    found = re.search(pattern, html)
    if found is None:
        return None

    content = found.group('content')
    # Drop one leading and one trailing character when the content starts
    # with a quote character.
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 207
9e6dd238
FV
208
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Literal newlines carry no meaning; <br> and </p><p> do.
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
224
225
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails (for any reason
    other than EACCES), characters forbidden on win32 are replaced by '#'
    in the last path component and the open is retried once with that
    name; a failure of the retry propagates to the caller, like the
    standard open() function.

    The special filename '-' yields the (binary, where available) stdout
    stream.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # Only the final component is rewritten: touching the directory
        # part would also replace path separators / the drive colon and
        # thereby point at a different location.
        head, tail = os.path.split(filename)
        alt_filename = os.path.join(head, re.sub(r'[/<>:"|\\?*]', '#', tail))
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
d77c3dfd
FV
259
260
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 268
5f6a1245 269
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    # Handle timestamps: 0:12:34 becomes 0_12_34 instead of "0 -12 -34"
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(ch) for ch in s)
    if is_id:
        # IDs are kept as close to the original as possible.
        return result

    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result or '_'
d77c3dfd 303
5f6a1245 304
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, keeping first occurrences in order """
    unique = []
    for item in iterable:
        # Equality-based membership test: elements need not be hashable.
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 312
912b38b4 313
4e408e47
PH
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities are looked up
    in compat_html_entities.name2codepoint; numeric references may be
    decimal ('#38') or hexadecimal ('#x26'). Anything else is returned in
    its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hex references must accept the digits a-f as well; matching only
    # [0-9] would silently truncate e.g. '#x2F' to '#x2'.
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
4e408e47
PH
332
333
def unescapeHTML(s):
    """Replace every '&...;' entity in s by its character; None passes through."""
    if s is None:
        return None
    assert type(s) == compat_str

    def _decode(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', _decode, s)
d77c3dfd 341
8bf48f23
PH
342
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Whether the result is passed to a subprocess
           call (then it is encoded even on Unicode-capable Windows)
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Unicode strings can be passed directly to use Unicode APIs on
    # Windows 2000 and up.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return s.encode('utf-8' if encoding is None else encoding, 'ignore')
369
f07b74fc
PH
370
def encodeArgument(s):
    """Encode a command-line argument via encodeFilename(..., for_subprocess=True)."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
378
379
8271226a
PH
def decodeOption(optval):
    """Return optval as text: bytes are decoded with the preferred encoding, None passes through."""
    if optval is None:
        return optval

    decoded = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval
    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 388
5f6a1245 389
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The hour and minute fields are only shown when they are non-zero, so
    exactly 3600 yields '1:00:00' and exactly 60 yields '1:00'.
    """
    # Inclusive bounds: with a strict '>' the values 3600 and 60 fell
    # through to the next shorter format ('60:00' and '60').
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
397
a0ddb8a2 398
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler with an SSL setup fitting the running Python.

    Certificate verification is disabled when params['nocheckcertificate']
    is true. Extra keyword arguments are forwarded to the handler.
    """
    skip_verification = params.get('nocheckcertificate', False)

    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            # check_hostname has to be cleared before verify_mode
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 422
732ea2f0 423
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is prepended.
        """

        # Network trouble and unavailable videos currently being handled
        # are never reported as bugs.
        active_exc_type = sys.exc_info()[0]
        if active_exc_type in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            update_cmd = (
                'type youtube-dl -U to update'
                if ytdl_is_updateable()
                else 'see https://yt-dl.org/update on how to update')
            msg += '; please report this issue on https://yt-dl.org/bug .'
            msg += ' Make sure you are using the latest version; %s.' % update_cmd
            msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if there is none."""
        tb = self.traceback
        if tb is None:
            return None
        return ''.join(traceback.format_tb(tb))
01951dda 457
1c256f70 458
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected error raised for a URL that is not supported; the URL is kept in .url."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
464
465
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
469
470
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
483
484
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
492
493
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Initialise the base class explicitly so that str(exc) and
        # exc.args carry the message on every supported interpreter
        # (Python 2 does not fill in args unless Exception.__init__ runs).
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd 503
5f6a1245 504
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
508
509
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
d77c3dfd
FV
517
518
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Give the exception a readable message; without initialising the
        # base class str(exc) is empty or a bare tuple, depending on the
        # interpreter.
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 533
5f6a1245 534
c5a59d93 535def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
be4a824d
PH
536 hc = http_class(*args, **kwargs)
537 source_address = ydl_handler._params.get('source_address')
538 if source_address is not None:
539 sa = (source_address, 0)
540 if hasattr(hc, 'source_address'): # Python 2.7+
541 hc.source_address = sa
542 else: # Python 2.6
543 def _hc_connect(self, *args, **kwargs):
544 sock = compat_socket_create_connection(
545 (self.host, self.port), self.timeout, sa)
546 if is_https:
d7932313
PH
547 self.sock = ssl.wrap_socket(
548 sock, self.key_file, self.cert_file,
549 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
550 else:
551 self.sock = sock
552 hc.connect = functools.partial(_hc_connect, hc)
553
554 return hc
555
556
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        # params is kept for _create_http_connection, which reads
        # 'source_address' from it.
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        # Open through _create_http_connection so the configured source
        # address is applied to the connection.
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a zlib-wrapped one second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response, setting .code manually where the constructor lacks the parameter."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers and resolve the Youtubedl-* pseudo headers."""
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        # Pseudo header: the caller asked for an uncompressed response.
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        # Pseudo header: the caller supplied its own User-Agent value.
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate Content-encoding transparently decompressed."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; if no attempt
                # succeeds, re-raise the very first error.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same request/response processing.
    https_request = http_request
    https_response = http_response
bf50b038 653
5de90176 654
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that opens its connections through _create_http_connection."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._params = params
        # Fall back to the stock HTTPS connection class when none is given.
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection

    def https_open(self, req):
        connection_factory = functools.partial(
            _create_http_connection, self, self._https_conn_class, True)
        return self.do_open(connection_factory, req)
665
666
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date

    date_str has the form YYYY-MM-DD<delimiter>HH:MM:SS, optionally
    followed by fractional seconds (which are discarded) and by 'Z' or a
    [+-]HH[:]MM UTC offset. Without a timezone suffix the time is taken
    as UTC. Returns None for None; raises ValueError if the remaining
    string does not match the expected format.
    """

    if date_str is None:
        return None

    # Every part is optional, so this always matches (possibly the empty
    # string at the end). Fractional seconds are therefore also stripped
    # when no timezone follows them.
    m = re.search(
        r'(?:\.[0-9]+)?(?:Z| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}))?$',
        date_str)
    date_str = date_str[:m.start()]
    if not m.group('sign'):
        timezone = datetime.timedelta()
    else:
        sign = 1 if m.group('sign') == '+' else -1
        timezone = datetime.timedelta(
            hours=sign * int(m.group('hours')),
            minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
690
691
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    A list of known formats is tried; day_first selects whether ambiguous
    numeric dates are read as day/month or month/day. As a last resort
    the string is parsed as an RFC 2822 date. Returns None for None and
    when nothing matches.
    """

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        # "3rd", "23rd": the remaining English ordinal suffix
        '%b %drd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions.extend([
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    # No early exit: if several formats match, the last one wins.
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                # The lenient RFC 2822 parser may yield field values that
                # do not form a valid date; treat that as "not parseable".
                pass
    return upload_date
752
5f6a1245 753
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url (query string ignored).

    Returns default_ext when url is None or when the text after the last
    '.' is not purely alphanumeric.
    """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
73e79f2a 762
5f6a1245 763
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the extension of filename by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
d4051a8e 766
5f6a1245 767
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD,
    'now', 'today', 'yesterday' or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days. Raises
    ValueError if the string matches none of these forms."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        # timedelta takes plural keyword names ('days', 'weeks')
        unit += 's'
        delta = datetime.timedelta(**{unit: amount})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
5f6a1245
JW
795
796
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format;
    any other string is returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
805
5f6a1245 806
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str;
        a missing bound defaults to the earliest / latest representable date"""
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a datetime.date or a date string) is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
836
837
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
846
847
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s is written with the Win32 WriteConsoleW call when out is the
    process' stdout or stderr (file descriptor 1 or 2) and that handle is
    attached to a console. In every other case nothing is written and
    False is returned. Raises OSError when WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for invalid handles, for handles that are not character
        # devices, and for handles on which GetConsoleMode fails.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call, stopping before the
        # next non-BMP character.
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; it is then
        # written on its own with a length of 2.
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
921
922
def write_string(s, out=None, encoding=None):
    """Write the text s to out (default: sys.stderr) and flush it.

    Binary streams receive s encoded with encoding, falling back to the
    stream's own or the preferred encoding; characters that cannot be
    encoded are dropped.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
943
944
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the byte values of bs as a list of integers."""
    if not bs:
        return []
    # Indexing yields ints on Python 3 but one-character strings on Python 2
    if isinstance(bs[0], int):
        return list(bs)
    return [ord(ch) for ch in bs]
952
c257baff 953
def intlist_to_bytes(xs):
    """Pack a sequence of integers (one unsigned byte each) into a byte string."""
    if xs:
        fmt = '%dB' % len(xs)
        return struct_pack(fmt, *xs)
    return b''
c38b1e77
PH
958
959
c1c9a79c
PH
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) using the Win32
# LockFileEx/UnlockFileEx calls on Windows and fcntl.flock elsewhere.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes layout of the OVERLAPPED structure handed to
    # LockFileEx/UnlockFileEx
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high DWORDs of the byte-range length passed to every lock and
    # unlock call below
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raise OSError if LockFileEx reports failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so _unlock_file can pass the same
        # OVERLAPPED pointer again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK in the Win32 API; 0x0
        # requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raise OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared flock() lock on the open file f."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock() lock held on the open file f."""
        fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
1023
1024
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened immediately by the constructor.  Entering the
    context takes a shared lock for mode 'r' and an exclusive lock for
    'a' and 'w'; leaving it unlocks and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is not an IOError
            # on Python 2; catch both so the handle is never leaked.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1054
1055
4644ac55
S
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to 'utf-8' if unknown."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1059
1060
def shell_quote(args):
    """Return args joined into one shell-escaped command line string."""
    fs_encoding = get_filesystem_encoding()

    def _as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        if isinstance(arg, bytes):
            return arg.decode(fs_encoding)
        return arg

    return ' '.join([pipes.quote(_as_text(arg)) for arg in args])
9d4660ca
PH
1070
1071
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the element that stops
    the iteration (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1079
1080
9d4660ca
PH
def smuggle_url(url, data):
    """ Attach JSON-serialised data to a URL, as a fragment, for internal use. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + '#' + fragment
9d4660ca
PH
1087
1088
def unsmuggle_url(smug_url, default=None):
    """Split a smuggled URL into (url, data).

    Returns (smug_url, default) unchanged when the URL carries no
    '#__youtubedl_smuggle' fragment.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, fragment = smug_url.rpartition('#')
        encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return url, json.loads(encoded)
    return smug_url, default
02dbf93f
PH
1096
1097
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as a human-readable string with a binary suffix.

    Returns 'N/A' for None.  Strings are converted with float() first.
    The result has two decimals followed by one of B, KiB, ..., YiB.
    Negative values are not supported (math.log raises ValueError).
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the suffix table: values >= 1024 ** 9 used to raise
        # IndexError, and small fractions produced a negative index that
        # silently selected 'YiB'.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
f53c966a 1110
1c088fa8 1111
be64b5b0
PH
def parse_filesize(s):
    """Parse a size such as '5 GiB' or '1,5MB' into a number of bytes.

    Returns None when s is None or does not start with a number followed
    by a known unit.  Both '.' and ',' are accepted as decimal separator.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        _UNIT_TABLE[prefix + 'iB'] = 1024 ** power
        _UNIT_TABLE[prefix + 'B'] = 1000 ** power
        _UNIT_TABLE[prefix.lower() + 'B'] = 1024 ** power
        _UNIT_TABLE[prefix + 'b'] = 1000 ** power

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if m is None:
        return None

    number = float(m.group('num').replace(',', '.'))
    return int(number * _UNIT_TABLE[m.group('unit')])
be64b5b0
PH
1164
1165
def get_term_width():
    """Return the terminal width in columns, or None if it cannot be found.

    The COLUMNS environment variable is preferred; otherwise the output
    of `stty size` is parsed.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Malformed COLUMNS: fall back to asking the terminal

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except (OSError, ValueError, IndexError):
        # stty could not be run, or its output was empty or unparsable.
        # Deliberately narrower than a bare except so KeyboardInterrupt
        # and SystemExit still propagate.
        return None
caefb1de
PH
1180
1181
def month_by_name(name):
    """ Return the 1-based number of a month given its full English name
    (independent of the locale), or None if the name is not recognised """

    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    if name in ENGLISH_NAMES:
        return ENGLISH_NAMES.index(name) + 1
    return None
18258362
JMF
1192
1193
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start a known entity by '&amp;'"""
    # Predefined entities and numeric character references are left alone
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
e3946f98
PH
1200
1201
def setproctitle(title):
    """Best-effort attempt to set the process name through libc's prctl().

    Silently does nothing when libc.so.6 cannot be loaded or has no prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    encoded_title = title.encode('utf-8')
    name_buf = ctypes.create_string_buffer(len(encoded_title))
    name_buf.value = encoded_title
    PR_SET_NAME = 15  # prctl option number from <linux/prctl.h>
    try:
        libc.prctl(PR_SET_NAME, name_buf, 0, 0, 0)
    except AttributeError:
        pass  # Strange libc, just skip this
d7dda168
PH
1215
1216
def remove_start(s, start):
    """Return s without the prefix start, or s itself if it has no such prefix."""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1221
1222
2b9faf55
PH
def remove_end(s, end):
    """Return s without the suffix end, or s itself if it has no such suffix."""
    # An empty suffix always "matches", and s[:-0] is s[:0] == '', which
    # would wipe the whole string; require a non-empty suffix.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1227
1228
def url_basename(url):
    """Return the last path segment of url, ignoring slashes at either end."""
    url_path = compat_urlparse.urlparse(url).path
    segments = url_path.strip('/').split('/')
    return segments[-1]
aa94a6d3
PH
1232
1233
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always reported as HEAD."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
1237
1238
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, scaled by invscale / scale with floor division.

    When get_attr is given, the attribute of that name is read from v
    first.  None and the empty string yield default instead.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1246
9572013d 1247
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1250
9732d77e
PH
1251
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: ',', '.' and '+' are dropped
    before conversion """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1258
1259
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1262
1263
def parse_duration(s):
    """Parse a duration such as '1:02:03', '3 min' or 'PT5M30S' into seconds.

    Returns None when s is not a string or is not in a recognised format.
    The result is a float when the input is a bare minute/hour count or
    carries fractional seconds, and an int otherwise.
    """
    if not isinstance(s, basestring if sys.version_info < (3, 0) else compat_str):
        return None

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if not m:
        return None

    # A lone "N minutes" / "N hours" value may be fractional
    for group, seconds_per_unit in (('only_mins', 60), ('only_hours', 60 * 60)):
        if m.group(group):
            return float_or_none(m.group(group), invscale=seconds_per_unit)

    total = 0
    for group, seconds_per_unit in (('secs', 1), ('mins', 60), ('hours', 60 * 60)):
        if m.group(group):
            total += int(m.group(group)) * seconds_per_unit
    if m.group('ms'):
        total += float(m.group('ms'))
    return total
91d7d0b3
JMF
1298
1299
def prepend_extension(filename, ext):
    """Insert ext in front of the real extension of filename ('a.mp4' -> 'a.ext.mp4')."""
    stem, suffix = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, suffix)
d70ad093
PH
1303
1304
def check_executable(exe, args=[]):
    """ Run exe with args to find out whether it can be started.
    Returns exe if the process could be launched, False otherwise.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        proc = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1313
1314
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version reported by the specified executable,
    or False if the executable could not be started """
    try:
        proc = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out = proc.communicate()[0]
    except OSError:
        return False
    # The pipe delivers bytes; detect_exe_version wants text
    if isinstance(out, bytes):
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1328
1329
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output.

    Returns group 1 of the first match of version_re (by default the token
    following the word "version"), or unrecognized if nothing matches.
    """
    assert isinstance(output, compat_str)
    pattern = (
        r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re)
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1339
1340
class PagedList(object):
    """Base class for paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1345
9c44d242
PH
1346
class OnDemandPagedList(PagedList):
    """Paged list whose pages are fetched lazily through pagefunc(pagenum)."""

    def __init__(self, pagefunc, pagesize):
        # pagefunc is called with a 0-based page number and must return an
        # iterable of that page's entries; pagesize is the length of a
        # full page.
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with index in [start, end) as a list.

        Pages are requested one by one, beginning with the page that
        contains start, until end is reached or a short page is seen.
        With end=None everything from start on is returned.
        """
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            # Index range [firstid, nextfirstid) covered by this page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry inside this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset one past the last wanted entry, when end falls on this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1388
1389
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list for sources whose total number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        # pagefunc is called with a 0-based page number and must return an
        # iterable of that page's entries.
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with index in [start, end) as a list.

        With end=None everything from start up to the last page is
        returned.  Pages at or beyond pagecount are never requested.
        """
        res = []
        start_page = start // self._pagesize
        # Clamp to the known page count so an end past the last entry does
        # not trigger requests for pages that do not exist.
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page has leading entries to drop
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the requested range
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1417
1418
def uppercase_escape(s):
    """Replace every literal \\UXXXXXXXX escape in s by the character it names."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def _expand(match):
        # The decoder returns (text, consumed_length)
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
b53466e1 1425
d05cfe06
S
1426
def escape_rfc3986(s):
    """Percent-escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() wants a byte string
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, unicode)
    if needs_encoding:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1432
1433
def escape_url(url):
    """Escape the path, params, query and fragment of url as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    escaped_path = escape_rfc3986(parts.path)
    escaped_params = escape_rfc3986(parts.params)
    escaped_query = escape_rfc3986(parts.query)
    escaped_fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=escaped_path,
        params=escaped_params,
        query=escaped_query,
        fragment=escaped_fragment)
    return escaped.geturl()
1443
# Define struct_pack/struct_unpack: probe whether struct.pack accepts this
# module's (unicode) string literals as a format spec, and install wrappers
# that encode the spec first when it does not.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """struct.pack that also accepts a text format spec."""
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        """struct.unpack that also accepts a text format spec."""
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Text specs work natively; use the struct functions directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1460
1461
def read_batch_urls(batch_fd):
    """Read the URLs listed one per line in batch_fd and close it.

    Byte lines are decoded as UTF-8, a leading byte order mark and
    surrounding whitespace are removed, and empty lines as well as lines
    starting with '#', ';' or ']' are skipped.
    """
    # A UTF-8 BOM decodes to U+FEFF.  The three-character form shows up
    # when the BOM bytes were decoded one byte per character (for example
    # as latin-1) before reaching this function.
    BOMS = ('\ufeff', '\xef\xbb\xbf')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1476
1477
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode() and return ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1480
1481
0990305d
PH
# etree_iter(node): iterate over the elements of an ElementTree node.
# Element.iter is used where it exists; otherwise fall back to
# findall('.//*').
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
1486
1487
bcf89ce6
PH
def parse_xml(s):
    """Parse the XML text s and return its root element.

    DOCTYPE declarations are ignored.  On Python 2, byte-string text
    nodes are decoded to unicode.
    """
    etree = xml.etree.ElementTree

    class TreeBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=TreeBuilder())
    # The parser keyword is only passed on Python 2.7 and later
    extra = {}
    if sys.version_info >= (2, 7):
        extra['parser'] = parser
    root = etree.XML(s.encode('utf-8'), **extra)

    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(root):
            text = node.text
            if text is not None and not isinstance(text, compat_str):
                node.text = text.decode('utf-8')
    return root
e68301af
PH
1503
1504
a1a530b0
PH
# US rating labels mapped to the integer age limit that parse_age_limit
# returns for them
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
1512
1513
146c80e2
S
def parse_age_limit(s):
    """Parse an age limit such as '18' or '16+', or a US rating label.

    Returns the limit as an int, or None when s is None or unrecognised.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
146c80e2
S
1519
1520
def strip_jsonp(code):
    """Remove a JSONP wrapper 'callback(...);' and return the wrapped payload.

    Input that does not look like a JSONP call is returned unchanged.
    """
    jsonp_call = re.compile(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$')
    return jsonp_call.sub(r'\1', code)
478c2c61
PH
1524
1525
e05f6939
PH
def js_to_json(code):
    """Rewrite a JavaScript object literal so that it can be parsed as JSON.

    Bare identifiers and single-quoted strings become double-quoted
    strings, and a trailing comma before a closing bracket is removed.
    """
    # How escapes inside a single-quoted string look once double-quoted
    requote = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"", lambda esc: requote[esc.group(0)], token[1:-1])
        return '"%s"' % token

    quoted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), quoted)
1549
1550
478c2c61
PH
def qualities(quality_ids):
    """ Build a function that maps a quality id to its position in
    quality_ids, or to -1 for an unknown id """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1559
acd69589
PH
1560
# Default output template, using %-style mapping keys: title, id and ext
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1562
a020a0dc
PH
1563
def limit_length(s, length):
    """ Shorten s to at most length characters, ending in an ellipsis
    when something had to be cut off """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
1572
1573
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
48844745
PH
1576
1577
def is_outdated_version(version, limit, assume_new=True):
    """Return True if version is older than limit.

    A missing or unparsable version counts as outdated only when
    assume_new is false.
    """
    when_unknown = not assume_new
    if not version:
        return when_unknown
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return when_unknown
732ea2f0
PH
1585
1586
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U: true when running
    as a frozen executable or when loaded from a zip archive """
    from zipimport import zipimporter

    if hasattr(sys, 'frozen'):
        return True
    return isinstance(globals().get('__loader__'), zipimporter)
7d4111ed
PH
1592
1593
def args_to_str(args):
    """Get a short string representation for a subprocess command."""
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
2ccd1b10
PH
1597
1598
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the headers of an opened URL handle.

    The filename of an attachment Content-Disposition header is preferred.
    Otherwise the subtype of the Content-Type header is returned, without
    any parameters.  Returns None when neither header is usable.
    """
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    content_type = getheader('Content-Type')
    if not content_type:
        return None
    # Drop parameters such as "; charset=utf-8" so that they do not end up
    # in the extension
    mime_type = content_type.split(';')[0].strip()
    subtype = mime_type.partition('/')[2]
    return subtype or None
05900629
PH
1615
1616
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked, i.e. both limits
    are set and age_limit is below the content's limit """

    # No limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
1625
1626
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.
    Returns a match object if the text starts with '<' after optional
    whitespace, and None otherwise. """

    # Longer BOMs come before their prefixes (UTF-32-LE before UTF-16-LE)
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    no_bom = (b'', 'utf-8')
    bom, enc = next(
        (entry for entry in BOMS if first_bytes.startswith(entry[0])), no_bom)
    text = first_bytes[len(bom):].decode(enc, 'replace')

    return re.match(r'^\s*<', text)
1643
1644 return re.match(r'^\s*<', s)