]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
Use shutil.get_terminal_size for getting the terminal width if it's available (python...
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
912b38b4 6import calendar
676eb3f2 7import codecs
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96
PH
10import datetime
11import email.utils
f45c185f 12import errno
be4a824d 13import functools
d77c3dfd 14import gzip
b7ab0590 15import itertools
03f9daab 16import io
f4bfd65f 17import json
d77c3dfd 18import locale
02dbf93f 19import math
347de493 20import operator
d77c3dfd 21import os
4eb7f1d1 22import pipes
c496ca96 23import platform
d77c3dfd 24import re
13ebea79 25import ssl
c496ca96 26import socket
b53466e1 27import struct
1c088fa8 28import subprocess
d77c3dfd 29import sys
181c8655 30import tempfile
01951dda 31import traceback
bcf89ce6 32import xml.etree.ElementTree
d77c3dfd 33import zlib
d77c3dfd 34
8c25f81b 35from .compat import (
8f9312c3 36 compat_basestring,
8c25f81b 37 compat_chr,
8c25f81b 38 compat_html_entities,
be4a824d 39 compat_http_client,
8c25f81b 40 compat_parse_qs,
be4a824d 41 compat_socket_create_connection,
8c25f81b
PH
42 compat_str,
43 compat_urllib_error,
44 compat_urllib_parse,
45 compat_urllib_parse_urlparse,
46 compat_urllib_request,
47 compat_urlparse,
7d4111ed 48 shlex_quote,
8c25f81b 49)
4644ac55
S
50
51
468e2e92
FV
# The re module exposes no public name for the compiled-pattern type on
# every supported Python version, so take it from a real compiled pattern.
compiled_regex_type = type(re.compile(''))

# Default request headers; YoutubeDLHandler.http_request adds each one to a
# request unless the request already carries that header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# English month names in calendar order (index 0 is January).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
67
68
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the lookup
    fails, or the reported encoding cannot encode a plain ASCII string,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable encoding name raises here.
        'TEST'.encode(pref)
    except Exception:
        # Broad on purpose (any failure means "fall back"), but not a bare
        # except, so KeyboardInterrupt and SystemExit still propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 82
f4bfd65f 83
def write_json_file(obj, fn):
    """ Serialize obj as JSON into the file fn, atomically if possible """

    fn = encodeFilename(fn)
    needs_decoding = sys.version_info < (3, 0) and sys.platform != 'win32'
    if needs_decoding:
        encoding = get_filesystem_encoding()

        # os.path.basename / os.path.dirname return bytes objects here, but
        # NamedTemporaryFile fails on non-ASCII names unless it is handed
        # unicode objects, so decode their results.
        def path_basename(f):
            return os.path.basename(fn).decode(encoding)

        def path_dirname(f):
            return os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    tmp_kwargs = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        tmp_kwargs['mode'] = 'wb'
    else:
        tmp_kwargs['mode'] = 'w'
        tmp_kwargs['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**tmp_kwargs)

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except:
        # Whatever went wrong, try not to leave the temporary file behind,
        # then let the original exception propagate.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
136
137
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression below, so only
        # let through characters that cannot break out of the predicate.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Return the first element matching xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        candidates = (el for el in node.findall(xpath)
                      if el.attrib.get(key) == val)
        return next(candidates, None)
156
d7e66d39
JMF
157# On python2.6 the xml.etree.ElementTree.Element methods don't support
158# the namespace parameter
5f6a1245
JW
159
160
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path to '{uri}tag' using ns_map.

    Steps without a prefix are kept as they are. A prefix missing from
    ns_map raises KeyError.
    """
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([expand(step) for step in path.split('/')])
171
d77c3dfd 172
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element under node matching xpath.

    If there is no such element, or it has no text, None is returned,
    unless fatal is set, in which case ExtractorError is raised (the
    message uses name, or the xpath itself when name is None).
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None and found.text is not None:
        return found.text
    if not fatal:
        return None
    if name is None:
        name = xpath
    raise ExtractorError('Could not find XML element %s' % name)
185
186
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the passed HTML document"""
    return get_element_by_attribute('id', id, html)
190
12ea2f30 191
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # Verbose pattern: opening tag, any attributes, the wanted attribute
    # (value optionally quoted), any attributes, then the content up to
    # the first closing tag of the same name.
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    match = re.search(pattern, html)
    if not match:
        return None

    content = match.group('content')
    # Content that starts with a quote loses its first and last character.
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 213
9e6dd238
FV
214
def clean_html(html):
    """Turn an HTML snippet into a readable plain-text string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Source newlines carry no meaning; <br> and </p><p> do.
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Drop all remaining tags, then decode the entities
    text = re.sub('<.*?>', '', text)
    return unescapeHTML(text).strip()
9e6dd238
FV
230
231
def _replace_forbidden_path_chars(path):
    """Replace win32-forbidden characters in every component of path with '#'.

    The drive prefix (if any) and the path separators are kept, so the
    directory structure of the result is the same as that of path.
    """
    drive, rest = os.path.splitdrive(path)
    separators = os.sep + (os.altsep or '')
    # Splitting on a capturing group keeps the separators: even indices
    # are path components, odd indices are the separators between them.
    pieces = re.split('([%s])' % re.escape(separators), rest)
    pieces[::2] = [
        re.sub('[/<>:"\\|\\\\?\\*]', '#', component)
        for component in pieces[::2]]
    return drive + ''.join(pieces)


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly (replacing characters that are forbidden on
    win32 with '#') and opens that instead. If the tweak changes nothing,
    or the error was EACCES, the original exception is re-raised, like the
    standard open() function would raise it.

    A filename of '-' stands for standard output (its binary buffer when
    there is one).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = _replace_forbidden_path_chars(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
265
266
def timeconvert(timestr):
    """Convert an RFC 2822 time string into a system timestamp (None if unparsable)"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 274
5f6a1245 275
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped altogether
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (
                char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join([replace_insane(char) for char in s])
    if not is_id:
        # Collapse every run of underscores into a single one
        result = re.sub(r'_{2,}', '_', result)
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        if not result:
            result = '_'
    return result
d77c3dfd 311
5f6a1245 312
def orderedSet(iterable):
    """ Return a list of the items of iterable without duplicates, in first-seen order """
    unique = []
    for item in iterable:
        # List membership (not a set) so unhashable items work too
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 320
912b38b4 321
4e408e47
PH
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities are looked up
    in compat_html_entities.name2codepoint; numeric references may be
    decimal ('#38') or hexadecimal ('#x26'). Anything that cannot be
    resolved is returned in its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hex references need the full hex digit set; matching only [0-9]
    # would cut '#x2F' down to '#x2' and yield the wrong character.
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Not a valid code point: fall through to the literal form
            pass

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
4e408e47
PH
340
341
def unescapeHTML(s):
    """Replace every '&...;' entity in s by its character (None is passed through)"""
    if s is None:
        return None
    assert type(s) == compat_str

    def _decode(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', _decode, s)
d77c3dfd 349
8bf48f23
PH
350
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up offer Unicode APIs.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        # Pass the unicode string directly to use the Unicode APIs
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
377
f07b74fc
PH
378
def encodeArgument(s):
    """Encode s with encodeFilename(s, True), i.e. for use in a subprocess call"""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)

    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
386
387
8271226a
PH
def decodeOption(optval):
    """Return optval as a compat_str, decoding bytes with the preferred encoding (None is passed through)"""
    if optval is None:
        return optval

    decoded = optval
    if isinstance(decoded, bytes):
        decoded = decoded.decode(preferredencoding())

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 396
5f6a1245 397
4539dd30
PH
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that fits is used: hours appear from 3600 seconds
    on, minutes from 60 seconds on (so 60 is '1:00', not '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
405
a0ddb8a2 406
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler for params.

    Certificate verification is switched off when params has a true
    'nocheckcertificate' entry. Extra keyword arguments are passed on to
    the handler.
    """
    skip_verification = params.get('nocheckcertificate', False)

    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 430
732ea2f0 431
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # An error raised while one of these exceptions is being handled
        # is treated as expected as well.
        expected_causes = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_causes:
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors get instructions for reporting a bug
            if ytdl_is_updateable():
                update_cmd = 'type youtube-dl -U to update'
            else:
                update_cmd = 'see https://yt-dl.org/update on how to update'
            msg += ''.join([
                '; please report this issue on https://yt-dl.org/bug .',
                ' Make sure you are using the latest version; %s.' % update_cmd,
                ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
            ])
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if there is none"""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
01951dda 465
1c256f70 466
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected extraction error for an unsupported URL; the URL is kept in .url"""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
472
473
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Extraction error for a regular expression that did not match"""
477
478
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
491
492
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
500
501
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to indicate
    an error in the postprocessing task. The message is kept in .msg.
    """

    def __init__(self, msg):
        self.msg = msg
d77c3dfd 511
5f6a1245 512
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
d77c3dfd
FV
516
517
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
525
526
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Sizes, both in bytes; overwritten per instance by __init__
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 541
5f6a1245 542
c5a59d93 543def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
be4a824d
PH
544 hc = http_class(*args, **kwargs)
545 source_address = ydl_handler._params.get('source_address')
546 if source_address is not None:
547 sa = (source_address, 0)
548 if hasattr(hc, 'source_address'): # Python 2.7+
549 hc.source_address = sa
550 else: # Python 2.6
551 def _hc_connect(self, *args, **kwargs):
552 sock = compat_socket_create_connection(
553 (self.host, self.port), self.timeout, sa)
554 if is_https:
d7932313
PH
555 self.sock = ssl.wrap_socket(
556 sock, self.key_file, self.cert_file,
557 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
558 else:
559 self.sock = sock
560 hc.connect = functools.partial(_hc_connect, hc)
561
562 return hc
563
564
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        # Connections are created through _create_http_connection so that
        # the 'source_address' parameter is honoured.
        connection_factory = functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False)
        return self.do_open(connection_factory, req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data: raw stream first, zlib-wrapped stream as fallback"""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response, setting code by hand where the constructor does not take it"""
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        ret = addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        for header, value in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if header.capitalize() not in req.headers:
                req.add_header(header, value)
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off.
                for trim in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-trim]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same request/response processing
    https_request = http_request
    https_response = http_response
bf50b038 656
5de90176 657
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that creates its connections through _create_http_connection"""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        if not https_conn_class:
            https_conn_class = compat_http_client.HTTPSConnection
        self._https_conn_class = https_conn_class
        self._params = params

    def https_open(self, req):
        # Pass on whichever SSL settings the base handler stored on self
        open_kwargs = {}
        if hasattr(self, '_context'):  # python > 2.6
            open_kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            open_kwargs['check_hostname'] = self._check_hostname
        connection_factory = functools.partial(
            _create_http_connection, self, self._https_conn_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
be4a824d
PH
673
674
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    date_str has the form YYYY-MM-DD<delimiter>HH:MM:SS, optionally followed
    by fractional seconds (which are ignored) and by 'Z' or a +HH:MM / -HHMM
    style UTC offset. timezone, if given, is a datetime.timedelta that is
    used as the UTC offset instead of looking for one in date_str; without
    either, UTC is assumed.

    Returns None if date_str is None; raises ValueError if the string does
    not have the expected form.
    """

    if date_str is None:
        return None

    if timezone is None:
        m = re.search(
            r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if not m:
            timezone = datetime.timedelta()
        else:
            # Cut the matched fraction/designator off the end
            date_str = date_str[:-len(m.group(0))]
            if not m.group('sign'):
                timezone = datetime.timedelta()
            else:
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))
    # Fractional seconds that were not followed by a timezone designator
    # (or that come with an explicit timezone argument) are still present
    # here; strptime below cannot parse them, so drop them as well.
    date_str = re.sub(r'\.[0-9]+$', '', date_str)
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
699
700
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    day_first decides whether purely numeric dates such as 01/02/2014 are
    read as day/month or as month/day. Returns None if date_str is None or
    cannot be parsed.
    """

    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    cleaned = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', cleaned)
    # Remove AM/PM + timezone
    cleaned = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', cleaned)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions += [
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ]
    else:
        format_expressions += [
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ]

    upload_date = None
    # Every format is tried; the last one that parses determines the result
    for expression in format_expressions:
        try:
            parsed = datetime.datetime.strptime(cleaned, expression)
            upload_date = parsed.strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Last resort: the email date parser
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
761
5f6a1245 762
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url.

    The guess is whatever follows the last '.' before the query string; it
    is returned only if it is purely alphanumeric, otherwise (or when url
    is None) default_ext is returned.
    """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    guess = without_query.rpartition('.')[2]
    return guess if re.match(r'^[A-Za-z0-9]+$', guess) else default_ext
73e79f2a 771
5f6a1245 772
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '<sub_lang>.<sub_format>'"""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join([stem, sub_lang, sub_format])
d4051a8e 775
5f6a1245 776
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    'now', 'today' and 'yesterday' on their own are accepted as well.
    Months count as 30 days and years as 365 days. A string in none of
    these formats raises ValueError."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword arguments (days, weeks)
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
5f6a1245
JW
804
805
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format
    (any other string is returned unchanged)"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    return '-'.join(match.groups())
814
5f6a1245 815
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str;
        a missing bound leaves the range open on that side"""
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not a date object is parsed as a date string
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
c496ca96
PH
845
846
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # A bytes result is decoded with the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
855
856
b58ddb32
PH
857def _windows_write_string(s, out):
858 """ Returns True if the string was written using special methods,
859 False if it has yet to be written out."""
860 # Adapted from http://stackoverflow.com/a/3259271/35070
861
862 import ctypes
863 import ctypes.wintypes
864
865 WIN_OUTPUT_IDS = {
866 1: -11,
867 2: -12,
868 }
869
a383a98a
PH
870 try:
871 fileno = out.fileno()
872 except AttributeError:
873 # If the output stream doesn't have a fileno, it's virtual
874 return False
aa42e873
PH
875 except io.UnsupportedOperation:
876 # Some strange Windows pseudo files?
877 return False
b58ddb32
PH
878 if fileno not in WIN_OUTPUT_IDS:
879 return False
880
e2f89ec7 881 GetStdHandle = ctypes.WINFUNCTYPE(
b58ddb32 882 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
6ac4e806 883 (b"GetStdHandle", ctypes.windll.kernel32))
b58ddb32
PH
884 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
885
e2f89ec7 886 WriteConsoleW = ctypes.WINFUNCTYPE(
b58ddb32
PH
887 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
888 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
6ac4e806 889 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
b58ddb32
PH
890 written = ctypes.wintypes.DWORD(0)
891
6ac4e806 892 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
b58ddb32
PH
893 FILE_TYPE_CHAR = 0x0002
894 FILE_TYPE_REMOTE = 0x8000
e2f89ec7 895 GetConsoleMode = ctypes.WINFUNCTYPE(
b58ddb32
PH
896 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
897 ctypes.POINTER(ctypes.wintypes.DWORD))(
6ac4e806 898 (b"GetConsoleMode", ctypes.windll.kernel32))
b58ddb32
PH
899 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
900
901 def not_a_console(handle):
902 if handle == INVALID_HANDLE_VALUE or handle is None:
903 return True
8fb3ac36
PH
904 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
905 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
b58ddb32
PH
906
907 if not_a_console(h):
908 return False
909
d1b9c912
PH
910 def next_nonbmp_pos(s):
911 try:
912 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
913 except StopIteration:
914 return len(s)
915
916 while s:
917 count = min(next_nonbmp_pos(s), 1024)
918
b58ddb32 919 ret = WriteConsoleW(
d1b9c912 920 h, s, count if count else 2, ctypes.byref(written), None)
b58ddb32
PH
921 if ret == 0:
922 raise OSError('Failed to write string')
d1b9c912
PH
923 if not count: # We just wrote a non-BMP character
924 assert written.value == 2
925 s = s[1:]
926 else:
927 assert written.value > 0
928 s = s[written.value:]
b58ddb32
PH
929 return True
930
931
def write_string(s, out=None, encoding=None):
    """Write the text ``s`` to ``out`` (``sys.stderr`` by default) and flush.

    On Windows, when no explicit encoding is requested, the write is first
    attempted through ``_windows_write_string``.  Otherwise the text is
    encoded (dropping unencodable characters) for byte-oriented streams, or
    written as-is for streams without a ``buffer`` attribute.
    """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    try_console = (
        sys.platform == 'win32' and encoding is None and
        hasattr(stream, 'fileno'))
    if try_console and _windows_write_string(s, stream):
        return

    if 'b' in getattr(stream, 'mode', '') or sys.version_info[0] < 3:
        # Python 2 lies about mode of sys.stderr
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        target_enc = (
            encoding or getattr(stream, 'encoding', None) or
            preferredencoding())
        stream.buffer.write(s.encode(target_enc, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
952
953
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the byte values of ``bs`` as a list of integers."""
    if not bs:
        return []
    # Indexing yields ints on Python 3 but 1-character strings on Python 2
    return list(bs) if isinstance(bs[0], int) else [ord(ch) for ch in bs]
961
c257baff 962
def intlist_to_bytes(xs):
    """Pack a sequence of integers into a byte string, one unsigned byte each."""
    return struct_pack('%dB' % len(xs), *xs) if xs else b''
c38b1e77
PH
967
968
c1c9a79c
PH
# Cross-platform file locking: both branches define _lock_file(f, exclusive)
# and _unlock_file(f) with the same call signature.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure handed to LockFileEx/UnlockFileEx below
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32

    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL

    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL

    # Low/high halves of the byte count passed as the range to (un)lock
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock ``f``; raises OSError when LockFileEx reports failure."""
        ov = OVERLAPPED()
        ov.Offset = 0
        ov.OffsetHigh = 0
        ov.hEvent = 0
        # Kept on the file object so that _unlock_file can pass it again
        f._lock_file_overlapped_p = ctypes.pointer(ov)
        os_handle = msvcrt.get_osfhandle(f.fileno())
        flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(
            os_handle, flags, 0, whole_low, whole_high,
            f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        os_handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(
            os_handle, 0, whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared flock() lock on ``f``."""
        operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.flock(f, operation)

    def _unlock_file(f):
        """Release the flock() lock held on ``f``."""
        fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
1032
1033
class locked_file(object):
    """File object wrapper that holds a file lock while used in a ``with`` block.

    The file is opened in ``__init__`` (mode must be 'r', 'a' or 'w').
    Entering the context takes a shared lock for 'r' and an exclusive lock
    otherwise; leaving it unlocks and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows _lock_file raises OSError, which is not an IOError
            # subclass on Python 2; catch both so the file never leaks.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even when unlocking fails
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1063
1064
4644ac55
S
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1068
1069
def shell_quote(args):
    """Return ``args`` joined into a single shell-escaped command string.

    Byte-string arguments are decoded with the filesystem encoding before
    being quoted.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # shlex_quote (already imported from .compat) replaces pipes.quote:
        # the pipes module is deprecated and removed in Python 3.13.
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1079
1080
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """Like itertools.takewhile, but also yield the first element ``e``
    for which ``pred(e)`` is false, then stop."""
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1088
1089
9d4660ca
PH
def smuggle_url(url, data):
    """Pass additional data in a URL for internal use.

    ``data`` is JSON-encoded, URL-encoded under the ``__youtubedl_smuggle``
    key and appended to ``url`` after a '#'.
    """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return '#'.join((url, fragment))
9d4660ca
PH
1096
1097
def unsmuggle_url(smug_url, default=None):
    """Split a URL built by smuggle_url into ``(url, data)``.

    Returns ``(smug_url, default)`` when the URL carries no smuggled data.
    """
    if '#__youtubedl_smuggle' in smug_url:
        plain_url, _, fragment = smug_url.rpartition('#')
        encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return plain_url, json.loads(encoded)
    return smug_url, default
02dbf93f
PH
1105
1106
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count with a binary (1024-based) unit suffix.

    Returns 'N/A' for None; otherwise a string with two decimals such as
    '1.50KiB'.  A ``str`` argument is converted with float() first.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp into the suffix table: tiny fractions would otherwise give a
        # negative index (picking 'YiB') and values of 1024 YiB or more
        # would raise IndexError.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
f53c966a 1119
1c088fa8 1120
be64b5b0
PH
def parse_filesize(s):
    """Parse a size such as '5 MiB' or '1,5kB' into an integer byte count.

    Returns None when ``s`` is None or does not start with a number
    followed by a known unit.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary, decimal = 1024 ** power, 1000 ** power
        _UNIT_TABLE[prefix + 'iB'] = binary
        _UNIT_TABLE[prefix + 'B'] = decimal
        _UNIT_TABLE[prefix.lower() + 'B'] = binary
        _UNIT_TABLE[prefix + 'b'] = decimal

    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % '|'.join(
        re.escape(unit) for unit in _UNIT_TABLE)
    m = re.match(pattern, s)
    if m is None:
        return None

    # A comma is accepted as the decimal separator
    number = float(m.group('num').replace(',', '.'))
    return int(number * _UNIT_TABLE[m.group('unit')])
be64b5b0
PH
1173
1174
caefb1de
PH
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name,
    or None when the name is not in ENGLISH_MONTH_NAMES """
    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1182
1183
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations (the first three letters of the month name), or None """
    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
18258362
JMF
1192
1193
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in ``xml_str`` by '&amp;'.

    Ampersands that already start one of the predefined entities (amp, lt,
    gt, apos, quot) or a numeric character reference are left untouched.
    """
    # Numeric references take any positive number of digits: the former
    # {,4} bound mangled valid references such as '&#x1F600;' and accepted
    # the empty '&#;'.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]+;|#[0-9]+;)',
        '&amp;',
        xml_str)
e3946f98
PH
1200
1201
def setproctitle(title):
    """Set the process name through libc's prctl(), if available.

    Does nothing when "libc.so.6" cannot be loaded or lacks ``prctl``.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Initialising the buffer from the bytes allocates len + 1 bytes, so the
    # name is NUL-terminated.  A buffer sized exactly len(title_bytes) has no
    # terminator and lets prctl read past its end.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1215
1216
def remove_start(s, start):
    """Return ``s`` without the prefix ``start`` (unchanged if absent)."""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1221
1222
2b9faf55
PH
def remove_end(s, end):
    """Return ``s`` without the suffix ``end`` (unchanged if absent)."""
    # An empty suffix must be special-cased: s[:-0] is s[:0], i.e. ''.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1227
1228
def url_basename(url):
    """Return the last '/'-separated component of the URL's path."""
    url_path = compat_urlparse.urlparse(url).path
    return url_path.strip('/').rpartition('/')[2]
aa94a6d3
PH
1232
1233
class HEADRequest(compat_urllib_request.Request):
    """Request whose get_method() always reports HEAD."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
1237
1238
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Return ``int(v) * invscale // scale``, or ``default`` when ``v`` is
    None or ''.  With ``get_attr``, that attribute of ``v`` is used as the
    value (None if it is missing)."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '' or v is None:
        return default
    return int(v) * invscale // scale
1246
9572013d 1247
40a90862
JMF
def str_or_none(v, default=None):
    """Return ``v`` converted with compat_str, or ``default`` if it is None."""
    if v is None:
        return default
    return compat_str(v)
1250
9732d77e
PH
1251
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    are dropped before conversion """
    return None if int_str is None else int(re.sub(r'[,\.\+]', '', int_str))
608d11f5
PH
1258
1259
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return ``float(v) * invscale / scale``, or ``default`` if ``v`` is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1262
1263
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Returns None for non-string input or when the (stripped) string does not
    match the duration pattern below.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    duration_rex = r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$'''
    m = re.match(duration_rex, s)
    if not m:
        return None

    # A lone (possibly fractional) minute or hour count short-circuits
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    seconds_per_group = (
        ('secs', 1),
        ('mins_reversed', 60),
        ('mins', 60),
        ('hours', 60 * 60),
        ('hours_reversed', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    res = 0
    for group_name, factor in seconds_per_group:
        if m.group(group_name):
            res += int(m.group(group_name)) * factor
    # The fractional part (including its leading dot) is added last
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
91d7d0b3
JMF
1308
1309
def prepend_extension(filename, ext):
    """Insert ``ext`` in front of the file's real extension,
    e.g. ('a.mp4', 'temp') gives 'a.temp.mp4'."""
    stem, original_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, original_ext)
d70ad093
PH
1313
1314
def check_executable(exe, args=[]):
    """ Checks if the given binary can be started, and returns its name
    (or False when starting it raises OSError).
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        proc = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1323
1324
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        proc = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out = proc.communicate()[0]
    except OSError:
        return False
    # The captured output is a byte string; detect_exe_version needs text
    if isinstance(out, bytes):
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1338
1339
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version from a program's output.

    Returns group 1 of the first ``version_re`` match (by default the token
    following the word "version"), or ``unrecognized`` when nothing matches.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    m = re.search(pattern, output)
    return m.group(1) if m else unrecognized
1349
1350
class PagedList(object):
    """Base class for paged lists; relies on a ``getslice`` method."""

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1355
9c44d242
PH
1356
class OnDemandPagedList(PagedList):
    """Paged list that calls ``pagefunc(pagenum)`` page by page until a
    short page or the requested end is reached."""

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        size = self._pagesize
        collected = []
        for page_index in itertools.count(start // size):
            page_first = page_index * size
            page_end = page_first + size
            if start >= page_end:
                continue

            items = list(self._pagefunc(page_index))

            # Offsets inside this page when the slice starts or ends in it
            lo = start % size if page_first <= start < page_end else 0
            if end is not None and page_first <= end <= page_end:
                hi = ((end - 1) % size) + 1
            else:
                hi = None

            if lo != 0 or hi is not None:
                items = items[lo:hi]
            collected.extend(items)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(items) + lo < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == page_end:
                break
        return collected
81c2f20b
PH
1398
1399
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list whose number of pages (``pagecount``) is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list, fetching only the
        pages that overlap the range."""
        res = []
        start_page = start // self._pagesize
        # Never request a page past the known page count, even when ``end``
        # lies beyond the last entry.
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                # Only the first fetched page needs its head trimmed
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the requested range
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1427
1428
def uppercase_escape(s):
    r"""Replace every \UXXXXXXXX escape (8 hex digits) in ``s`` by the
    character it denotes."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def _decode(match):
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _decode, s)
b53466e1 1435
d05cfe06
S
1436
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 the text is UTF-8 encoded before it is passed to quote()
    must_encode = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if must_encode:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1442
1443
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Only the path, params, query and fragment components are escaped."""
    parts = compat_urllib_parse_urlparse(url)
    escaped_path = escape_rfc3986(parts.path)
    escaped_params = escape_rfc3986(parts.params)
    escaped_query = escape_rfc3986(parts.query)
    escaped_fragment = escape_rfc3986(parts.fragment)
    escaped_parts = parts._replace(
        path=escaped_path,
        params=escaped_params,
        query=escaped_query,
        fragment=escaped_fragment)
    return escaped_parts.geturl()
1453
# struct_pack/struct_unpack: struct.pack/struct.unpack that accept a text
# format spec on every supported Python version.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1470
1471
def read_batch_urls(batch_fd):
    """Read URLs from the file-like ``batch_fd`` (one per line) and close it.

    Lines may be bytes (decoded as UTF-8) or text.  A leading byte order
    mark and surrounding whitespace are removed; empty lines and lines
    starting with '#', ';' or ']' are skipped.
    """
    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM decodes to U+FEFF; the three-character form only
        # appears when the BOM bytes were mis-decoded as Latin-1.  The
        # former check covered just the latter, so real BOMs were kept.
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1486
1487
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1490
1491
0990305d
PH
# etree_iter(node): iterate over an element tree, with a findall-based
# fallback for ElementTree versions lacking Element.iter.
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1496
1497
bcf89ce6
PH
def parse_xml(s):
    """Parse the text ``s`` as XML, ignoring doctype declarations, and
    return the root element."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # The custom parser is only passed on Python 2.7 and newer
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    if sys.version_info >= (3, 0):
        return tree
    # Fix up XML parser in Python 2.x: make every text node a text string
    for n in etree_iter(tree):
        if n.text is not None and not isinstance(n.text, compat_str):
            n.text = n.text.decode('utf-8')
    return tree
e68301af
PH
1513
1514
a1a530b0
PH
# Rating label -> number, used by parse_age_limit as a fallback lookup
US_RATINGS = dict([
    ('G', 0),
    ('PG', 10),
    ('PG-13', 13),
    ('R', 16),
    ('NC', 18),
])
fac55558
PH
1522
1523
146c80e2
S
def parse_age_limit(s):
    """Parse an age limit such as '18' or '16+' into an int; other strings
    are looked up in US_RATINGS.  Returns None for None or unknown input."""
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
146c80e2
S
1529
1530
def strip_jsonp(code):
    """Remove a JSONP wrapper ``callback(...);`` (optionally followed by
    ``//`` comments) and return the wrapped payload; input that does not
    match is returned unchanged."""
    jsonp_rex = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_rex, r'\1', code)
478c2c61
PH
1534
1535
e05f6939
PH
def js_to_json(code):
    """Convert JavaScript object-literal code to JSON text.

    Single-quoted strings and bare identifiers become double-quoted strings
    (``true``, ``false`` and ``null`` are kept), and a trailing comma before
    a closing ']' is dropped.
    """
    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            # Re-escape the body for use inside double quotes
            replacements = {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: replacements[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), res)
1559
1560
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values:
    the returned function maps an id to its index, or -1 if unknown """
    def q(qid):
        return quality_ids.index(qid) if qid in quality_ids else -1
    return q
1569
acd69589
PH
1570
# %-style template built from the 'title', 'id' and 'ext' fields
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1572
a020a0dc
PH
1573
def limit_length(s, length):
    """ Add ellipses to overly long strings so that the result has at most
    ``length`` characters; None is passed through """
    ELLIPSES = '...'
    if s is None or len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
1582
1583
def version_tuple(v):
    """Split a version string on '-' and '.' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
48844745
PH
1586
1587
def is_outdated_version(version, limit, assume_new=True):
    """Return whether ``version`` is older than ``limit``.

    A missing or unparsable version counts as outdated only when
    ``assume_new`` is false.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
732ea2f0
PH
1595
1596
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U, i.e. this module was
    loaded by a zipimporter or ``sys.frozen`` is set """
    from zipimport import zipimporter

    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return loaded_from_zip or hasattr(sys, 'frozen')
7d4111ed
PH
1602
1603
def args_to_str(args):
    """Get a short string representation for a subprocess command."""
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
2ccd1b10
PH
1607
1608
c460bdd5
PH
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    The subtype (the part after the last '/') is used as the extension,
    except for a few subtypes with a dedicated mapping.  Returns None when
    ``mt`` is None, e.g. when a response has no Content-Type header.
    """
    if mt is None:
        return None

    _, _, res = mt.rpartition('/')

    return {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
    }.get(res, res)
1616
1617
2ccd1b10
PH
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a response's headers.

    The filename of an ``attachment`` Content-Disposition wins; otherwise
    the Content-Type is mapped with mimetype2ext.
    """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(h):
            return url_handle.headers[h]

    cd = getheader('Content-Disposition')
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd) if cd else None
    if m:
        e = determine_ext(m.group('filename'), default_ext=None)
        if e:
            return e

    return mimetype2ext(getheader('Content-Type'))
05900629
PH
1634
1635
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
1644
1645
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Returns a match object (truthy) when the decoded text starts with
    optional whitespace followed by '<', else None. """

    # Longer BOMs come first so that UTF-32 is not mistaken for UTF-16
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    encoding, skip = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, skip = enc, len(bom)
            break
    s = first_bytes[skip:].decode(encoding, 'replace')

    return re.match(r'^\s*<', s)
a055469f
PH
1664
1665
def determine_protocol(info_dict):
    """Return the protocol of a format dict.

    An explicit 'protocol' entry wins; otherwise it is derived from the
    start of the 'url', then from its extension, then from its scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
1686
1687
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values

    Every column but the last is left-aligned and padded to the width of
    its longest cell plus one. """
    rows = [header_row] + data
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*rows)]
    column_formats = ['%-' + compat_str(w + 1) + 's' for w in widths[:-1]]
    row_format = ' '.join(column_formats) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
347de493
PH
1694
1695
def _match_one(filter_part, dct):
    """Evaluate one filter expression against ``dct``.

    Supported forms are ``key OP value`` (OP one of <, <=, >, >=, =, !=,
    optionally followed by '?'), ``key`` (value present) and ``!key``
    (value missing).  Raises ValueError for anything else.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op_name = m.group('op')
        op = COMPARISON_OPERATORS[op_name]
        strval = m.group('strval')
        if strval is not None:
            if op_name not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % op_name)
            comparison_value = strval
        else:
            intval = m.group('intval')
            try:
                comparison_value = int(intval)
            except ValueError:
                # Not a plain integer: try it as a file size, then again
                # with a 'B' appended
                comparison_value = parse_filesize(intval)
                if comparison_value is None:
                    comparison_value = parse_filesize(intval + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            intval, filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # Truthy only when the operator was followed by '?'
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        unary_op = UNARY_OPERATORS[m.group('op')]
        return unary_op(dct.get(m.group('key')))

    raise ValueError('Invalid filter part %r' % filter_part)
1753
1754
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false

    The '&'-separated parts must all match. """
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
1760
1761
def match_filter_func(filter_str):
    """Build a function that returns None for an info dict passing
    ``filter_str`` and a skip message otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
1769 return _match_func