]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[rtve] PEP8
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
912b38b4 6import calendar
676eb3f2 7import codecs
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96
PH
10import datetime
11import email.utils
f45c185f 12import errno
be4a824d 13import functools
d77c3dfd 14import gzip
b7ab0590 15import itertools
03f9daab 16import io
f4bfd65f 17import json
d77c3dfd 18import locale
02dbf93f 19import math
347de493 20import operator
d77c3dfd 21import os
4eb7f1d1 22import pipes
c496ca96 23import platform
d77c3dfd 24import re
13ebea79 25import ssl
c496ca96 26import socket
b53466e1 27import struct
1c088fa8 28import subprocess
d77c3dfd 29import sys
181c8655 30import tempfile
01951dda 31import traceback
bcf89ce6 32import xml.etree.ElementTree
d77c3dfd 33import zlib
d77c3dfd 34
8c25f81b 35from .compat import (
8f9312c3 36 compat_basestring,
8c25f81b
PH
37 compat_chr,
38 compat_getenv,
39 compat_html_entities,
be4a824d 40 compat_http_client,
8c25f81b 41 compat_parse_qs,
be4a824d 42 compat_socket_create_connection,
8c25f81b
PH
43 compat_str,
44 compat_urllib_error,
45 compat_urllib_parse,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
48 compat_urlparse,
7d4111ed 49 shlex_quote,
8c25f81b 50)
4644ac55
S
51
52
468e2e92
FV
# This is not clearly defined otherwise: the type object of a compiled
# regular expression, obtained by compiling an empty pattern.
compiled_regex_type = type(re.compile(''))

# Default HTTP headers. YoutubeDLHandler.http_request adds each of these to
# an outgoing request unless the request already carries that header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# English month names in calendar order (index 0 is January).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
68
69
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or the reported codec cannot actually encode text,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec; an unknown or broken codec name raises here
        'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (locale.Error, LookupError, ...), but unlike a
        # bare except it lets KeyboardInterrupt/SystemExit propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 83
f4bfd65f 84
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is first written to a temporary file created next to fn and
    then renamed over fn. The temporary file is removed if anything fails.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()

        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        def path_basename(f):
            return os.path.basename(fn).decode(encoding)

        # the same for os.path.dirname
        def path_dirname(f):
            return os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    tmp_options = dict(
        suffix='.tmp',
        prefix=path_basename(fn) + '.',
        dir=path_dirname(fn),
        delete=False,
    )

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        tmp_options['mode'] = 'wb'
    else:
        tmp_options['mode'] = 'w'
        tmp_options['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**tmp_options)

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except:
        # Best-effort cleanup of the temporary file, then re-raise whatever
        # went wrong (including KeyboardInterrupt).
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
137
138
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the XPath expression, so only
        # a conservative character set is accepted.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        candidates = (
            el for el in node.findall(xpath) if el.attrib.get(key) == val)
        return next(candidates, None)
157
d7e66d39
JMF
158# On python2.6 the xml.etree.ElementTree.Element methods don't support
159# the namespace parameter
5f6a1245
JW
160
161
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' path components to '{namespace}tag' using ns_map.

    Components without a prefix are kept unchanged.
    """
    def expand(component):
        pieces = component.split(':')
        if len(pieces) == 1:
            return pieces[0]
        ns, tag = pieces
        return '{%s}%s' % (ns_map[ns], tag)

    return '/'.join(expand(component) for component in path.split('/'))
172
d77c3dfd 173
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath under node.

    If the element is missing or has no text, return None, or raise
    ExtractorError when fatal is set (name, defaulting to the xpath, is used
    in the error message).
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None and found.text is not None:
        return found.text

    if not fatal:
        return None
    if name is None:
        name = xpath
    raise ExtractorError('Could not find XML element %s' % name)
186
187
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An ID lookup is just an attribute lookup on the "id" attribute
    return get_element_by_attribute('id', id, html)
191
12ea2f30 192
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # attribute and value are escaped before being placed into the pattern
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))
    found = re.search(pattern, html)
    if not found:
        return None

    content = found.group('content')
    # Content that begins with a quote has its first and last characters dropped
    if content.startswith('"') or content.startswith("'"):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 214
9e6dd238
FV
215
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Literal newlines carry no meaning in HTML; <br> and paragraph
    # boundaries become the real line breaks.
    html = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        html = re.sub(pattern, replacement, html)
    # Replace html entities
    return unescapeHTML(html).strip()
9e6dd238
FV
231
232
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly (replacing characters that are forbidden on win32
    with '#') and opens that instead; if the filename cannot be altered or
    the second attempt fails too, the exception propagates like it would
    from the standard open() function. Permission errors (EACCES) are
    re-raised immediately.

    The special filename '-' selects standard output (binary if available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # os.path.join needs the parts as separate arguments; handing it a
        # single generator never produced a usable path.
        alt_filename = os.path.join(*[
            re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
            for path_part in os.path.split(filename)])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller.
            # Open the sanitized name, which is also the name reported back.
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
266
267
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Not a parseable RFC 2822 date
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 275
5f6a1245 276
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped entirely
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(char) for char in s)
    if is_id:
        # IDs are kept as close to the original as possible
        return result

    # Collapse runs of underscores and trim them from both ends
    result = re.sub(r'_+', '_', result).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result or '_'
d77c3dfd 310
5f6a1245 311
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list that keeps the first occurrence of every element in its
    original order. Elements are compared with ==, so they do not need to
    be hashable.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 319
912b38b4 320
4e408e47
PH
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities are looked up in
    compat_html_entities.name2codepoint; numeric references may be decimal
    ('#38') or hexadecimal ('#x26'). Anything that cannot be decoded is
    returned in its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # The whole entity must be a number; hexadecimal references may use the
    # digits a-f (e.g. '#x2F'), which a decimal-only pattern would truncate.
    mobj = re.match(r'#([xX][0-9a-fA-F]+|[0-9]+)\Z', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in 'xX':
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside the valid range: keep the literal text
            pass

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
4e408e47
PH
339
340
def unescapeHTML(s):
    """Replace every '&...;' entity in s with the character it stands for.

    None is passed through unchanged; any other input must be a compat_str.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def decode_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', decode_entity, s)
d77c3dfd 348
8bf48f23
PH
349
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Whether the result is meant for a subprocess call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows:
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if not for_subprocess:
            return s
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()

    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
376
f07b74fc
PH
377
def encodeArgument(s):
    """Encode a command line argument the way encodeFilename does for subprocess calls."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)

    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
385
386
8271226a
PH
def decodeOption(optval):
    """Return optval as a compat_str, decoding bytes with the preferred encoding.

    None is passed through unchanged.
    """
    if optval is None:
        return optval

    decoded = optval
    if isinstance(decoded, bytes):
        decoded = decoded.decode(preferredencoding())

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 395
5f6a1245 396
4539dd30
PH
def formatSeconds(secs):
    """Format a number of seconds as 'S', 'M:SS' or 'H:MM:SS'.

    The shortest form that can represent the value is used, so exactly one
    minute is '1:00' and exactly one hour is '1:00:00'.
    """
    # Inclusive bounds: a strict '>' rendered 3600 as '60:00' and 60 as '60'
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
404
a0ddb8a2 405
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler suited to the running Python version.

    Certificate verification is disabled when params['nocheckcertificate']
    is true. Extra keyword arguments are forwarded to the handler.
    """
    skip_verification = params.get('nocheckcertificate', False)

    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 429
732ea2f0 430
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is prepended.
        """

        # Network problems and unavailable videos are never youtube-dl bugs
        expected_types = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_types:
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            update_cmd = (
                'type youtube-dl -U to update'
                if ytdl_is_updateable()
                else 'see https://yt-dl.org/update on how to update')
            msg += ''.join([
                '; please report this issue on https://yt-dl.org/bug .',
                ' Make sure you are using the latest version; %s.' % update_cmd,
                ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
            ])
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none."""
        tb = self.traceback
        return None if tb is None else ''.join(traceback.format_tb(tb))
01951dda 464
1c256f70 465
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected ExtractorError for an unsupported URL; the URL is kept in self.url."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
471
472
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
476
477
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
490
491
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
499
500
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # The message is exposed as the .msg attribute
        setattr(self, 'msg', msg)
d77c3dfd 510
5f6a1245 511
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
515
516
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
d77c3dfd
FV
524
525
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Class-level defaults; both sizes are in bytes
    downloaded = expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 540
5f6a1245 541
c5a59d93 542def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
be4a824d
PH
543 hc = http_class(*args, **kwargs)
544 source_address = ydl_handler._params.get('source_address')
545 if source_address is not None:
546 sa = (source_address, 0)
547 if hasattr(hc, 'source_address'): # Python 2.7+
548 hc.source_address = sa
549 else: # Python 2.6
550 def _hc_connect(self, *args, **kwargs):
551 sock = compat_socket_create_connection(
552 (self.host, self.port), self.timeout, sa)
553 if is_https:
d7932313
PH
554 self.sock = ssl.wrap_socket(
555 sock, self.key_file, self.cert_file,
556 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
557 else:
558 self.sock = sock
559 hc.connect = functools.partial(_hc_connect, hc)
560
561 return hc
562
563
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Store params (read later by _create_http_connection) and
        forward the remaining arguments to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req through a connection built by _create_http_connection."""
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a
        zlib-wrapped stream if that fails."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Create an addinfourl response carrying the given status code."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        # This addinfourl does not accept a code argument; set it afterwards
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to req and honour the
        Youtubedl-no-compression marker header."""
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            # The marker is internal: drop it, together with Accept-encoding
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate encoded body decompressed."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; if none of the
                # attempts succeeds, re-raise the first error.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same request/response processing
    https_request = http_request
    https_response = http_response
bf50b038 655
5de90176 656
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that opens its connections through _create_http_connection."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        if not https_conn_class:
            https_conn_class = compat_http_client.HTTPSConnection
        self._https_conn_class = https_conn_class
        self._params = params

    def https_open(self, req):
        # Forward whichever SSL settings the base handler stored on self:
        # _context exists on python > 2.6, _check_hostname on python 3.x
        extra = {}
        for option, attr in (('context', '_context'), ('check_hostname', '_check_hostname')):
            if hasattr(self, attr):
                extra[option] = getattr(self, attr)

        connection_factory = functools.partial(
            _create_http_connection, self, self._https_conn_class, True)
        return self.do_open(connection_factory, req, **extra)
be4a824d
PH
672
673
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    date_str is expected as YYYY-MM-DD<delimiter>HH:MM:SS, optionally
    followed by fractional seconds (which are discarded) and by a timezone
    designator ('Z', '+HH:MM', '+HHMM', ...). If timezone (a
    datetime.timedelta UTC offset) is given, no designator is parsed and
    that offset is used; without either, UTC is assumed.

    Returns None if date_str is None; raises ValueError if the date does
    not match the expected format.
    """

    if date_str is None:
        return None

    if timezone is None:
        m = re.search(
            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if not m:
            timezone = datetime.timedelta()
        else:
            date_str = date_str[:m.start()]
            if not m.group('sign'):
                timezone = datetime.timedelta()
            else:
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))

    # Drop fractional seconds whether or not a timezone designator followed
    # them; strptime's format below has no place for them.
    date_str = re.sub(r'\.[0-9]+$', '', date_str)

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
698
699
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    day_first selects how ambiguous numeric dates such as 8/7/2009 are read
    (day/month when true, month/day otherwise). Returns None if date_str is
    None or cannot be parsed.
    """

    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    unambiguous_formats = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        ambiguous_formats = [
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ]
    else:
        ambiguous_formats = [
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ]

    upload_date = None
    # Every format is tried; when several match, the last one wins
    for fmt in unambiguous_formats + ambiguous_formats:
        try:
            upload_date = datetime.datetime.strptime(date_str, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
760
5f6a1245 761
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url, falling back to default_ext.

    The guess is whatever follows the last '.' before any query string; it
    is only accepted if it is purely alphanumeric.
    """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
73e79f2a 770
5f6a1245 771
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
d4051a8e 774
5f6a1245 775
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    'now' and 'today' yield the current date and 'yesterday' the day before.
    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError for any other string that is not a YYYYMMDD date."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # timedelta has no months or years, so approximate them in days
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta keyword arguments are plural ('days', 'weeks')
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
5f6a1245
JW
803
804
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
813
5f6a1245 814
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str

        A missing bound defaults to the earliest / latest representable date.
        Raises ValueError if start lies after end."""
        self.start = (
            date_from_str(start) if start is not None
            else datetime.datetime.min.date())
        self.end = (
            date_from_str(end) if end is not None
            else datetime.datetime.max.date())
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if isinstance(date, datetime.date):
            candidate = date
        else:
            # Anything else goes through date_from_str
            candidate = date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
844
845
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Decode a byte-string result with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
854
855
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s is the text to write and out the target stream. The special method is
    the Win32 WriteConsoleW call; it is only used when out's file descriptor
    is 1 or 2 and the corresponding handle is a console. Raises OSError if
    WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE in the Win32 API)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    # Build ctypes prototypes for the kernel32 functions used below
    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for an invalid/missing handle, a handle that is not a
        # character device, or one for which GetConsoleMode fails
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters at a time, stopping before the
        # next non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # written on its own with a length of 2
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
929
930
def write_string(s, out=None, encoding=None):
    """Write the text string s to out (sys.stderr by default) and flush it.

    On Windows, when no explicit encoding is given and the stream has a
    fileno, _windows_write_string is tried first. Otherwise the text is
    encoded (undecodable characters are ignored) for binary streams and
    for streams exposing a .buffer, or written as-is.
    """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    try_console = (
        sys.platform == 'win32' and encoding is None and
        hasattr(stream, 'fileno'))
    if try_console and _windows_write_string(s, stream):
        return

    # Python 2 lies about mode of sys.stderr, so always send bytes there
    wants_bytes = (
        'b' in getattr(stream, 'mode', '') or sys.version_info[0] < 3)
    if wants_bytes:
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        charset = (
            encoding or getattr(stream, 'encoding', None) or
            preferredencoding())
        stream.buffer.write(s.encode(charset, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
951
952
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the byte values of bs as a list of integers."""
    if not bs:
        return []
    # Indexing bytes yields ints on Python 3 but 1-char strings on Python 2
    already_ints = isinstance(bs[0], int)
    return list(bs) if already_ints else list(map(ord, bs))
960
c257baff 961
def intlist_to_bytes(xs):
    """Pack a sequence of byte values into a bytes string."""
    if xs:
        # One unsigned char ('B') per element
        return struct_pack('%dB' % len(xs), *xs)
    return b''
c38b1e77
PH
966
967
c1c9a79c
PH
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx via ctypes on win32, fcntl.flock elsewhere.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # Structure passed by pointer as the last argument of
    # LockFileEx/UnlockFileEx
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high DWORDs of the byte count handed to both calls; together they
    # describe the length of the locked range starting at offset 0
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raise OSError if LockFileEx reports failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so _unlock_file can pass the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags is 0x2 for exclusive locks and 0x0 otherwise (presumably
        # LOCKFILE_EXCLUSIVE_LOCK - confirm against the LockFileEx reference)
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Unlock a file previously locked with _lock_file; raise OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive (LOCK_EX) or shared (LOCK_SH) flock on f."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
1031
1032
class locked_file(object):
    """Context manager wrapping a file that is locked while the block runs.

    The file is opened in __init__; entering the context takes a shared
    lock for mode 'r' and an exclusive lock for 'a'/'w'; leaving it
    unlocks and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is a distinct class
            # from IOError on Python 2; catch both so the handle opened in
            # __init__ is never leaked when locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close even if unlocking raised
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1062
1063
4644ac55
S
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1067
1068
def shell_quote(args):
    """Return the arguments joined into one shell-escaped command string.

    Byte-string arguments are decoded with the filesystem encoding first.
    Quoting goes through shlex_quote (the same helper args_to_str uses)
    instead of pipes.quote, because the pipes module is deprecated and no
    longer available on recent Python versions.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1078
1079
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """Like itertools.takewhile, but also yield the first element for
    which pred is false before stopping."""
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1087
1088
9d4660ca
PH
def smuggle_url(url, data):
    """Pass additional data in a URL for internal use.

    The data is JSON-encoded and appended as a URL-encoded
    '__youtubedl_smuggle' fragment.
    """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + '#' + fragment
9d4660ca
PH
1095
1096
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    URLs without smuggled data are returned unchanged together with
    default.
    """
    if '#__youtubedl_smuggle' in smug_url:
        plain_url, _, fragment = smug_url.rpartition('#')
        encoded = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return plain_url, json.loads(encoded)
    return smug_url, default
02dbf93f
PH
1104
1105
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as a string with a binary (1024-based) suffix.

    Returns 'N/A' for None. A str argument is converted with float()
    first. The result has two decimals, e.g. '1.50KiB'.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the table: tiny fractions would otherwise yield a negative
        # index (silently picking 'YiB') and huge values an IndexError.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
f53c966a 1118
1c088fa8 1119
be64b5b0
PH
def parse_filesize(s):
    """Parse a size such as '1.5 MiB' or '5kB' into a number of bytes.

    Returns None for None or when s does not start with a number followed
    by a known unit. A comma is accepted as the decimal separator.
    """
    if s is None:
        return None

    # 'XiB' and the lower-case-prefix form 'xB' are binary multiples; 'XB'
    # and 'Xb' are decimal ones. The lower-case forms are incorrect and
    # unofficial, but we support those too.
    unit_table = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary, decimal = 1024 ** power, 1000 ** power
        unit_table[prefix + 'iB'] = binary
        unit_table[prefix + 'B'] = decimal
        unit_table[prefix.lower() + 'B'] = binary
        unit_table[prefix + 'b'] = decimal

    units_re = '|'.join(re.escape(unit) for unit in unit_table)
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if found is None:
        return None

    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
be64b5b0
PH
1172
1173
def get_term_width():
    """Return the terminal width in columns, or None if it cannot be found.

    The COLUMNS environment variable is tried first, then the second
    field printed by `stty size`.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Unusable COLUMNS value; fall back to asking stty

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Deliberately best-effort (stty missing, no tty, odd output), but
        # unlike a bare except this lets KeyboardInterrupt/SystemExit through.
        pass
    return None
caefb1de
PH
1188
1189
def month_by_name(name):
    """ Return the number (1-12) of a month by its (locale-independent)
    English name, or None if the name is unknown """
    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1197
1198
def month_by_abbreviation(abbrev):
    """ Return the number (1-12) of a month by its (locale-independent)
    three-letter English abbreviation, or None if it is unknown """
    for number, month in enumerate(ENGLISH_MONTH_NAMES, 1):
        if month[:3] == abbrev:
            return number
    return None
18258362
JMF
1207
1208
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start a predefined or numeric
    entity with '&amp;'"""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
e3946f98
PH
1215
1216
def setproctitle(title):
    """Set the process name through prctl(2) of libc.so.6 (best effort).

    Does nothing when libc.so.6 cannot be loaded or exposes no prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # One extra byte so the buffer stays NUL-terminated: prctl reads a C
    # string, and a buffer of exactly len(title_bytes) holds no terminator.
    buf = ctypes.create_string_buffer(len(title_bytes) + 1)
    buf.value = title_bytes
    try:
        # 15 is PR_SET_NAME in <linux/prctl.h>
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1230
1231
def remove_start(s, start):
    """Return s without the prefix start, or s unchanged if it has no such prefix."""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1236
1237
2b9faf55
PH
def remove_end(s, end):
    """Return s without the suffix end, or s unchanged if it has no such suffix."""
    # An empty suffix must be skipped explicitly: s[:-0] is s[:0], i.e. ''.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1242
1243
def url_basename(url):
    """Return the last segment of the URL's path, ignoring surrounding slashes."""
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
aa94a6d3
PH
1247
1248
class HEADRequest(compat_urllib_request.Request):
    """Request whose get_method() always reports HEAD."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
1252
1253
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Return int(v) * invscale // scale, or default for None / ''.

    When get_attr is given and v is not None, the attribute of that name
    is read from v first (a missing attribute counts as None).
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1261
9572013d 1262
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1265
9732d77e
PH
1266
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: ',', '.' and '+' characters
    are dropped before converting """
    if int_str is None:
        return None
    return int(re.sub(r'[,\.\+]', '', int_str))
608d11f5
PH
1273
1274
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1277
1278
def parse_duration(s):
    """Parse a duration string into seconds.

    Accepts forms such as '1:02:03', '3 min', '2.5 hours', '1h 2m 3s' or
    'PT1H2M3S'. Returns None for non-strings and unparsable input; the
    result is a float when fractional parts are involved.
    """
    if not isinstance(s, compat_basestring):
        return None

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if not m:
        return None

    # A lone "N minutes" / "N hours" value may itself be fractional
    only_mins = m.group('only_mins')
    if only_mins:
        return float_or_none(only_mins, invscale=60)
    only_hours = m.group('only_hours')
    if only_hours:
        return float_or_none(only_hours, invscale=60 * 60)

    total = 0
    for group_name, seconds_per_unit in (
            ('secs', 1),
            ('mins', 60),
            ('hours', 60 * 60),
            ('days', 24 * 60 * 60)):
        digits = m.group(group_name)
        if digits:
            total += int(digits) * seconds_per_unit
    fraction = m.group('ms')
    if fraction:
        total += float(fraction)
    return total
91d7d0b3
JMF
1318
1319
def prepend_extension(filename, ext):
    """Insert ext in front of the filename's real extension
    ('a.mp4', 'temp' -> 'a.temp.mp4')."""
    stem, real_ext = os.path.splitext(filename)
    pieces = (stem, ext, real_ext)
    return '{0}.{1}{2}'.format(*pieces)
d70ad093
PH
1323
1324
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False when the binary cannot be started. """
    try:
        proc = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1333
1334
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        proc = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = proc.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1348
1349
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output.

    version_re must capture the version in group 1 and defaults to
    matching 'version <something>'. Returns unrecognized when nothing
    matches.
    """
    assert isinstance(output, compat_str)
    pattern = (
        r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re)
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1359
1360
class PagedList(object):
    """Base class for paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1365
9c44d242
PH
1366
class OnDemandPagedList(PagedList):
    """Paged list that calls pagefunc(pagenum) for pages as a slice needs them.

    pagefunc takes a zero-based page number and returns an iterable of
    entries; pagesize is the number of entries a full page holds.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list (end=None: no upper bound)."""
        res = []
        # Begin at the page that contains index `start`
        for pagenum in itertools.count(start // self._pagesize):
            # Indices of the first entry on this page and on the next one
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of `start` within this page (0 for later pages)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Cut-off within this page if `end` falls on it, else None
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1408
1409
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list whose total number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries in [start, end) as a list (end=None: all pages)."""
        collected = []
        first_page = start // self._pagesize
        # Entries to drop from the front of the first fetched page
        offset = start - first_page * self._pagesize
        if end is None:
            last_page = self._pagecount
            remaining = None
        else:
            last_page = end // self._pagesize + 1
            remaining = end - start
        for pagenum in range(first_page, last_page):
            page = list(self._pagefunc(pagenum))
            if offset:
                page = page[offset:]
                offset = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested slice
                    collected.extend(page[:remaining])
                    break
                remaining -= len(page)
            collected.extend(page)
        return collected
1437
1438
def uppercase_escape(s):
    """Expand literal \\UXXXXXXXX escape sequences in s into the characters
    they denote."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
b53466e1 1445
d05cfe06
S
1446
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 the text must be UTF-8 encoded before it is quoted
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1452
1453
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # Only these four components are escaped; the others are kept as parsed
    escaped_path = escape_rfc3986(parts.path)
    escaped_params = escape_rfc3986(parts.params)
    escaped_query = escape_rfc3986(parts.query)
    escaped_fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=escaped_path,
        params=escaped_params,
        query=escaped_query,
        fragment=escaped_fragment)
    return escaped.geturl()
1463
# Probe whether struct accepts a text (unicode) format spec. If it raises
# TypeError, define wrappers that ASCII-encode the spec first; otherwise
# struct_pack/struct_unpack are plain aliases of struct.pack/struct.unpack.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """struct.pack that also accepts a text format spec."""
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        """struct.unpack that also accepts a text format spec."""
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1480
1481
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file, one per line, and close the file.

    Lines may be bytes (decoded as UTF-8) or text. A leading byte order
    mark is removed, surrounding whitespace is stripped, and empty lines
    as well as lines starting with '#', ';' or ']' are skipped.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # '\xef\xbb\xbf' is the UTF-8 BOM as it appears when the bytes were
        # mis-decoded as Latin-1; '\ufeff' is the properly decoded BOM,
        # which is what the UTF-8 decode above (or a text-mode file)
        # produces and which strip() does not remove.
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1496
1497
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1500
1501
0990305d
PH
# Iterate over an element's subtree; Element.iter is missing on Python <=2.6
if hasattr(xml.etree.ElementTree.Element, 'iter'):
    etree_iter = xml.etree.ElementTree.Element.iter
else:
    etree_iter = lambda n: n.findall('.//*')
1506
1507
bcf89ce6
PH
def parse_xml(s):
    """Parse the XML text s into an ElementTree element, ignoring doctypes."""
    etree = xml.etree.ElementTree

    class TreeBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    # The parser keyword is only passed on Python 2.7+
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = etree.XMLParser(target=TreeBuilder())
    tree = etree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            text = node.text
            if text is not None and not isinstance(text, compat_str):
                node.text = text.decode('utf-8')
    return tree
e68301af
PH
1523
1524
a1a530b0
PH
# Rating label -> age limit, used as the fallback of parse_age_limit
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """Parse an age limit such as '18' or '18+', or a US_RATINGS label.

    Returns the limit as an int, or None for None and unknown values.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s)
146c80e2
S
1539
1540
def strip_jsonp(code):
    """Strip a JSONP wrapper callback(...) with an optional trailing ';'
    and // comments, leaving only the argument text."""
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_wrapper, r'\1', code)
478c2c61
PH
1544
1545
e05f6939
PH
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    Single-quoted strings and bare identifiers become double-quoted
    strings, true/false/null are kept, and a trailing comma before ']'
    is removed.
    """
    # How escapes inside a single-quoted string map into a double-quoted one
    single_quoted_escapes = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: single_quoted_escapes[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    converted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', r'\1', converted)
1569
1570
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values.

    Returns a function mapping a quality id to its index in quality_ids,
    or to -1 when the id is not listed. """
    def q(qid):
        return quality_ids.index(qid) if qid in quality_ids else -1
    return q
1579
acd69589
PH
1580
# Default output template: a %-style format string with the mapping keys
# title, id and ext
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1582
a020a0dc
PH
1583
def limit_length(s, length):
    """ Shorten strings longer than length, ending them with '...' so the
    result is exactly length characters; None is passed through """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
1592
1593
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
48844745
PH
1596
1597
def is_outdated_version(version, limit, assume_new=True):
    """Return whether version is older than limit.

    An empty or unparsable version counts as outdated only when
    assume_new is false.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
732ea2f0
PH
1605
1606
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when this module was loaded by a zipimporter or when the
    # interpreter carries a sys.frozen attribute
    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    frozen = hasattr(sys, 'frozen')
    return loaded_from_zip or frozen
7d4111ed
PH
1612
1613
def args_to_str(args):
    """Get a short string representation for a subprocess command."""
    return ' '.join(map(shlex_quote, args))
2ccd1b10
PH
1617
1618
c460bdd5
PH
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    The subtype (the text after the last '/') is used as the extension,
    except for a few subtypes that have a dedicated mapping.
    """
    subtype = mt.rpartition('/')[2]
    special_cases = {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
    }
    if subtype in special_cases:
        return special_cases[subtype]
    return subtype
1626
1627
2ccd1b10
PH
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the headers of an open URL handle.

    The filename of an attachment Content-Disposition header wins when
    it has an extension; otherwise the Content-Type is mapped with
    mimetype2ext.
    """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(h):
            return url_handle.headers[h]

    disposition = getheader('Content-Disposition')
    if disposition:
        found = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', disposition)
        if found:
            ext = determine_ext(found.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(getheader('Content-Type'))
05900629
PH
1644
1645
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No limit set by the user, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
1654
1655
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes.

    Returns a match object (truthy) if the decoded text starts with
    optional whitespace and '<', else None. """

    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Without a byte order mark the data is treated as UTF-8
    encoding, offset = 'utf-8', 0
    for bom, bom_encoding in BOMS:
        if first_bytes.startswith(bom):
            encoding, offset = bom_encoding, len(bom)
            break
    text = first_bytes[offset:].decode(encoding, 'replace')

    return re.match(r'^\s*<', text)
a055469f
PH
1674
1675
def determine_protocol(info_dict):
    """Return the download protocol for an info dict.

    An explicit 'protocol' entry wins; otherwise it is derived from the
    start of the URL, then from its extension, then from its scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext == 'm3u8' or ext == 'f4m':
        return ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
1696
1697
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Widest cell of every column
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*table)]
    # Every column but the last is left-aligned and padded to width + 1
    padded = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(padded) + '%s'
    return '\n'.join(row_format % tuple(row) for row in table)
347de493
PH
1704
1705
def _match_one(filter_part, dct):
    """Evaluate one filter expression against the dictionary dct.

    Two forms are recognized: a comparison "key OP value" (OP being one
    of <, <=, >, >=, =, !=, optionally followed by '?') and a unary test
    "key" / "!key". Raises ValueError for anything else, for a string
    value used with an ordering operator, and for an unparsable number.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            # String values can only be tested for (in)equality
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            # Plain integer first, then a file size ('500k', '1.5MiB', ...),
            # finally the same text with a 'B' appended
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # Missing values pass only when the operator was followed by '?';
            # the matched text (truthy) or None is returned, not a bool
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
1763
1764
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false.

    The filter string is split on '&' and every part has to match. """
    parts = filter_str.split('&')
    return all(_match_one(part, dct) for part in parts)
1770
1771
def match_filter_func(filter_str):
    """Build a function that checks an info dict against filter_str.

    The returned function gives None when the dict passes the filter and
    a message explaining the skip otherwise.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func