]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[arte] Clean up format sorting mess
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
912b38b4 6import calendar
676eb3f2 7import codecs
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96
PH
10import datetime
11import email.utils
f45c185f 12import errno
d77c3dfd 13import gzip
b7ab0590 14import itertools
03f9daab 15import io
f4bfd65f 16import json
d77c3dfd 17import locale
02dbf93f 18import math
d77c3dfd 19import os
4eb7f1d1 20import pipes
c496ca96 21import platform
d77c3dfd 22import re
13ebea79 23import ssl
c496ca96 24import socket
b53466e1 25import struct
1c088fa8 26import subprocess
d77c3dfd 27import sys
181c8655 28import tempfile
01951dda 29import traceback
bcf89ce6 30import xml.etree.ElementTree
d77c3dfd 31import zlib
d77c3dfd 32
8c25f81b
PH
33from .compat import (
34 compat_chr,
35 compat_getenv,
36 compat_html_entities,
8c25f81b
PH
37 compat_parse_qs,
38 compat_str,
39 compat_urllib_error,
40 compat_urllib_parse,
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
43 compat_urlparse,
44)
4644ac55
S
45
46
468e2e92
FV
# The type of compiled regular expression objects is not exposed under a
# stable public name, so derive it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Default HTTP headers; YoutubeDLHandler.http_request adds them to outgoing
# requests.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 57
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.

    Falls back to 'UTF-8' whenever the locale lookup fails or reports a
    codec that cannot encode plain ASCII text.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: an unknown name raises LookupError, a None
        # value raises TypeError, a broken codec raises UnicodeError.
        'TEST'.encode(pref)
    except Exception:
        # Deliberately broad best-effort fallback, but unlike a bare
        # ``except:`` it lets KeyboardInterrupt/SystemExit propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 71
f4bfd65f 72
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    is_py2 = sys.version_info < (3, 0)

    if is_py2 and sys.platform != 'win32':
        # os.path.basename/dirname return bytes objects here, but
        # NamedTemporaryFile will fail if the name contains non-ASCII
        # characters unless it is given unicode objects.
        fs_encoding = get_filesystem_encoding()
        base_name = os.path.basename(fn).decode(fs_encoding)
        dir_name = os.path.dirname(fn).decode(fs_encoding)
    else:
        base_name = os.path.basename(fn)
        dir_name = os.path.dirname(fn)

    tmp_kwargs = {
        'suffix': '.tmp',
        'prefix': base_name + '.',
        'dir': dir_name,
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if is_py2:
        tmp_kwargs['mode'] = 'wb'
    else:
        tmp_kwargs['mode'] = 'w'
        tmp_kwargs['encoding'] = 'utf-8'

    tmp_file = tempfile.NamedTemporaryFile(**tmp_kwargs)

    try:
        with tmp_file:
            json.dump(obj, tmp_file)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tmp_file.name, fn)
    except:
        # Clean up the temporary file on any failure, then re-raise.
        try:
            os.remove(tmp_file.name)
        except OSError:
            pass
        raise
125
126
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        candidates = (
            el for el in node.findall(xpath) if el.attrib.get(key) == val)
        return next(candidates, None)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # Only plain attribute names/values are accepted because they are
        # interpolated into the XPath predicate unescaped.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
145
d7e66d39
JMF
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """ Expand every 'prefix:tag' step of path to '{namespace}tag' using ns_map """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
158
d77c3dfd 159
def xpath_text(node, xpath, name=None, fatal=False):
    """ Return the text of the first element matching xpath.

    Returns None when nothing matches, unless fatal is set, in which case
    an ExtractorError naming the element (name, or the xpath itself) is
    raised.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    element = node.find(xpath)
    if element is not None:
        return element.text
    if not fatal:
        return None
    if name is None:
        name = xpath
    raise ExtractorError('Could not find XML element %s' % name)
172
173
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the passed HTML document"""
    return get_element_by_attribute('id', id, html)
177
12ea2f30 178
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # Verbose pattern: opening tag, any attributes, the wanted
    # attribute=value pair, any further attributes, then the shortest
    # content up to the matching closing tag.
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    match = re.search(pattern, html)
    if match is None:
        return None

    content = match.group('content')
    # Drop one leading and one trailing character when the content
    # starts with a quote.
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 200
9e6dd238
FV
201
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Line breaks in the markup are collapsed, <br> tags and paragraph
    boundaries become newlines, remaining tags are stripped and HTML
    entities are decoded. None is passed through unchanged so callers can
    feed optional fields straight in.
    """
    if html is None:
        # Mirror unescapeHTML: a missing snippet stays missing instead of
        # raising AttributeError on .replace().
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
9e6dd238
FV
213
214
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The special filename '-' selects standard output (its binary buffer
    when available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be fixed by renaming the file.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars. Only the
        # final path component is rewritten: replacing separators or the
        # drive colon in the directory part would point somewhere else.
        head, tail = os.path.split(filename)
        alt_filename = os.path.join(
            head, re.sub('[/<>:"\\|\\\\?\\*]', '#', tail))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
248
249
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 257
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped entirely.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    cleaned = ''.join(replace_insane(ch) for ch in s)
    if is_id:
        # IDs are kept as close to the input as possible.
        return cleaned

    # Collapse runs of underscores and trim them from both ends.
    while '__' in cleaned:
        cleaned = cleaned.replace('__', '_')
    cleaned = cleaned.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and cleaned.startswith('-_'):
        cleaned = cleaned[2:]
    if not cleaned:
        cleaned = '_'
    return cleaned
d77c3dfd
FV
289
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, keeping first occurrences in order """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 297
912b38b4 298
4e408e47
PH
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities are looked up in
    compat_html_entities.name2codepoint; numeric references are accepted in
    decimal ('#47') and hexadecimal ('#x2F') form. Anything that cannot be
    decoded is returned in its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hex references may contain a-f digits; matching only [0-9] would
    # silently truncate e.g. '#x2F' to '#x2'.
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside the valid range: fall through and keep
            # the entity as written.
            pass

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
4e408e47
PH
317
318
def unescapeHTML(s):
    """ Replace every '&...;' entity in s with its character; None is passed through """
    if s is None:
        return None
    assert type(s) == compat_str

    def expand(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', expand, s)
d77c3dfd 326
8bf48f23
PH
327
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Whether the result is handed to a subprocess call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up expose Unicode APIs, so the name can be passed
    # through untouched.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    has_unicode_api = (
        sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)

    if has_unicode_api:
        if not for_subprocess:
            return s
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()

    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
354
f07b74fc
PH
355
def encodeArgument(s):
    """ Encode a command-line argument via encodeFilename(..., for_subprocess=True) """
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        text = s.decode('ascii')
    return encodeFilename(text, True)
363
364
8271226a
PH
def decodeOption(optval):
    """ Return an option value as text, decoding bytes with the preferred encoding """
    if optval is None:
        return None

    if isinstance(optval, bytes):
        decoded = optval.decode(preferredencoding())
    else:
        decoded = optval

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 373
4539dd30
PH
def formatSeconds(secs):
    """ Format a duration in seconds as 'S', 'M:SS' or 'H:MM:SS'.

    Exactly one minute / one hour switch to the longer form ('1:00',
    '1:00:00') instead of being rendered as '60' / '60:00'.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
381
a0ddb8a2
PH
382
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """ Build an HTTPS handler suited to the running Python version.

    opts_no_check_certificate disables server certificate verification on
    the Python versions that have an SSLContext (>= 3.2). Remaining keyword
    arguments are forwarded to the handler constructor.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Try TLSv1 first and fall back to the SSLv23 negotiation
                # if the handshake fails.
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
        # We are the client authenticating a *server*, which is what
        # Purpose.SERVER_AUTH selects. Purpose.CLIENT_AUTH builds a
        # server-side context that does not verify the peer at all.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
        if opts_no_check_certificate:
            # check_hostname must be switched off before verify_mode can
            # be lowered to CERT_NONE.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
ea6d901e 422
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Errors raised while handling a network or availability problem
        # are always treated as expected.
        active_exc_type = sys.exc_info()[0]
        if active_exc_type in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg = msg + '; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """ Return the stored traceback formatted as one string, or None if there is none """
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
01951dda 449
1c256f70 450
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
454
455
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
467
468
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
476
477
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        """ msg is the error text; it is kept on the instance as .msg """
        self.msg = msg
d77c3dfd
FV
486
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
490
491
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
d77c3dfd
FV
499
500
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Class-level defaults; both sizes are in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        """ Record how many bytes arrived and how many were announced """
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 515
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    A "Youtubedl-User-Agent" request header is likewise consumed and its
    value sent as the actual User-Agent.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """ Decompress deflate data, trying a raw stream first and a zlib-wrapped one second """
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """ Build an addinfourl response, setting .code by hand when the constructor cannot take it """
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """ Add the standard headers and resolve the Youtubedl-* pseudo headers """
        for h, v in std_headers.items():
            # Request.add_header() stores names via str.capitalize()
            # ('User-Agent' -> 'User-agent'), so the lookup has to use the
            # same form; otherwise headers set by the caller are
            # overwritten by the defaults.
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """ Return resp, transparently decompressed when it is gzip- or deflate-encoded """
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; if none of
                # the attempts decodes, report the first error.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets exactly the same treatment.
    https_request = http_request
    https_response = http_response
bf50b038 601
5de90176 602
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date

    delimiter is the character separating the date from the time part.
    A trailing 'Z' or '+HH:MM' / '-HHMM' style offset (optionally preceded
    by fractional seconds) is honoured; None is passed through.
    """

    if date_str is None:
        return None

    suffix = re.search(
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)

    offset = datetime.timedelta()
    if suffix:
        # Cut the fraction / zone designator off before parsing the rest.
        date_str = date_str[:-len(suffix.group(0))]
        if suffix.group('sign'):
            direction = 1 if suffix.group('sign') == '+' else -1
            offset = datetime.timedelta(
                hours=direction * int(suffix.group('hours')),
                minutes=direction * int(suffix.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    utc_time = datetime.datetime.strptime(date_str, date_format) - offset
    return calendar.timegm(utc_time.timetuple())
626
627
bf50b038
JMF
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    A number of common textual and numeric layouts are tried first; if none
    fits, the string is parsed as an RFC 2822 date. Returns None when
    date_str is None or cannot be understood.
    """

    if date_str is None:
        return None

    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %drd %Y %I:%M%p',  # "3rd", "23rd"
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%Y/%m/%d %H:%M:%S',
        '%d/%m/%Y %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            continue
        # The layouts do not overlap with differing results, so the first
        # one that parses settles the date.
        break
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
675
def determine_ext(url, default_ext='unknown_video'):
    """ Guess the file extension from url, falling back to default_ext

    The guess is whatever follows the last '.' before any query string and
    is only accepted when it is purely alphanumeric.
    """
    if url is None:
        return default_ext

    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if not re.match(r'^[A-Za-z0-9]+$', candidate):
        return default_ext
    return candidate
73e79f2a 684
def subtitles_filename(filename, sub_lang, sub_format):
    """ Swap the extension of filename for '<sub_lang>.<sub_format>' """
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
d4051a8e 687
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days. A string that is
    neither relative nor YYYYMMDD makes strptime raise ValueError.
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword names ('days', 'weeks').
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
713
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings in any other shape are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
722
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str

        A missing bound defaults to the earliest / latest representable date.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
748
749
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
758
759
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps file descriptors 1 and 2 to the identifiers passed to
    # GetStdHandle (presumably the Win32 STD_OUTPUT_HANDLE / STD_ERROR_HANDLE
    # constants; confirm against the GetStdHandle documentation).
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    # Receives the count reported by each WriteConsoleW call.
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is invalid, is not a character device, or
        # GetConsoleMode fails on it.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none.
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    # Write in chunks of at most 1024 characters, each chunk stopping in
    # front of the next non-BMP character; a non-BMP character at the
    # front is written on its own with a length of 2.
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
830
831
def write_string(s, out=None, encoding=None):
    """ Write the text s to out (sys.stderr by default) and flush it

    Depending on the stream, the text is written through the Windows
    console API, as encoded bytes, or as-is. Characters that cannot be
    encoded are dropped.
    """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(stream, 'fileno'):
        if _windows_write_string(s, stream):
            return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(stream, 'mode', '') or sys.version_info[0] < 3

    if wants_bytes:
        payload = s.encode(encoding or preferredencoding(), 'ignore')
        stream.write(payload)
    elif hasattr(stream, 'buffer'):
        codec = encoding or getattr(stream, 'encoding', None) or preferredencoding()
        payload = s.encode(codec, 'ignore')
        stream.buffer.write(payload)
    else:
        stream.write(s)
    stream.flush()
852
853
48ea9cea
PH
def bytes_to_intlist(bs):
    """ Convert a byte string into a list of integer byte values """
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Python 2: indexing yields one-character strings
        return [ord(ch) for ch in bs]
    # Python 3: iterating bytes already yields ints
    return list(bs)
861
c257baff 862
def intlist_to_bytes(xs):
    """ Pack a sequence of integer byte values into a byte string """
    if xs:
        return struct_pack('%dB' % len(xs), *xs)
    return b''
c38b1e77
PH
867
868
c1c9a79c
PH
# Cross-platform file locking
# Both branches define _lock_file(f, exclusive) and _unlock_file(f): via
# LockFileEx/UnlockFileEx on Windows, via fcntl.flock everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes layout handed to LockFileEx/UnlockFileEx as the Overlapped
    # argument.
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to (Un)LockFileEx, starting
    # at offset 0.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """ Lock the open file f; raises OSError when LockFileEx fails """
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can pass the same
        # structure again.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 is passed as dwFlags for an exclusive lock, 0x0 otherwise.
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """ Release the lock taken by _lock_file; raises OSError on failure """
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """ flock the open file f, exclusively (LOCK_EX) or shared (LOCK_SH) """
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """ Release the flock held on f """
        fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
932
933
class locked_file(object):
    """ File wrapper that holds a lock on the file inside a with block

    The file is opened in the constructor.  Entering the with block takes
    the lock (shared for mode 'r', exclusive for 'a' and 'w'); leaving it
    releases the lock and closes the file. """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # Before Python 3.3 IOError and OSError are unrelated classes
            # and the LockFileEx based _lock_file raises OSError; catch
            # both so that the file is never left open on a failed lock.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Always close the file, even when unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
963
964
4644ac55
S
def get_filesystem_encoding():
    """ Return sys.getfilesystemencoding(), or 'utf-8' when that is None """
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
968
969
def shell_quote(args):
    """ Quote every argument for the shell and join them with spaces """
    fs_encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(fs_encoding) if isinstance(arg, bytes) else arg

    return ' '.join(pipes.quote(as_text(arg)) for arg in args)
9d4660ca
PH
979
980
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the last evaluated element
        (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
988
989
9d4660ca
PH
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialized as JSON and appended to url as a URL-encoded
    '__youtubedl_smuggle' fragment. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return url + '#' + compat_urllib_parse.urlencode(payload)
9d4660ca
PH
996
997
def unsmuggle_url(smug_url, default=None):
    """ Split a URL into (url, data), where data is the JSON payload of its
    '__youtubedl_smuggle' fragment.

    URLs without such a fragment are returned unchanged, with default as
    the data. """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition('#')
    smuggled = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
    return url, json.loads(smuggled)
02dbf93f
PH
1005
1006
02dbf93f
PH
def format_bytes(bytes):
    """ Format a byte count as a string with a binary suffix ('1.50KiB')

    bytes may be a number, a numeric string or None; None yields 'N/A'.
    Values too large or too small for the suffix table are shown with the
    largest ('YiB') or smallest ('B') suffix instead of failing or picking
    a wrong one.  Negative values raise ValueError (from math.log). """
    if bytes is None:
        return 'N/A'
    SUFFIXES = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    # float() accepts numbers as well as byte and unicode numeric strings
    size = float(bytes)
    if size == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(size, 1024.0))
        # Keep the index inside the suffix table: a negative exponent
        # (size < 1/1024) would otherwise wrap around to 'YiB', and an
        # exponent above 8 would raise IndexError.
        exponent = max(0, min(exponent, len(SUFFIXES) - 1))
    converted = size / float(1024 ** exponent)
    return '%.2f%s' % (converted, SUFFIXES[exponent])
f53c966a 1019
1c088fa8 1020
def get_term_width():
    """ Return the terminal width in columns, or None if it is unknown

    A non-empty COLUMNS environment variable takes precedence; otherwise
    the second field of the output of `stty size` is used. """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, no tty, unexpected output, ...).
        # Unlike a bare except, this lets KeyboardInterrupt and SystemExit
        # propagate.
        pass
    return None
caefb1de
PH
1035
1036
def month_by_name(name):
    """ Return the number (1-12) of a month given its English name,
    independently of the locale; None if the name is not recognized """

    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    if name not in ENGLISH_NAMES:
        return None
    return ENGLISH_NAMES.index(name) + 1
18258362
JMF
1047
1048
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start one of the predefined XML
    entities or a numeric character reference by '&amp;'"""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
e3946f98
PH
1055
1056
def setproctitle(title):
    """ Set the name of the current process via prctl() of libc.so.6

    Does nothing when libc.so.6 cannot be loaded or has no prctl. """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initializing the buffer from the bytes makes it one byte larger than
    # the title, so the name handed to prctl is always NUL-terminated.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 == PR_SET_NAME (linux/prctl.h)
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1070
1071
def remove_start(s, start):
    """ Return s without the prefix start (s unchanged if it has no such prefix) """
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1076
1077
2b9faf55
PH
def remove_end(s, end):
    """ Return s without the suffix end (s unchanged if it has no such suffix) """
    # An empty suffix must be special-cased: every string ends with '',
    # but s[:-0] is s[:0], i.e. the empty string.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1082
1083
def url_basename(url):
    """ Return the last '/'-separated component of the path of url """
    url_path = compat_urlparse.urlparse(url).path
    segments = url_path.strip('/').split('/')
    return segments[-1]
aa94a6d3
PH
1087
1088
class HEADRequest(compat_urllib_request.Request):
    """ Request whose HTTP method is always HEAD """

    def get_method(self):
        return 'HEAD'
7217e148
PH
1092
1093
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to int, returning default for None and ''

    If get_attr is given, the attribute of that name is read from v first
    (a missing attribute counts as None).  The result is multiplied by
    invscale and then floor-divided by scale. """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    return int(v) * invscale // scale
1101
9572013d 1102
40a90862
JMF
def str_or_none(v, default=None):
    """ Convert v to compat_str, returning default when v is None """
    if v is None:
        return default
    return compat_str(v)
1105
9732d77e
PH
1106
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: ',', '.' and '+' characters
    are removed before the conversion """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1113
1114
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to float, multiplied by invscale and divided by scale;
    return default when v is None """
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1117
1118
608d11f5
PH
def parse_duration(s):
    """ Parse a duration string ('1:23:45', '3 min 5 secs', '12.5s', ...)
    and return the number of seconds

    The result is a float if the string has fractional seconds and an int
    otherwise; None is returned for None and for unrecognized strings. """
    if s is None:
        return None

    m = re.match(
        r'''(?ix)T?
        (?:
            (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
            (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
        )?
        (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$''',
        s.strip())
    if not m:
        return None

    duration = int(m.group('secs'))
    for group, seconds_per_unit in (('mins', 60), ('hours', 60 * 60)):
        value = m.group(group)
        if value:
            duration += int(value) * seconds_per_unit
    # The 'ms' group holds the fractional part including the leading dot
    fraction = m.group('ms')
    if fraction:
        duration += float(fraction)
    return duration
91d7d0b3
JMF
1142
1143
def prepend_extension(filename, ext):
    """ Insert ext in front of the extension of filename
    ('video.mp4', 'temp' -> 'video.temp.mp4') """
    stem, original_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, original_ext)
d70ad093
PH
1147
1148
def check_executable(exe, args=[]):
    """ Check whether the given binary can be run: return its name if so
    and False otherwise.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1157
1158
95807118
PH
def get_exe_version(exe, args=['--version'],
                    version_re=r'version\s+([0-9._-a-zA-Z]+)',
                    unrecognized='present'):
    """ Return the version of the specified executable, the value of
    unrecognized if it runs but no version is found in the first line of
    its output, or False if the executable is not present """
    # NOTE(review): in the default version_re, '_-a' is a character range
    # ('_' to 'a'), so a literal '-' is not matched and the captured version
    # stops at the first hyphen - confirm whether that is intended.
    try:
        process = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = process.communicate()
    except OSError:
        return False
    firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
    m = re.search(version_re, firstline)
    return m.group(1) if m else unrecognized
1176
1177
class PagedList(object):
    """ Base class of the paged lists; getslice() is not defined here and
    has to be provided by the subclass """

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1182
9c44d242
PH
1183
class OnDemandPagedList(PagedList):
    """ Paged list that calls pagefunc(pagenum) for one page after another
    until the requested slice is complete or a page comes back short """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with index start <= i < end as a list
        (up to the last available entry if end is None) """
        pagesize = self._pagesize
        res = []
        for pagenum in itertools.count(start // pagesize):
            firstid = pagenum * pagesize
            nextfirstid = firstid + pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry inside this page
            if firstid <= start < nextfirstid:
                startv = start % pagesize
            else:
                startv = 0

            # Offset just past the last wanted entry, if end is on this page
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % pagesize) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1225
1226
9c44d242
PH
class InAdvancePagedList(PagedList):
    """ Paged list for which the number of pages is known up front """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with index start <= i < end as a list
        (all entries from start on if end is None) """
        res = []
        start_page = start // self._pagesize
        if end is None:
            end_page = self._pagecount
            only_more = None
        else:
            end_page = end // self._pagesize + 1
            only_more = end - start
        # Number of leading entries to drop from the first fetched page
        skip_elems = start - start_page * self._pagesize
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) >= only_more:
                    # This page completes the requested slice
                    res.extend(page[:only_more])
                    break
                only_more -= len(page)
            res.extend(page)
        return res
1254
1255
def uppercase_escape(s):
    """ Replace every \\UXXXXXXXX escape sequence (8 hex digits) in s by
    the character it denotes """
    decode_escape = codecs.getdecoder('unicode_escape')

    def replace_escape(m):
        return decode_escape(m.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', replace_escape, s)
b53466e1 1262
d05cfe06
S
1263
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, unicode strings are passed to quote() UTF-8 encoded
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, unicode)
    if needs_encoding:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1269
1270
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # Only these components are escaped; the others are kept as they are
    escaped = dict(
        (field, escape_rfc3986(getattr(parts, field)))
        for field in ('path', 'params', 'query', 'fragment'))
    return parts._replace(**escaped).geturl()
1280
# struct_pack / struct_unpack accept a text format string on every
# supported Python version.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        return struct.pack(
            spec.encode('ascii') if isinstance(spec, compat_str) else spec,
            *args)

    def struct_unpack(spec, *args):
        return struct.unpack(
            spec.encode('ascii') if isinstance(spec, compat_str) else spec,
            *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1297
1298
def read_batch_urls(batch_fd):
    """ Read the URLs from an open batch file and close the file

    Each line is decoded as UTF-8 if it is not text already and stripped
    of a leading byte order mark and of surrounding whitespace.  Empty
    lines and lines starting with '#', ';' or ']' are dropped.
    Returns the list of remaining URLs. """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM is the single character U+FEFF once the line has been
        # decoded as UTF-8, but the three characters \xef\xbb\xbf if the
        # file was read with a single-byte encoding; strip either form.
        BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1313
1314
def urlencode_postdata(*args, **kargs):
    """ URL-encode the arguments like urlencode() does and return the
    result as ASCII bytes """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1317
1318
0990305d
PH
# etree_iter(element): Element.iter where it exists, with a findall() based
# fallback for older Pythons
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1323
1324
bcf89ce6
PH
def parse_xml(s):
    """ Parse the XML document in the string s, ignoring any doctype, and
    return its root element """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None and not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')
    return tree
e68301af
PH
1340
1341
a1a530b0
PH
# Rating label -> age limit; used by parse_age_limit for values that are
# not plain numbers
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
1349
1350
146c80e2
S
def parse_age_limit(s):
    """ Return the age limit described by s as an int

    s is either a one or two digit number with an optional trailing '+'
    or a key of US_RATINGS; None is returned for None and for anything
    else. """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
146c80e2
S
1356
1357
def strip_jsonp(code):
    """ Remove the callback wrapper of a JSONP response - 'callback(' in
    front and ')' with an optional ';' and trailing // comments behind -
    and return what is inside; other input is returned unchanged """
    jsonp_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_re, r'\1', code)
478c2c61
PH
1361
1362
e05f6939
PH
def js_to_json(code):
    """ Convert a JavaScript object literal to JSON text: single-quoted
    strings and bare identifiers become double-quoted strings, and a comma
    directly in front of a closing ']' is dropped """
    # How escapes inside a single-quoted string are rewritten for JSON
    requote = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        v = m.group(0)
        # JSON literals and double-quoted strings are kept as they are
        if v in ('true', 'false', 'null') or v.startswith('"'):
            return v
        if v.startswith("'"):
            v = re.sub(
                r"\\\\|\\'|\"", lambda esc: requote[esc.group(0)], v[1:-1])
        return '"%s"' % v

    def drop_comma(m):
        return m.group(1)

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', drop_comma, res)
1386
1387
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its index in quality_ids,
    or to -1 if it is not one of them. """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1396
acd69589
PH
1397
# %-style template with the mapping keys title, id and ext
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1399
a020a0dc
PH
1400
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than length are cut so that the result, including the
    appended '...', is length characters long; None is passed through. """
    ELLIPSES = '...'
    if s is None or len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
1409
1410
def version_tuple(v):
    """ Split a dotted version string into a list of ints
    ('1.2.3' -> [1, 2, 3]); raises ValueError on non-numeric components """
    return list(map(int, v.split('.')))
1413
1414
def is_outdated_version(version, limit, assume_new=True):
    """ Return whether version is older than limit

    If version is empty or one of the two cannot be parsed by
    version_tuple, the answer is `not assume_new`. """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback