]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[smotri] Adapt to new API and modernize
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
912b38b4 6import calendar
676eb3f2 7import codecs
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96
PH
10import datetime
11import email.utils
f45c185f 12import errno
d77c3dfd 13import gzip
b7ab0590 14import itertools
03f9daab 15import io
f4bfd65f 16import json
d77c3dfd 17import locale
02dbf93f 18import math
d77c3dfd 19import os
4eb7f1d1 20import pipes
c496ca96 21import platform
d77c3dfd 22import re
13ebea79 23import ssl
c496ca96 24import socket
b53466e1 25import struct
1c088fa8 26import subprocess
d77c3dfd 27import sys
181c8655 28import tempfile
01951dda 29import traceback
bcf89ce6 30import xml.etree.ElementTree
d77c3dfd 31import zlib
d77c3dfd 32
8c25f81b
PH
33from .compat import (
34 compat_chr,
35 compat_getenv,
36 compat_html_entities,
8c25f81b
PH
37 compat_parse_qs,
38 compat_str,
39 compat_urllib_error,
40 compat_urllib_parse,
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
43 compat_urlparse,
7d4111ed 44 shlex_quote,
8c25f81b 45)
4644ac55
S
46
47
468e2e92
FV
# The re module exposes no public name for the type of a compiled pattern,
# so derive it from an actual compiled object.
compiled_regex_type = type(re.compile(''))

# Headers that YoutubeDLHandler.http_request adds to a request when the
# request does not already carry them.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 58
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or names a codec that cannot encode text, 'UTF-8' is
    returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: the locale may name one Python does not know
        'TEST'.encode(pref)
    except Exception:
        # Deliberately broad, but no longer swallows KeyboardInterrupt or
        # SystemExit the way a bare "except:" did
        pref = 'UTF-8'

    return pref
d77c3dfd 72
f4bfd65f 73
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is first written to a temporary file created next to fn and
    then renamed over fn. If anything fails, the temporary file is removed
    (best effort) and the original exception is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(f).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**args)

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except:
        # Bare except is intentional: clean up the temporary file on any
        # failure (including interrupts), then re-raise unchanged
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
126
127
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression, so restrict
        # them to characters that cannot break out of the predicate
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element under xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        candidates = (el for el in node.findall(xpath) if el.attrib.get(key) == val)
        return next(candidates, None)
146
d7e66d39
JMF
147# On python2.6 the xml.etree.ElementTree.Element methods don't support
148# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path into '{uri}tag' using ns_map.

    Steps without a prefix are left untouched. An unknown prefix raises
    KeyError; a step with more than one ':' raises ValueError.
    """
    def qualify(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(qualify(step) for step in path.split('/'))
159
d77c3dfd 160
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath under node.

    When nothing matches, returns None, or raises ExtractorError if fatal
    is set (the message uses name, falling back to the xpath itself).
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None:
        return found.text
    if not fatal:
        return None
    label = xpath if name is None else name
    raise ExtractorError('Could not find XML element %s' % label)
173
174
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the passed HTML document"""
    return get_element_by_attribute(attribute='id', value=id, html=html)
178
12ea2f30 179
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document

    Returns None when no such tag is found. The content is HTML-unescaped.
    """
    # Verbose regex: opening tag with the wanted attribute anywhere among
    # its attributes, lazily captured content, then the matching close tag
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    match = re.search(pattern, html)
    if match is None:
        return None

    content = match.group('content')
    # Content that begins with a quote has its first and last character dropped
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 201
9e6dd238
FV
202
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Line breaks in the source are flattened, <br> and paragraph boundaries
    become newlines, all remaining tags are dropped and entities are
    unescaped. None is passed through unchanged, as unescapeHTML does.
    """
    if html is None:
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
9e6dd238
FV
214
215
def _remove_win32_forbidden_chars(path):
    """Replace characters forbidden in win32 file names with '#', keeping
    the drive prefix and the platform's path separators intact."""
    drive, rest = os.path.splitdrive(path)
    separators = os.sep + (os.altsep or '')
    # The capturing group keeps the separators in the split result
    pieces = re.split('([%s])' % re.escape(separators), rest)
    return drive + ''.join(
        piece if len(piece) == 1 and piece in separators
        else re.sub('[/<>:"\\|\\\\?\\*]', '#', piece)
        for piece in pieces)


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it retries once
    with win32-forbidden characters replaced by '#'. If that name is no
    different, or opening it fails too, the exception is raised like the
    standard open() function would. Permission errors (EACCES) are never
    retried. The filename '-' selects standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = _remove_win32_forbidden_chars(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
249
250
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 258
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            # Shell-unfriendly punctuation, whitespace and non-ASCII
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    cleaned = ''.join(replace_insane(ch) for ch in s)
    if is_id:
        return cleaned

    # Collapse runs of underscores and trim them from both ends
    while '__' in cleaned:
        cleaned = cleaned.replace('__', '_')
    cleaned = cleaned.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and cleaned.startswith('-_'):
        cleaned = cleaned[2:]
    return cleaned if cleaned else '_'
d77c3dfd
FV
290
def orderedSet(iterable):
    """ Return a list of the items of iterable without duplicates, keeping first-seen order """
    # Membership is tested against the list, so unhashable items work too
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 298
912b38b4 299
4e408e47
PH
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities and decimal or
    hexadecimal character references are resolved; anything else is
    returned in its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric reference. Hex digits include a-f, and the whole entity must
    # be numeric ('#x2F' is U+002F, not U+0002 followed by junk)
    mobj = re.match(r'#([xX][0-9a-fA-F]+|[0-9]+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in 'xX':
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Not a valid code point; fall through to the literal form
            pass

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
4e408e47
PH
318
319
def unescapeHTML(s):
    """Replace every '&...;' sequence in s via _htmlentity_transform.

    None is passed through; any other input must be a compat_str.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def expand(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', expand, s)
d77c3dfd 327
8bf48f23
PH
328
def encodeFilename(s, for_subprocess=False):
    """
    Return s in the form expected by file APIs on this platform.

    @param s The name of the file
    @param for_subprocess Whether the result is meant for a subprocess call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up accept unicode strings directly.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
355
f07b74fc
PH
356
def encodeArgument(s):
    """Encode a command line argument via encodeFilename(..., for_subprocess=True)."""
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code that uses byte strings
        # TODO: turn this into an assertion once all post processors pass
        # compat_str
        text = s.decode('ascii')
    return encodeFilename(text, True)
364
365
8271226a
PH
def decodeOption(optval):
    """Return optval as a compat_str, decoding bytes with the preferred encoding.

    None is passed through unchanged.
    """
    if optval is None:
        return optval

    decoded = optval
    if isinstance(decoded, bytes):
        decoded = decoded.decode(preferredencoding())

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 374
4539dd30
PH
def formatSeconds(secs):
    """Format a number of seconds as 'S', 'M:SS' or 'H:MM:SS'.

    The boundaries are inclusive: exactly 60 seconds is '1:00' and exactly
    3600 seconds is '1:00:00'.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
382
a0ddb8a2
PH
383
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build an HTTPS handler suited to the running Python version.

    opts_no_check_certificate disables certificate verification where the
    running Python supports it; kwargs are forwarded to the handler
    constructor.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Prefer TLSv1, fall back to protocol negotiation
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
        # SERVER_AUTH is the purpose for a client that authenticates
        # servers; CLIENT_AUTH builds a server-side context that does not
        # verify the peer's certificate at all
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
        if opts_no_check_certificate:
            # check_hostname has to be switched off first, otherwise
            # setting CERT_NONE raises ValueError
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
ea6d901e 423
732ea2f0 424
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is prepended.
        """

        # Errors raised while handling these exceptions are always "expected"
        expected_types = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_types:
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors get bug-reporting instructions appended
            update_cmd = (
                'type youtube-dl -U to update'
                if ytdl_is_updateable()
                else 'see https://yt-dl.org/update on how to update')
            msg += '; please report this issue on https://yt-dl.org/bug .'
            msg += ' Make sure you are using the latest version; %s.' % update_cmd
            msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if there is none."""
        if self.traceback is None:
            return None
        formatted = traceback.format_tb(self.traceback)
        return ''.join(formatted)
01951dda 457
1c256f70 458
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
462
463
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept so callers can inspect or print the underlying failure
        self.exc_info = exc_info
d77c3dfd
FV
475
476
class SameFileError(Exception):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
484
485
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to indicate
    an error in the postprocessing task.
    """

    def __init__(self, msg):
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd
FV
494
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
498
499
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
507
508
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Give the exception a readable message instead of a bare tuple
        super(ContentTooShortError, self).__init__(
            'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 523
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress data as a raw deflate stream, else as a zlib stream."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl carrying code, whether or not the constructor accepts it."""
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        wrapped = addinfourl(stream, headers, url)
        wrapped.code = code
        return wrapped

    def http_request(self, req):
        """Add the standard headers and apply the Youtubedl-* pseudo headers."""
        for header, value in std_headers.items():
            if header not in req.headers:
                req.add_header(header, value)
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            # The pseudo header replaces any User-agent already set
            req.headers.pop('User-agent', None)
            req.headers['User-agent'] = req.headers.pop('Youtubedl-user-agent')

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp, transparently decompressed if gzip or deflate encoded."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off
                for trim in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-trim]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response
bf50b038 609
5de90176 610
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date

    delimiter is the character between the date and time parts. A trailing
    'Z' or '+HH:MM' / '-HHMM' offset (optionally preceded by fractional
    seconds) is honoured; without one the time is taken as UTC.
    """

    if date_str is None:
        return None

    tz_match = re.search(
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    offset = datetime.timedelta()
    if tz_match:
        # Cut the fraction/timezone suffix so strptime sees only date and time
        date_str = date_str[:-len(tz_match.group(0))]
        if tz_match.group('sign'):
            direction = 1 if tz_match.group('sign') == '+' else -1
            offset = datetime.timedelta(
                hours=direction * int(tz_match.group('hours')),
                minutes=direction * int(tz_match.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    utc_time = datetime.datetime.strptime(date_str, date_format) - offset
    return calendar.timegm(utc_time.timetuple())
634
635
bf50b038
JMF
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    Returns None for None input or when no known format matches.
    """

    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    cleaned = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', cleaned)
    known_formats = (
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%Y/%m/%d %H:%M:%S',
        '%d/%m/%Y %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    )

    result = None
    # Every format is tried; when several match, the last one wins
    for fmt in known_formats:
        try:
            parsed = datetime.datetime.strptime(cleaned, fmt)
            result = parsed.strftime('%Y%m%d')
        except ValueError:
            continue

    if result is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            result = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return result
683
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url, falling back to default_ext.

    The guess is whatever follows the last '.' before any '?'; it is only
    accepted when it is purely alphanumeric.
    """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if not re.match(r'^[A-Za-z0-9]+$', candidate):
        return default_ext
    return candidate
73e79f2a 692
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
d4051a8e 695
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months count as 30 days and years as 365 days. A string in neither
    form raises ValueError (from strptime)."""
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        amount = int(match.group('time'))
        if sign == '-':
            amount = -amount
        unit = match.group('unit')
        # timedelta has no month or year unit, so approximate both in days
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: amount})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
721
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.
    Strings that are not exactly eight digits are returned unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    return '-'.join(match.groups())
730
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str.

        A missing bound defaults to the earliest / latest representable date.
        Raises ValueError when start is after end.
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed as a date string
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
c496ca96
PH
756
757
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
766
767
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> identifier passed to GetStdHandle
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    console_handle = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    chars_written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(console_handle):
        return False

    def next_nonbmp_pos(text):
        # Index of the first character above U+FFFF, or len(text) if none
        try:
            return next(i for i, c in enumerate(text) if ord(c) > 0xffff)
        except StopIteration:
            return len(text)

    while s:
        # Write at most 1024 characters, stopping before a non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        succeeded = WriteConsoleW(
            console_handle, s, count if count else 2,
            ctypes.byref(chars_written), None)
        if succeeded == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert chars_written.value == 2
            s = s[1:]
        else:
            assert chars_written.value > 0
            s = s[chars_written.value:]
    return True
838
839
def write_string(s, out=None, encoding=None):
    """Write the compat_str s to out (sys.stderr by default) and flush it.

    encoding, when given, is used to encode s for byte-oriented streams.
    """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(stream, 'fileno'):
        if _windows_write_string(s, stream):
            return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(stream, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        payload = s.encode(encoding or preferredencoding(), 'ignore')
        stream.write(payload)
    elif hasattr(stream, 'buffer'):
        # Text stream over a binary buffer: encode ourselves and write raw
        enc = encoding or getattr(stream, 'encoding', None) or preferredencoding()
        payload = s.encode(enc, 'ignore')
        stream.buffer.write(payload)
    else:
        stream.write(s)
    stream.flush()
860
861
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the byte values of bs as a list of ints."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Indexing yielded a 1-character string rather than an int
        return [ord(c) for c in bs]
    return list(bs)  # Python 3
869
c257baff 870
def intlist_to_bytes(xs):
    """Pack the ints in xs, one unsigned byte each, via struct_pack."""
    if xs:
        return struct_pack('%dB' % len(xs), *xs)
    return b''
c38b1e77
PH
875
876
c1c9a79c
PH
877# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high DWORDs of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f via LockFileEx; raises OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object because _unlock_file needs it again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(handle, flags, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(handle, 0,
                                whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared flock on f."""
        operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.flock(f, operation)

    def _unlock_file(f):
        """Release the flock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
940
941
class locked_file(object):
    """Context manager around an opened file that is locked while in use.

    The file is opened in __init__ with io.open(). Entering the context
    locks it (shared for mode 'r', exclusive for 'a' and 'w'); leaving the
    context unlocks and closes it.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is not an IOError
            # on Python 2; catch both so the file is never leaked when
            # locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking raised.
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
971
972
4644ac55
S
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
976
977
def shell_quote(args):
    """Quote every argument with pipes.quote and join them with spaces.

    Byte strings are first decoded with the filesystem encoding.
    """
    encoding = get_filesystem_encoding()

    def _as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        if isinstance(arg, bytes):
            return arg.decode(encoding)
        return arg

    return ' '.join([pipes.quote(_as_text(a)) for a in args])
9d4660ca
PH
987
988
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the last evaluated element
        (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
996
997
9d4660ca
PH
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The data is JSON-serialized, URL-encoded and appended as the fragment.
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + '#' + fragment
9d4660ca
PH
1004
1005
def unsmuggle_url(smug_url, default=None):
    """Split a URL built by smuggle_url into (url, data).

    When no smuggled data is present, return (smug_url, default).
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # Everything after the last '#' is the URL-encoded payload.
    url, _, encoded = smug_url.rpartition('#')
    serialized = compat_parse_qs(encoded)['__youtubedl_smuggle'][0]
    return url, json.loads(serialized)
02dbf93f
PH
1013
1014
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count with a binary suffix, e.g. 1536 -> '1.50KiB'.

    Returns 'N/A' for None. A str argument is converted with float() first.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp into the suffix table: tiny positive values would otherwise
        # yield a negative index (picking a suffix from the end of the list)
        # and huge values would raise IndexError.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
f53c966a 1027
1c088fa8 1028
def get_term_width():
    """Return the terminal width in columns, or None if it cannot be found.

    The COLUMNS environment variable takes precedence; otherwise the second
    field printed by `stty size` is used.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, unexpected output, ...), but do
        # not swallow KeyboardInterrupt/SystemExit like a bare except would.
        pass
    return None
caefb1de
PH
1043
1044
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    # Unknown names map to None.
    if name not in ENGLISH_NAMES:
        return None
    return ENGLISH_NAMES.index(name) + 1
18258362
JMF
1055
1056
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start a known entity by '&amp;'"""
    # The negative lookahead skips the named entities and the numeric
    # character references listed in the pattern.
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
e3946f98
PH
1063
1064
def setproctitle(title):
    """Try to set the process name through libc's prctl().

    Silently does nothing when libc.so.6 cannot be loaded or when the
    loaded library has no prctl symbol.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes ctypes allocate one extra
    # byte and NUL-terminate it. A buffer of exactly len(title_bytes) bytes
    # is not terminated, so prctl could read past its end.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME (see prctl(2)).
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1078
1079
def remove_start(s, start):
    """Return s without the prefix start; s unchanged if it is not a prefix."""
    if not s.startswith(start):
        return s
    return s[len(start):]
29eb5174
PH
1084
1085
2b9faf55
PH
def remove_end(s, end):
    """Return s without the suffix end; s unchanged if it is not a suffix."""
    # An empty suffix must be special-cased: s[:-0] is s[:0], i.e. the
    # empty string, which would wipe out s entirely.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1090
1091
def url_basename(url):
    """Return the last '/'-separated segment of the URL's path."""
    path = compat_urlparse.urlparse(url).path
    # Surrounding slashes are dropped first so that a trailing '/' does not
    # produce an empty basename.
    segments = path.strip('/').split('/')
    return segments[-1]
aa94a6d3
PH
1095
1096
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose get_method() always reports HEAD."""

    def get_method(self):
        method = "HEAD"
        return method
7217e148
PH
1100
1101
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, scaled by invscale // scale; default if v is unset.

    With get_attr, the attribute of that name is read from v first (missing
    attributes count as None). Both None and '' yield default.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    return int(v) * invscale // scale
1109
9572013d 1110
40a90862
JMF
def str_or_none(v, default=None):
    """Return compat_str(v), or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1113
9732d77e
PH
1114
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop the ',', '.' and '+' characters before converting.
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1121
1122
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, scaled by invscale / scale; default if v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1125
1126
608d11f5
PH
def parse_duration(s):
    """Parse a duration string such as '1:23:45' or '3 min 5 s' into seconds.

    Returns None for None input or when the string does not match. The
    result is an int unless a fractional seconds part is present.
    """
    if s is None:
        return None

    m = re.match(
        r'''(?ix)T?
        (?:
            (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
            (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
        )?
        (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$''',
        s.strip())
    if m is None:
        return None

    total = int(m.group('secs'))
    # Optional larger units and the number of seconds each one is worth.
    for unit, factor in (('mins', 60), ('hours', 60 * 60)):
        value = m.group(unit)
        if value:
            total += int(value) * factor
    fraction = m.group('ms')
    if fraction:
        total += float(fraction)
    return total
91d7d0b3
JMF
1150
1151
def prepend_extension(filename, ext):
    """Insert ext in front of the filename's extension (a.mp4 -> a.ext.mp4)."""
    stem, real_ext = os.path.splitext(filename)
    template = '{0}.{1}{2}'
    return template.format(stem, ext, real_ext)
d70ad093
PH
1155
1156
def check_executable(exe, args=[]):
    """ Checks if the given binary can be launched and returns its name,
    or False when starting it raises OSError.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1165
1166
95807118
PH
def get_exe_version(exe, args=['--version'],
                    version_re=r'version\s+([-0-9._a-zA-Z]+)',
                    unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.

    The first line of the combined stdout/stderr output is searched with
    version_re; its first group is the version. When nothing matches,
    unrecognized is returned instead. """
    # NOTE: the hyphen is placed first in the default character class so it
    # is a literal '-'. Written as '._-a' it forms a range and versions
    # such as '1.2-rc3' are cut off at the hyphen.
    try:
        out, err = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
    m = re.search(version_re, firstline)
    if m:
        return m.group(1)
    return unrecognized
1184
1185
class PagedList(object):
    """Base class for paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1190
9c44d242
PH
1191
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages one by one until a short page is seen.

    pagefunc(pagenum) must return an iterable with the entries of that page;
    pagesize is the number of entries on a full page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list."""
        collected = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page.
            if firstid <= start < nextfirstid:
                startv = start % self._pagesize
            else:
                startv = 0

            # Offset just past the last wanted entry, if the slice ends here.
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % self._pagesize) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            collected.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return collected
81c2f20b
PH
1233
1234
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list for which the total number of pages is known up front.

    pagefunc(pagenum) must return an iterable with the entries of that page.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list."""
        collected = []
        start_page = start // self._pagesize
        if end is None:
            end_page = self._pagecount
            only_more = None
        else:
            end_page = end // self._pagesize + 1
            only_more = end - start
        # Entries to drop from the front of the first fetched page.
        skip_elems = start - start_page * self._pagesize
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) >= only_more:
                    # This page completes the requested slice.
                    collected.extend(page[:only_more])
                    break
                only_more -= len(page)
            collected.extend(page)
        return collected
1262
1263
def uppercase_escape(s):
    """Expand every \\UXXXXXXXX escape sequence in s to its character."""
    decode = codecs.getdecoder('unicode_escape')

    def _expand(match):
        # The decoder returns (text, consumed); only the text is needed.
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _expand, s)
b53466e1 1270
d05cfe06
S
1271
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, unicode input is encoded to UTF-8 before quoting.
    is_py2_unicode = sys.version_info < (3, 0) and isinstance(s, unicode)
    if is_py2_unicode:
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
d05cfe06
S
1277
1278
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # Only these four components are escaped; the others are kept as parsed.
    escaped = parts._replace(
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment)
    )
    return escaped.geturl()
1288
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        encoded = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(encoded, *args)

    def struct_unpack(spec, *args):
        encoded = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(encoded, *args)
else:
    # struct accepts text format strings here; use it directly.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1305
1306
def read_batch_urls(batch_fd):
    """Read the URLs from batch_fd, one per line, and close it afterwards.

    Non-text lines are decoded as UTF-8, a leading byte order mark is
    removed, whitespace is stripped, and empty lines as well as lines
    starting with '#', ';' or ']' are dropped.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # The BOM shows up as U+FEFF when the input was decoded as UTF-8,
        # and as the three characters '\xef\xbb\xbf' when its UTF-8 bytes
        # were decoded with a single-byte encoding. Handle both.
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1321
1322
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1325
1326
0990305d
PH
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        # Fallback used when Element has no iter method.
        return n.findall('.//*')
1331
1332
bcf89ce6
PH
def parse_xml(s):
    """Parse the text s as XML and return the root element.

    Doctype declarations are ignored. On Python 2, element texts that are
    not text strings are decoded from UTF-8.
    """
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # The custom parser is only passed on Python 2.7 and newer.
    if sys.version_info >= (2, 7):
        kwargs = {'parser': parser}
    else:
        kwargs = {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return tree
e68301af
PH
1348
1349
a1a530b0
PH
# Age limit associated with each US rating label.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
1357
1358
146c80e2
S
def parse_age_limit(s):
    """Parse an age limit like '18' or '18+', or look s up in US_RATINGS.

    Returns None for None input and for unknown values.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
146c80e2
S
1364
1365
def strip_jsonp(code):
    """Strip the callback wrapper from JSONP, leaving the argument text."""
    # callback( <payload> ) optionally followed by ';' and '//' comments.
    wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(wrapper, r'\1', code)
478c2c61
PH
1369
1370
e05f6939
PH
def js_to_json(code):
    """Convert a JavaScript object literal to JSON text.

    Single-quoted strings and bare identifiers become double-quoted
    strings, and a trailing comma before ']' is removed.
    """
    # Rewrites applied inside a formerly single-quoted string.
    escapes = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        # JSON literals and double-quoted strings are already valid.
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: escapes[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    converted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), converted)
1394
1395
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # The position in quality_ids is the quality; unknown ids rank -1.
        try:
            position = quality_ids.index(qid)
        except ValueError:
            return -1
        else:
            return position
    return q
1404
acd69589
PH
1405
# %-style template with the named fields title, id and ext.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1407
a020a0dc
PH
1408
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    # Leave room for the ellipses so the result is length characters long.
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
1417
1418
def version_tuple(v):
    """Split a dotted version string into a list of ints."""
    return list(map(int, v.split('.')))
1421
1422
def is_outdated_version(version, limit, assume_new=True):
    """Return whether version is older than limit.

    When version is empty or cannot be parsed, the answer is
    `not assume_new`.
    """
    if version:
        try:
            return version_tuple(version) < version_tuple(limit)
        except ValueError:
            # Unparseable version: fall through to the assumption below.
            pass
    return not assume_new
732ea2f0
PH
1430
1431
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when this module was loaded by a zipimporter ...
    if isinstance(globals().get('__loader__'), zipimporter):
        return True
    # ... or when sys carries a 'frozen' attribute.
    return hasattr(sys, 'frozen')
7d4111ed
PH
1437
1438
def args_to_str(args):
    """Return the arguments shell-quoted and joined by single spaces."""
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)