]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[rtlxl] Use m3u8 streams instead of f4m (#4115, #4118)
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
912b38b4 6import calendar
676eb3f2 7import codecs
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96
PH
10import datetime
11import email.utils
f45c185f 12import errno
d77c3dfd 13import gzip
b7ab0590 14import itertools
03f9daab 15import io
f4bfd65f 16import json
d77c3dfd 17import locale
02dbf93f 18import math
d77c3dfd 19import os
4eb7f1d1 20import pipes
c496ca96 21import platform
d77c3dfd 22import re
13ebea79 23import ssl
c496ca96 24import socket
b53466e1 25import struct
1c088fa8 26import subprocess
d77c3dfd 27import sys
181c8655 28import tempfile
01951dda 29import traceback
bcf89ce6 30import xml.etree.ElementTree
d77c3dfd 31import zlib
d77c3dfd 32
8c25f81b
PH
33from .compat import (
34 compat_chr,
35 compat_getenv,
36 compat_html_entities,
8c25f81b
PH
37 compat_parse_qs,
38 compat_str,
39 compat_urllib_error,
40 compat_urllib_parse,
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
43 compat_urlparse,
44)
4644ac55
S
45
46
468e2e92
FV
# The re module does not expose the type of a compiled pattern under a
# public name, so derive it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Default headers added to outgoing HTTP requests that do not already set them.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 57
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    Falls back to 'UTF-8' when the locale cannot be queried or names a
    codec that Python cannot use.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: an unknown encoding name raises here.
        'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (any locale/codec failure), but unlike a bare
        # "except:" it lets KeyboardInterrupt and SystemExit propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 71
f4bfd65f 72
181c8655
PH
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically

    The data is first written to a temporary file created next to fn and
    then renamed over fn. If anything fails, the temporary file is removed
    and the exception is re-raised.
    """

    if sys.version_info < (3, 0):
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(f).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**args)

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # os.rename refuses to replace an existing file on Windows, so
            # remove the old target first (this step is not atomic there).
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except BaseException:
        # Clean up the temporary file on any failure, then re-raise.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
117
118
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression, so only a
        # conservative character set is accepted.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        for candidate in node.findall(xpath):
            if candidate.attrib.get(key) == val:
                return candidate
        return None
137
d7e66d39
JMF
138# On python2.6 the xml.etree.ElementTree.Element methods don't support
139# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand every 'prefix:tag' step of path to '{uri}tag' using ns_map.

    Steps without a prefix are kept unchanged.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([expand(step) for step in path.split('/')])
150
d77c3dfd 151
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath below node.

    When nothing matches, returns None, or raises ExtractorError if fatal
    is set (name, if given, replaces the xpath in the error message).
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None:
        return found.text
    if not fatal:
        return None
    if name is None:
        name = xpath
    raise ExtractorError('Could not find XML element %s' % name)
164
165
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An ID lookup is simply an attribute lookup on the "id" attribute.
    return get_element_by_attribute(attribute="id", value=id, html=html)
169
12ea2f30 170
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # Verbose-mode pattern: opening tag, any attributes, the wanted
    # attribute=value pair, any further attributes, then the content up to
    # the matching closing tag.
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))
    found = re.search(pattern, html)

    if not found:
        return None
    content = found.group('content')

    # Drop one surrounding character on each side when the content starts
    # with a quote.
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 192
9e6dd238
FV
193
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Returns None unchanged, so optional fields can be passed without a
    prior check.
    """
    if html is None:
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
9e6dd238
FV
205
206
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename '-' selects standard output (its binary buffer when
    available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be fixed by renaming.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # Only the final path component is rewritten: the pattern contains
        # path separators, so applying it to the directory part would
        # destroy the path.
        head, tail = os.path.split(filename)
        alt_filename = os.path.join(
            head, re.sub('[/<>:"\\|\\\\?\\*]', '#', tail))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
240
241
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Unparseable input yields no timestamp.
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 249
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped entirely.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace():
                return '_'
            if code > 127:
                return '_'
        return char

    result = ''.join([replace_insane(char) for char in s])
    if is_id:
        # IDs are kept as-is after the per-character replacement.
        return result

    # Collapse runs of underscores and trim them from both ends.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
d77c3dfd
FV
281
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    unique = []
    for item in iterable:
        # Membership is tested against the list itself, so unhashable
        # elements are supported as well.
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 289
912b38b4 290
4e408e47
PH
291def _htmlentity_transform(entity):
292 """Transforms an HTML entity to a character."""
293 # Known non-numeric HTML entity
294 if entity in compat_html_entities.name2codepoint:
295 return compat_chr(compat_html_entities.name2codepoint[entity])
296
297 mobj = re.match(r'#(x?[0-9]+)', entity)
298 if mobj is not None:
299 numstr = mobj.group(1)
28e614de 300 if numstr.startswith('x'):
4e408e47 301 base = 16
28e614de 302 numstr = '0%s' % numstr
4e408e47
PH
303 else:
304 base = 10
305 return compat_chr(int(numstr, base))
306
307 # Unknown entity in name, return its literal representation
28e614de 308 return ('&%s;' % entity)
4e408e47
PH
309
310
def unescapeHTML(s):
    """Replace every '&...;' sequence in s via _htmlentity_transform.

    None is passed through unchanged.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def replace_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', replace_entity, s)
d77c3dfd 318
8bf48f23
PH
319
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up accept unicode filenames directly.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
346
f07b74fc
PH
347
def encodeArgument(s):
    """Encode s for use as a subprocess argument (see encodeFilename)."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings; once all post processors are
    # fixed this branch should become an internal error.
    return encodeFilename(s.decode('ascii'), True)
355
356
8271226a
PH
def decodeOption(optval):
    """Return optval as text, decoding bytes with the preferred encoding.

    None is passed through unchanged.
    """
    if optval is None:
        return optval
    decoded = optval
    if isinstance(decoded, bytes):
        decoded = decoded.decode(preferredencoding())

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 365
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or plain seconds."""
    if secs > 3600:
        hours = secs // 3600
        minutes = (secs % 3600) // 60
        return '%d:%02d:%02d' % (hours, minutes, secs % 60)
    if secs > 60:
        minutes = secs // 60
        return '%d:%02d' % (minutes, secs % 60)
    return '%d' % secs
373
a0ddb8a2
PH
374
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build the urllib HTTPS handler appropriate for this Python version.

    opts_no_check_certificate disables certificate verification where an
    SSL context is used (Python >= 3.2). Remaining keyword arguments are
    forwarded to the handler constructor.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Prefer TLSv1; fall back to SSLv23 if the handshake fails.
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
        # SERVER_AUTH is the purpose for client-side sockets (we
        # authenticate the server); it enables certificate and hostname
        # verification by default.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
        if opts_no_check_certificate:
            # check_hostname must be cleared before CERT_NONE may be set.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
ea6d901e 414
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # An error raised while one of these exceptions is being handled is
        # always treated as expected.
        expected_causes = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_causes:
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += '; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if there is none."""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
01951dda 441
1c256f70 442
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
446
447
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
459
460
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
468
469
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # The message is kept on the instance for callers reading .msg.
        self.msg = msg
d77c3dfd
FV
478
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
482
483
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
d77c3dfd
FV
491
492
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Class-level defaults; both values are in bytes.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 507
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress data as a raw deflate stream, falling back to zlib-wrapped data."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Create an addinfourl response, setting the code by hand where the class lacks getcode."""
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        wrapped = addinfourl(stream, headers, url)
        wrapped.code = code
        return wrapped

    def http_request(self, req):
        """Add the standard headers and apply the Youtubedl-* pseudo headers."""
        for header, value in std_headers.items():
            if header not in req.headers:
                req.add_header(header, value)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate Content-encoding transparently decoded."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            try:
                uncompressed = io.BytesIO(
                    gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb').read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with up to 1023 trailing bytes cut off.
                for trim in range(1, 1024):
                    try:
                        uncompressed = io.BytesIO(
                            gzip.GzipFile(fileobj=io.BytesIO(content[:-trim]), mode='rb').read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets exactly the same treatment.
    https_request = http_request
    https_response = http_response
bf50b038 593
5de90176 594
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # A trailing "Z" or "+hh:mm"-style offset (optionally preceded by
    # fractional seconds) is cut off and applied by hand.
    offset = datetime.timedelta()
    tz_match = re.search(
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if tz_match:
        date_str = date_str[:-len(tz_match.group(0))]
        if tz_match.group('sign'):
            direction = 1 if tz_match.group('sign') == '+' else -1
            offset = datetime.timedelta(
                hours=direction * int(tz_match.group('hours')),
                minutes=direction * int(tz_match.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    utc_time = datetime.datetime.strptime(date_str, date_format) - offset
    return calendar.timegm(utc_time.timetuple())
618
619
bf50b038
JMF
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    cleaned = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', cleaned)
    format_expressions = (
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%Y/%m/%d %H:%M:%S',
        '%d/%m/%Y %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    )

    # Every format is tried; the last one that parses determines the result.
    upload_date = None
    for date_format in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(cleaned, date_format).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is not None:
        return upload_date

    # Last resort: let the e-mail date parser have a go.
    timetuple = email.utils.parsedate_tz(cleaned)
    if timetuple:
        upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
667
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url, or return default_ext.

    The guess is the text after the last '.' of the part before any '?';
    it is only accepted when purely alphanumeric.
    """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if not re.match(r'^[A-Za-z0-9]+$', candidate):
        return default_ext
    return candidate
73e79f2a 676
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the extension of filename with '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join([stem, sub_lang, sub_format])
d4051a8e 679
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    The result is a datetime.date. Months and years are approximated as
    30 and 365 days. Raises ValueError when the string is neither a
    relative expression nor a valid YYYYMMDD date.
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    match = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # timedelta has no month/year units, so approximate them in days
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta keyword arguments are plural ("days", "weeks")
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
705
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        # Anything that is not eight digits is returned untouched.
        return date_str
    return '-'.join(parts.groups())
714
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # A missing bound defaults to the earliest / latest representable date.
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
740
741
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
750
751
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps file descriptors 1 and 2 to the values passed to GetStdHandle.
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(text):
        # Index of the first character above U+FFFF, or len(text) if none.
        for pos, char in enumerate(text):
            if ord(char) > 0xffff:
                return pos
        return len(text)

    while s:
        # Write at most 1024 characters per call, stopping in front of the
        # next non-BMP character.
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
822
823
def write_string(s, out=None, encoding=None):
    """Write the text s to out (default: sys.stderr) and flush it.

    The text is encoded by hand for binary streams and for streams that
    expose an underlying buffer; encoding overrides the codec used.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
844
845
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    # Indexing yields one-character strings here.
    return list(map(ord, bs))
853
c257baff 854
def intlist_to_bytes(xs):
    """Pack a sequence of integer byte values into a byte string."""
    if xs:
        # One unsigned byte ('B') per element.
        return struct_pack('%dB' % len(xs), *xs)
    return b''
c38b1e77
PH
859
860
c1c9a79c
PH
861# Cross-platform file locking
862if sys.platform == 'win32':
863 import ctypes.wintypes
864 import msvcrt
865
866 class OVERLAPPED(ctypes.Structure):
867 _fields_ = [
868 ('Internal', ctypes.wintypes.LPVOID),
869 ('InternalHigh', ctypes.wintypes.LPVOID),
870 ('Offset', ctypes.wintypes.DWORD),
871 ('OffsetHigh', ctypes.wintypes.DWORD),
872 ('hEvent', ctypes.wintypes.HANDLE),
873 ]
874
875 kernel32 = ctypes.windll.kernel32
876 LockFileEx = kernel32.LockFileEx
877 LockFileEx.argtypes = [
878 ctypes.wintypes.HANDLE, # hFile
879 ctypes.wintypes.DWORD, # dwFlags
880 ctypes.wintypes.DWORD, # dwReserved
881 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
882 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
883 ctypes.POINTER(OVERLAPPED) # Overlapped
884 ]
885 LockFileEx.restype = ctypes.wintypes.BOOL
886 UnlockFileEx = kernel32.UnlockFileEx
887 UnlockFileEx.argtypes = [
888 ctypes.wintypes.HANDLE, # hFile
889 ctypes.wintypes.DWORD, # dwReserved
890 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
891 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
892 ctypes.POINTER(OVERLAPPED) # Overlapped
893 ]
894 UnlockFileEx.restype = ctypes.wintypes.BOOL
895 whole_low = 0xffffffff
896 whole_high = 0x7fffffff
897
898 def _lock_file(f, exclusive):
899 overlapped = OVERLAPPED()
900 overlapped.Offset = 0
901 overlapped.OffsetHigh = 0
902 overlapped.hEvent = 0
903 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
904 handle = msvcrt.get_osfhandle(f.fileno())
905 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
906 whole_low, whole_high, f._lock_file_overlapped_p):
907 raise OSError('Locking file failed: %r' % ctypes.FormatError())
908
909 def _unlock_file(f):
910 assert f._lock_file_overlapped_p
911 handle = msvcrt.get_osfhandle(f.fileno())
912 if not UnlockFileEx(handle, 0,
913 whole_low, whole_high, f._lock_file_overlapped_p):
914 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
915
916else:
917 import fcntl
918
919 def _lock_file(f, exclusive):
2582bebe 920 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
c1c9a79c
PH
921
922 def _unlock_file(f):
2582bebe 923 fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
924
925
class locked_file(object):
    """Context manager that opens a file and holds a lock on it while active.

    Mode 'r' takes a shared lock; 'a' and 'w' take an exclusive one.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the open file when locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # The file is closed even if unlocking raises.
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
955
956
4644ac55
S
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
960
961
def shell_quote(args):
    """Return the arguments joined into one shell-escaped command string.

    args is an iterable of strings.  Byte strings (for example filenames
    produced by 'encodeFilename') are decoded with the filesystem encoding
    before quoting.
    """
    try:
        from shlex import quote  # Python 3.3+
    except ImportError:
        # Python 2 only offers the quoting helper in the pipes module
        from pipes import quote

    encoding = None
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            if encoding is None:
                encoding = get_filesystem_encoding()
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
971
972
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the element that ends the
    run (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
980
981
9d4660ca
PH
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialized as JSON, URL-encoded under the key
    '__youtubedl_smuggle' and appended to url after a '#'. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    encoded = compat_urllib_parse.urlencode(payload)
    return url + '#' + encoded
9d4660ca
PH
988
989
def unsmuggle_url(smug_url, default=None):
    """ Split a URL built by smuggle_url into (url, data).

    When smug_url carries no '#__youtubedl_smuggle' marker it is returned
    unchanged together with default. """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # The smuggled payload is everything after the last '#'
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
02dbf93f
PH
997
998
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as a string with a binary (1024-based) suffix.

    bytes may be None (yields 'N/A'), a number, or a numeric string.
    Magnitudes below one byte are reported in 'B', and magnitudes beyond the
    largest known suffix are reported in 'YiB' instead of failing.
    """
    if bytes is None:
        return 'N/A'
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    value = float(bytes)
    if value == 0.0:
        exponent = 0
    else:
        # Use the magnitude so that negative values do not hit a math domain
        # error, then clamp the exponent to the available suffixes: tiny
        # fractions would otherwise give a negative index and huge values
        # would run off the end of the list.
        exponent = int(math.log(abs(value), 1024.0))
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = value / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
f53c966a 1011
1c088fa8 1012
def get_term_width():
    """Return the terminal width in columns, or None if it cannot be found.

    The COLUMNS environment variable takes precedence; otherwise the output
    of `stty size` is consulted.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        # `stty size` prints "<rows> <columns>"
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, no tty, unexpected output), but do
        # not swallow KeyboardInterrupt/SystemExit like a bare except would.
        pass
    return None
caefb1de
PH
1027
1028
def month_by_name(name):
    """ Return the 1-based number of a month given its full English name
    (independent of the locale), or None for an unknown name """
    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    if name in ENGLISH_NAMES:
        return 1 + ENGLISH_NAMES.index(name)
    return None
18258362
JMF
1039
1040
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' by '&amp;' in XML.

    Ampersands that already start one of the predefined entities (amp, lt,
    gt, apos, quot) or a numeric character reference of any length are left
    untouched.
    """
    return re.sub(
        # A character reference needs at least one digit and may have more
        # than four (e.g. &#x1F600;), hence '+' rather than a bounded count.
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]+;|#[0-9]+;)',
        '&amp;',
        xml_str)
e3946f98
PH
1047
1048
def setproctitle(title):
    """Try to set the process name to title; silently do nothing on failure.

    title must be a text (compat_str) string.  The name is set by calling
    prctl from libc.so.6 with option 15 (PR_SET_NAME on Linux); if that
    library cannot be loaded or lacks prctl, the call is skipped.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one byte larger than
    # the title, so the string handed to prctl is always NUL-terminated.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1062
1063
def remove_start(s, start):
    """Return s without the leading start, or s unchanged if it is absent."""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1068
1069
2b9faf55
PH
def remove_end(s, end):
    """Return s without the trailing end, or s unchanged if it is absent."""
    # An empty suffix must be special-cased: s[:-0] is s[:0], i.e. '',
    # which would wrongly discard the whole string.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1074
1075
def url_basename(url):
    """Return the last '/'-separated component of the path of url."""
    url_path = compat_urlparse.urlparse(url).path
    components = url_path.strip('/').split('/')
    return components[-1]
aa94a6d3
PH
1079
1080
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always reported as HEAD."""

    def get_method(self):
        method = "HEAD"
        return method
7217e148
PH
1084
1085
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, returning default when there is no value.

    If get_attr is given and v is not None, the attribute of that name is
    read from v first (a missing attribute counts as None).  None and the
    empty string yield default; otherwise int(v) * invscale // scale is
    returned.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1093
9572013d 1094
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1097
9732d77e
PH
1098
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    are dropped from the string before it is converted """
    if int_str is None:
        return None
    return int(re.sub(r'[,\.\+]', '', int_str))
608d11f5
PH
1105
1106
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1109
1110
608d11f5
PH
def parse_duration(s):
    """Parse a duration string into a number of seconds.

    Accepts forms such as '1:23:45', '3h11m53s' or 'T30M38S', with an
    optional fractional part on the seconds.  Returns None for None input or
    when the string does not match; the result is an int unless a
    fractional part was present.
    """
    if s is None:
        return None

    m = re.match(
        r'''(?ix)T?
        (?:
            (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
            (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
        )?
        (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$''',
        s.strip())
    if not m:
        return None

    hours, mins, secs, ms = m.group('hours', 'mins', 'secs', 'ms')
    total = int(secs)
    if mins:
        total += int(mins) * 60
    if hours:
        total += int(hours) * 60 * 60
    if ms:
        # ms holds the fractional part including its leading dot
        total += float(ms)
    return total
91d7d0b3
JMF
1134
1135
def prepend_extension(filename, ext):
    """Insert ext in front of the existing extension of filename."""
    parts = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(parts[0], ext, parts[1])
d70ad093
PH
1139
1140
def check_executable(exe, args=[]):
    """ Check whether the given binary can be run: return its name if so,
    False if starting it raises OSError.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    else:
        return exe
b7ab0590
PH
1149
1150
95807118
PH
def get_exe_version(exe, args=['--version'],
                    version_re=r'version\s+([-0-9._a-zA-Z]+)',
                    unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.

    The executable is run with args, and version_re is searched in the
    first line of its combined stdout/stderr output; group 1 of the match
    is the version.  If nothing matches, unrecognized is returned. """
    # NOTE: the hyphen comes first in the character class of the default
    # version_re so that it is a literal; in the middle of the class it
    # would form a range and exclude most lowercase letters.
    try:
        out, err = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
    m = re.search(version_re, firstline)
    if m:
        return m.group(1)
    else:
        return unrecognized
1168
1169
class PagedList(object):
    """Base class for paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1174
9c44d242
PH
1175
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages one by one until a short page is seen.

    pagefunc(pagenum) must return an iterable with the entries of the given
    0-based page; pagesize is the number of entries on a full page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        size = self._pagesize
        collected = []
        for pagenum in itertools.count(start // size):
            page_first = pagenum * size
            next_first = page_first + size
            if start >= next_first:
                continue

            entries = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry inside this page
            if page_first <= start < next_first:
                lo = start % size
            else:
                lo = 0

            # Offset just past the last wanted entry, if the slice ends here
            if end is not None and page_first <= end <= next_first:
                hi = ((end - 1) % size) + 1
            else:
                hi = None

            if lo != 0 or hi is not None:
                entries = entries[lo:hi]
            collected.extend(entries)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(entries) + lo < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == next_first:
                break
        return collected
81c2f20b
PH
1217
1218
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list for sources whose total number of pages is known up front.

    pagefunc(pagenum) must return an iterable with the entries of the given
    0-based page, pagecount is the total number of pages and pagesize the
    number of entries per page.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        # Never ask for pages beyond the known page count, even when end
        # points past the last entry.
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        # Entries to drop from the front of the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Entries still wanted; None means "no upper bound"
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1246
1247
def uppercase_escape(s):
    """Decode every literal \\UXXXXXXXX escape sequence occurring in s."""
    decode = codecs.getdecoder('unicode_escape')

    def _decode_match(m):
        # The decoder returns (decoded_text, consumed_length)
        return decode(m.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _decode_match, s)
b53466e1 1254
d05cfe06
S
1255
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    is_python2 = sys.version_info < (3, 0)
    if is_python2 and isinstance(s, unicode):
        # On Python 2, text is UTF-8 encoded before it is quoted
        s = s.encode('utf-8')
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
d05cfe06
S
1261
1262
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # Only these components are escaped; the remaining ones are kept as parsed
    escaped = parts._replace(
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
1272
# struct_pack / struct_unpack accept a text format spec on every supported
# Python version.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        return struct.pack(
            spec.encode('ascii') if isinstance(spec, compat_str) else spec,
            *args)

    def struct_unpack(spec, *args):
        return struct.unpack(
            spec.encode('ascii') if isinstance(spec, compat_str) else spec,
            *args)
else:
    # struct already accepts text specs; use it directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1289
1290
def read_batch_urls(batch_fd):
    """Read the URLs from a batch file object and close it.

    batch_fd is iterated line by line (bytes lines are decoded as UTF-8).
    A leading byte order mark and surrounding whitespace are stripped from
    each line; empty lines and lines starting with '#', ';' or ']' are
    skipped.  Returns the list of remaining URLs.
    """
    # A BOM shows up as '\ufeff' in properly decoded text, and as the three
    # characters '\xef\xbb\xbf' when the UTF-8 bytes were read one to one.
    boms = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in boms:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1305
1306
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1309
1310
0990305d
PH
# etree_iter(node) iterates over an element tree node, using Element.iter
# where it exists.
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1315
1316
bcf89ce6
PH
def parse_xml(s):
    """Parse the XML text s into an element tree, ignoring any doctype."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # The custom parser is only passed on Python 2.7 and newer
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)

    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            text = node.text
            if text is not None and not isinstance(text, compat_str):
                node.text = text.decode('utf-8')
    return tree
e68301af
PH
1332
1333
a1a530b0
PH
# Numeric value associated with each US rating label; used as a lookup
# table by parse_age_limit.
US_RATINGS = {
    'G': 0, 'PG': 10, 'PG-13': 13, 'R': 16, 'NC': 18,
}
fac55558
PH
1341
1342
146c80e2
S
1343def parse_age_limit(s):
1344 if s is None:
d838b1bd 1345 return None
146c80e2 1346 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
d838b1bd 1347 return int(m.group('age')) if m else US_RATINGS.get(s, None)
146c80e2
S
1348
1349
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, returning the callback's argument.

    Text that does not look like `callback(...)` is returned unchanged.
    """
    jsonp_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_re, r'\1', code)
478c2c61
PH
1353
1354
e05f6939
PH
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    Single-quoted strings are re-quoted with double quotes, bare identifiers
    (other than true/false/null) are quoted, and a trailing comma before a
    closing ']' is removed.
    """
    # How each sequence inside a single-quoted string is rewritten so that it
    # stays correct inside a double-quoted string
    requote = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: requote[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    converted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), converted)
1378
1379
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values:
    the returned function maps an id to its index in quality_ids,
    or to -1 if the id is not listed """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
1388
acd69589
PH
1389
# %-style template with the named fields title, id and ext
# (presumably the default output filename template -- confirm against callers)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1391
a020a0dc
PH
1392
def limit_length(s, length):
    """ Add ellipses to overly long strings: a string longer than length
    is cut so that, with '...' appended, it is length characters long """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
1401
1402
def version_tuple(v):
    """Split a dotted version string into a list of ints."""
    return list(map(int, v.split('.')))
1405
1406
def is_outdated_version(version, limit, assume_new=True):
    """Return whether version is older than limit.

    An empty version, or one whose components are not integers, is treated
    as new when assume_new is true and as outdated otherwise.
    """
    unknown_result = not assume_new
    if not version:
        return unknown_result
    try:
        current = version_tuple(version)
        minimum = version_tuple(limit)
    except ValueError:
        return unknown_result
    return current < minimum