]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[airmozilla] Be more tolerant when nonessential items are missing (#5030)
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
912b38b4 6import calendar
676eb3f2 7import codecs
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96
PH
10import datetime
11import email.utils
f45c185f 12import errno
be4a824d 13import functools
d77c3dfd 14import gzip
b7ab0590 15import itertools
03f9daab 16import io
f4bfd65f 17import json
d77c3dfd 18import locale
02dbf93f 19import math
347de493 20import operator
d77c3dfd 21import os
4eb7f1d1 22import pipes
c496ca96 23import platform
d77c3dfd 24import re
13ebea79 25import ssl
c496ca96 26import socket
b53466e1 27import struct
1c088fa8 28import subprocess
d77c3dfd 29import sys
181c8655 30import tempfile
01951dda 31import traceback
bcf89ce6 32import xml.etree.ElementTree
d77c3dfd 33import zlib
d77c3dfd 34
8c25f81b 35from .compat import (
8f9312c3 36 compat_basestring,
8c25f81b
PH
37 compat_chr,
38 compat_getenv,
39 compat_html_entities,
be4a824d 40 compat_http_client,
8c25f81b 41 compat_parse_qs,
be4a824d 42 compat_socket_create_connection,
8c25f81b
PH
43 compat_str,
44 compat_urllib_error,
45 compat_urllib_parse,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
48 compat_urlparse,
7d4111ed 49 shlex_quote,
8c25f81b 50)
4644ac55
S
51
52
468e2e92
FV
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers. YoutubeDLHandler.http_request() adds each of them
# to an outgoing request unless the request already carries that header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Full English month names, in calendar order (index 0 is January)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
68
69
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or reports a codec that cannot actually encode text,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec; an unknown or broken codec name raises here
        'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (any failure means "fall back"), but unlike a
        # bare except it lets KeyboardInterrupt/SystemExit propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 83
f4bfd65f 84
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is first written to a temporary file created in the same
    directory as fn (named "<basename>.<random>.tmp") and then renamed over
    fn. If anything fails, the temporary file is removed (best effort) and
    the original exception is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        # NOTE: both lambdas ignore their argument and read fn from the
        # enclosing scope; they are only ever called with fn below.
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        # The file must survive close() so that it can be renamed afterwards
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**args)

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except:
        # Clean up the temporary file, then propagate the original error
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
137
138
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression, so only allow
        # characters that cannot break out of the predicate
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        candidates = (
            el for el in node.findall(xpath) if el.attrib.get(key) == val)
        return next(candidates, None)
157
d7e66d39
JMF
158# On python2.6 the xml.etree.ElementTree.Element methods don't support
159# the namespace parameter
5f6a1245
JW
160
161
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand "prefix:tag" steps of path into "{namespace-uri}tag" form.

    ns_map maps each prefix used in path to its namespace URI. Steps
    without a colon are passed through untouched.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
172
d77c3dfd 173
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath below node.

    If no element matches, or the match has no text, return None, or raise
    ExtractorError when fatal is set. name (defaulting to the xpath itself)
    is used in that error message.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None and found.text is not None:
        return found.text
    if not fatal:
        return None
    if name is None:
        name = xpath
    raise ExtractorError('Could not find XML element %s' % name)
186
187
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    attribute = "id"
    return get_element_by_attribute(attribute, id, html)
191
12ea2f30 192
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document

    Only the first matching tag is considered; None is returned when no tag
    carries attribute=value. HTML entities in the content are unescaped.
    """

    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))
    found = re.search(pattern, html)
    if not found:
        return None

    content = found.group('content')
    # Content wrapped in quotes loses its first and last character
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 214
9e6dd238
FV
215
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Line breaks come from <br> tags and paragraph boundaries only, all other
    tags are dropped and entities are unescaped. None is passed through.
    """

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Literal newlines carry no meaning in HTML
    text = html.replace('\n', ' ')
    substitutions = (
        # <br /> and </p><p> become newlines
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip the remaining html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
231
232
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it replaces the
    characters that are forbidden on win32 in the final path component with
    '#' and tries again; if that fails too (or nothing was replaced), the
    exception is raised, like the standard open() function would.

    The filename '-' selects standard output (its binary buffer if present).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be cured by renaming
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # Only the last component is rewritten: the pattern also matches
        # path separators and the drive colon, so applying it to the
        # directory part would point the path somewhere else.
        head, tail = os.path.split(filename)
        alt_tail = re.sub('[/<>:"\\|\\\\?\\*]', '#', tail)
        if alt_tail == tail:
            raise
        alt_filename = os.path.join(head, alt_tail)
        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
d77c3dfd
FV
266
267
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when timestr cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 275
5f6a1245 276
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped outright
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (
                char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(char) for char in s)
    if is_id:
        # IDs are kept as close to the input as possible
        return result

    # Collapse runs of underscores and trim them from both ends
    result = re.sub(r'_+', '_', result).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    # A leading hyphen is replaced by an underscore
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    return result or '_'
d77c3dfd 312
5f6a1245 313
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list holding the first occurrence of every element, in the
    original order. Elements are compared by equality.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 321
912b38b4 322
4e408e47
PH
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities and decimal
    (#65) or hexadecimal (#x41) character references are decoded; anything
    else is returned in its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hexadecimal references need the full hex digit set; matching decimal
    # digits only would decode e.g. '#x2F' as character 2.
    mobj = re.match(r'#([xX][0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in 'xX':
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Not a valid code point; fall through to the literal form
            pass

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
4e408e47
PH
341
342
def unescapeHTML(s):
    """Replace every '&...;' entity in s with the character it stands for.

    None is passed through; any other input must be a compat_str.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def decode_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', decode_entity, s)
d77c3dfd 350
8bf48f23
PH
351
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Whether the result is passed to a subprocess
                          rather than to a file API
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up have Unicode file APIs
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = (
        sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)

    if unicode_windows and not for_subprocess:
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
378
f07b74fc
PH
379
def encodeArgument(s):
    """Encode a command-line argument for use in a subprocess call."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
387
388
8271226a
PH
def decodeOption(optval):
    """Return optval as a compat_str, decoding bytes with the preferred encoding.

    None is passed through unchanged.
    """
    if optval is not None:
        if isinstance(optval, bytes):
            optval = optval.decode(preferredencoding())

        assert isinstance(optval, compat_str)
    return optval
1c256f70 397
5f6a1245 398
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that fits is used. The boundaries are inclusive, so
    exactly one hour renders as '1:00:00' (not '60:00') and exactly one
    minute as '1:00' (not '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
406
a0ddb8a2 407
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler suited to the running Python version.

    params is the options dict handed on to the handler; its
    'nocheckcertificate' entry disables certificate verification.
    Extra keyword arguments are forwarded to YoutubeDLHTTPSHandler.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname is switched off before verify_mode is relaxed
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # No context is passed on these versions
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 431
732ea2f0 432
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is prepended to it.
        """

        # Errors raised while handling a network or availability error are
        # always treated as expected
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors get bug-reporting instructions appended
            if ytdl_is_updateable():
                update_cmd = 'type youtube-dl -U to update'
            else:
                update_cmd = 'see https://yt-dl.org/update on how to update'
            msg += '; please report this issue on https://yt-dl.org/bug .'
            msg += ' Make sure you are using the latest version; %s.' % update_cmd
            msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none."""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
01951dda 466
1c256f70 467
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected ExtractorError for a URL that is not supported; the URL is kept in .url"""

    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url
473
474
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
478
479
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept so that the original exception can still be inspected
        self.exc_info = exc_info
d77c3dfd
FV
492
493
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
d77c3dfd
FV
501
502
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # The message is exposed as .msg; the base initializer is not called
        self.msg = msg
d77c3dfd 512
5f6a1245 513
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
d77c3dfd
FV
517
518
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
d77c3dfd
FV
526
527
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # downloaded: size actually received; expected: size that was announced
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 542
5f6a1245 543
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class and bind it to the configured source address.

    ydl_handler is the handler whose _params dict is consulted for
    'source_address'; the remaining positional and keyword arguments are
    passed to http_class unchanged. is_https selects whether the Python 2.6
    fallback wraps the socket in TLS.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        # Port 0 is paired with the requested address
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address attribute there: replace connect() with a
            # version that creates the socket from the source address itself
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
564
565
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """params is the options dict later read by _create_http_connection."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req through a connection built by _create_http_connection."""
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a zlib-wrapped one second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Create an addinfourl response carrying code, with or without constructor support for it."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to req and honour Youtubedl-no-compression."""
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            # Internal marker header: drop Accept-encoding and the marker itself
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate Content-encoding transparently decoded."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes removed; if none of the
                # attempts decompresses, re-raise the first error
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same request/response processing
    https_request = http_request
    https_response = http_response
bf50b038 657
5de90176 658
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that opens its connections through _create_http_connection."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """params is the options dict; https_conn_class overrides the
        connection class (default: compat_http_client.HTTPSConnection)."""
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req, forwarding the SSL context and hostname check setting when the base class has them."""
        kwargs = {}
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
            req, **kwargs)
be4a824d
PH
674
675
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    delimiter is the separator between the date and the time part.
    timezone, if given, is a datetime.timedelta UTC offset to apply;
    otherwise a trailing 'Z' or '+HH:MM'/'-HHMM' style offset (optionally
    preceded by fractional seconds) is parsed and stripped from date_str.
    None is passed through.
    """

    if date_str is None:
        return None

    if timezone is None:
        # Without any recognised suffix the date is taken as UTC
        timezone = datetime.timedelta()
        tz_match = re.search(
            r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if tz_match:
            # Cut off the fractional seconds and the timezone designator
            date_str = date_str[:-len(tz_match.group(0))]
            if tz_match.group('sign'):
                direction = 1 if tz_match.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=direction * int(tz_match.group('hours')),
                    minutes=direction * int(tz_match.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    utc_dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(utc_dt.timetuple())
700
701
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    day_first selects how purely numeric dates such as 8/7/2009 are read:
    day/month/year when set, month/day/year otherwise.
    Returns None if date_str is None or matches none of the known formats.
    """

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions.extend([
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    # Every format is tried; there is no early exit, so if several formats
    # parse the string, the result of the last one in the list is kept.
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
762
5f6a1245 763
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url.

    The text after the last dot, ignoring everything from the first '?' on,
    is returned if it is purely alphanumeric; otherwise (or when url is
    None) default_ext is returned.
    """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if not re.match(r'^[A-Za-z0-9]+$', candidate):
        return default_ext
    return candidate
73e79f2a 772
5f6a1245 773
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
d4051a8e 776
5f6a1245 777
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD,
    one of the words now, today or yesterday, or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError if date_str matches none of these forms."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword names (days, weeks)
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
5f6a1245
JW
805
806
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
815
5f6a1245 816
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A missing start or end leaves the range open on that side.
        Raises ValueError if start lies after end.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if isinstance(date, datetime.date):
            candidate = date
        else:
            candidate = date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
c496ca96
PH
846
847
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
856
857
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps file descriptors 1 and 2 to the values passed to GetStdHandle
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for invalid handles, non-character file types, and handles
        # for which GetConsoleMode fails
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters at a time, stopping before the next
        # non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # written on its own with a length of 2
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
931
932
def write_string(s, out=None, encoding=None):
    """ Write the text string s to out (sys.stderr by default) and flush it.

    On Windows, when no explicit encoding is requested, the native console
    writer is tried first. Byte-oriented streams receive the encoded text
    (unencodable characters are dropped); other streams receive s as is. """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    try_console = (
        sys.platform == 'win32' and encoding is None and
        hasattr(stream, 'fileno'))
    if try_console and _windows_write_string(s, stream):
        return

    opened_binary = 'b' in getattr(stream, 'mode', '')
    if opened_binary or sys.version_info[0] < 3:
        # Python 2 lies about mode of sys.stderr
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        # Text wrapper around a byte stream: encode ourselves and bypass it
        target_enc = (
            encoding or getattr(stream, 'encoding', None) or
            preferredencoding())
        stream.buffer.write(s.encode(target_enc, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
953
954
48ea9cea
PH
def bytes_to_intlist(bs):
    """ Convert a byte string into the list of its integer byte values """
    if not bs:
        return []
    first = bs[0]
    if isinstance(first, int):  # Python 3: indexing bytes yields integers
        return list(bs)
    # Python 2: indexing yields one-character strings
    return list(map(ord, bs))
962
c257baff 963
def intlist_to_bytes(xs):
    """ Pack a sequence of integers into a byte string, one unsigned byte
    ('B' struct format) per element """
    if xs:
        fmt = '%dB' % len(xs)
        return struct_pack(fmt, *xs)
    return b''
c38b1e77
PH
968
969
c1c9a79c
PH
# Cross-platform file locking
# _lock_file(f, exclusive) / _unlock_file(f) are defined for the current
# platform: LockFileEx/UnlockFileEx on Windows, fcntl.flock elsewhere.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Structure handed to LockFileEx / UnlockFileEx; Offset/OffsetHigh
        # give the start of the locked byte range.
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32

    LockFileEx = kernel32.LockFileEx
    LockFileEx.restype = ctypes.wintypes.BOOL
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]

    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]

    # Low/high DWORDs of the byte count to lock, starting at offset 0
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """ Lock the file object f; raises OSError when locking fails """
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file passes the very
        # same structure back
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(
            handle, flags, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """ Release the lock taken by _lock_file on the file object f """
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(
            handle, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """ Take an exclusive or shared flock on the file object f """
        operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.flock(f, operation)

    def _unlock_file(f):
        """ Drop the flock held on the file object f """
        fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
1033
1034
class locked_file(object):
    """ File wrapper that holds an OS-level lock on the file while it is
    used as a context manager (shared for 'r', exclusive for 'a' and 'w') """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Only pure readers may share the lock
        wants_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, wants_exclusive)
        except IOError:
            # Do not leak the open file when the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # The file is closed even if unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1064
1065
4644ac55
S
def get_filesystem_encoding():
    """ Return the filesystem encoding reported by sys, or 'utf-8' when
    none is reported """
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1069
1070
def shell_quote(args):
    """ Return args as one shell-escaped command line string.

    Byte-string arguments are decoded with the filesystem encoding before
    being quoted; the quoted arguments are joined with single spaces. """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # shlex_quote (imported from .compat, also used by args_to_str) is
        # used instead of the undocumented pipes.quote: the pipes module is
        # deprecated and no longer shipped with newer Python versions.
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1080
1081
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the first element e for
    which pred(e) is false before stopping """
    for item in seq:
        yield item
        if pred(item):
            continue
        return
1089
1090
9d4660ca
PH
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The data travels JSON-encoded in the URL fragment, where
    # unsmuggle_url looks for it
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return url + '#' + sdata
9d4660ca
PH
1097
1098
def unsmuggle_url(smug_url, default=None):
    """ Split a smuggled URL into (url, data).

    URLs without smuggled data are returned as (smug_url, default). """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        raw_json = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
        return url, json.loads(raw_json)
    return smug_url, default
02dbf93f
PH
1106
1107
02dbf93f
PH
def format_bytes(bytes):
    """ Format a byte count as a string with a binary unit suffix.

    bytes may be None (gives 'N/A'), a number, or a str holding a number.
    The value is scaled by the largest power of 1024 not exceeding its
    magnitude and printed with two decimals, e.g. 1536 -> '1.50KiB'. """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    # The logarithm is only defined for positive numbers, so the unit is
    # chosen from the magnitude and the sign is kept in the converted value
    magnitude = abs(bytes)
    if magnitude == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(magnitude, 1024.0))
        # Clamp: fractions must not select a negative index and values
        # beyond YiB must not run off the end of the suffix list
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
f53c966a 1120
1c088fa8 1121
be64b5b0
PH
def parse_filesize(s):
    """ Parse a file size such as '1.5 MiB' into a number of bytes.

    Returns None if s is None or does not start with a number followed by
    a known unit. A comma is accepted as decimal separator. """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary, decimal = 1024 ** power, 1000 ** power
        unit_table[prefix + 'iB'] = binary
        unit_table[prefix + 'B'] = decimal
        unit_table[prefix.lower() + 'B'] = binary
        unit_table[prefix + 'b'] = decimal

    units_re = '|'.join(re.escape(unit) for unit in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if m is None:
        return None

    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
be64b5b0
PH
1174
1175
def get_term_width():
    """ Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable is used when it holds an integer;
    otherwise the second field of the output of `stty size` is used. """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Malformed COLUMNS, fall back to asking stty

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, unexpected output, ...), but
        # unlike a bare except this lets KeyboardInterrupt and SystemExit
        # propagate
        return None
caefb1de
PH
1190
1191
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1199
1200
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """

    # The abbreviation is the first three letters of the full month name
    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
18258362
JMF
1209
1210
def fix_xml_ampersands(xml_str):
    """Replace by '&amp;' every '&' that does not already start one of the
    predefined or short numeric XML entities"""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
e3946f98
PH
1217
1218
def setproctitle(title):
    """ Set the process name via prctl of libc.so.6, if available.

    title must be a text string. Nothing happens when libc.so.6 cannot be
    loaded or does not provide prctl. """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes ctypes allocate one extra
    # byte for the terminating NUL. Sizing it to len(title_bytes) left no
    # room for the terminator of the C string handed to prctl.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # option 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1232
1233
def remove_start(s, start):
    """ Return s without its leading start, or s itself if it does not
    begin with start """
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1238
1239
2b9faf55
PH
def remove_end(s, end):
    """ Return s without its trailing end, or s itself if it does not
    finish with end """
    # An empty suffix must be special-cased: every string ends with '',
    # and s[:-0] is s[:0], i.e. the empty string
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1244
1245
def url_basename(url):
    """ Return the last segment of the path of url, ignoring leading and
    trailing slashes """
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip('/').split('/')
    return segments[-1]
aa94a6d3
PH
1249
1250
class HEADRequest(compat_urllib_request.Request):
    """ Request whose get_method() reports HEAD """

    def get_method(self):
        return 'HEAD'
7217e148
PH
1254
1255
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to an int, or return default when that is not possible.

    If get_attr is given, the attribute of that name is read from v first.
    None, the empty string and values int() rejects give default; any other
    value is converted, multiplied by invscale and floor-divided by scale. """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # Not a number after all; behave like for a missing value instead
        # of raising
        return default
1263
9572013d 1264
40a90862
JMF
def str_or_none(v, default=None):
    """ Return v converted with compat_str, or default if v is None """
    if v is None:
        return default
    return compat_str(v)
1267
9732d77e
PH
1268
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Commas, dots and plus signs are dropped before the conversion
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1275
1276
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to a float, or return default when that is not possible.

    None and values float() rejects give default; any other value is
    converted, multiplied by invscale and divided by scale. """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # Not a number after all; behave like for a missing value instead
        # of raising
        return default
43f775e4
PH
1279
1280
def parse_duration(s):
    """ Parse a duration such as '1:02:03', '3 min' or 'PT1H5M' into seconds.

    Returns None if s is not a string or is not in a recognized format. The
    result is a float when minutes-only, hours-only or fractional seconds
    were given, an int otherwise. """
    if not isinstance(s, compat_basestring):
        return None

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if m is None:
        return None

    # "<number> minutes" / "<number> hours" may carry decimals and stand
    # on their own
    only_mins = m.group('only_mins')
    if only_mins:
        return float_or_none(only_mins, invscale=60)
    only_hours = m.group('only_hours')
    if only_hours:
        return float_or_none(only_hours, invscale=60 * 60)

    # Every other component is an integer weighted by its length in seconds
    seconds_per_unit = (
        ('secs', 1),
        ('mins_reversed', 60),
        ('mins', 60),
        ('hours', 60 * 60),
        ('hours_reversed', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    total = 0
    for group_name, weight in seconds_per_unit:
        value = m.group(group_name)
        if value:
            total += int(value) * weight
    fraction = m.group('ms')
    if fraction:
        total += float(fraction)
    return total
91d7d0b3
JMF
1325
1326
def prepend_extension(filename, ext):
    """ Insert ext in front of the existing extension of filename, e.g.
    ('video.mp4', 'temp') gives 'video.temp.mp4' """
    split = os.path.splitext(filename)
    stem, real_ext = split[0], split[1]
    return '{0}.{1}{2}'.format(stem, ext, real_ext)
d70ad093
PH
1330
1331
def check_executable(exe, args=()):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).

    Returns exe if it could be started, False otherwise. """
    # The default is an immutable tuple rather than a shared list, and any
    # sequence of arguments is accepted
    try:
        subprocess.Popen(
            [exe] + list(args),
            stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1340
1341
def get_exe_version(exe, args=('--version',),
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.

    exe is run with args and its combined stdout/stderr is handed to
    detect_exe_version together with version_re and unrecognized. """
    # The default is an immutable tuple rather than a shared list, and any
    # sequence of arguments is accepted
    try:
        out, _ = subprocess.Popen(
            [exe] + list(args),
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):
        # The pipe delivers bytes; non-ASCII output is dropped
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1355
1356
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """ Extract a version from the text output of a program.

    Returns the first group of version_re (by default the token following
    the word "version") found in output, or unrecognized if there is no
    match. """
    assert isinstance(output, compat_str)
    pattern = version_re
    if pattern is None:
        pattern = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(pattern, output)
    return m.group(1) if m else unrecognized
1366
1367
class PagedList(object):
    """ Base class of the paged lists; subclasses provide getslice() """

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1372
9c44d242
PH
1373
class OnDemandPagedList(PagedList):
    """ Paged list that requests pages from pagefunc one by one until the
    requested slice is complete or a page comes back short """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the list of entries in the range [start, end) """
        pagesize = self._pagesize
        res = []
        for pagenum in itertools.count(start // pagesize):
            firstid = pagenum * pagesize
            nextfirstid = firstid + pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry inside this page
            startv = 0
            if firstid <= start < nextfirstid:
                startv = start % pagesize

            # Offset just past the last wanted entry, when the slice ends
            # inside this page
            endv = None
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % pagesize) + 1

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1415
1416
9c44d242
PH
class InAdvancePagedList(PagedList):
    """ Paged list whose number of pages is known in advance """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the list of entries in the range [start, end) """
        res = []
        start_page = start // self._pagesize
        if end is None:
            end_page = self._pagecount
        else:
            # Never ask pagefunc for pages beyond the known page count,
            # even when end lies past the last entry
            end_page = min(self._pagecount, end // self._pagesize + 1)
        # Entries to drop at the beginning of the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Number of entries still wanted (None means no upper bound)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1444
1445
def uppercase_escape(s):
    # Expand every literal backslash-U escape (followed by 8 hex digits)
    # in s into the character it denotes; the rest of s is left untouched
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
b53466e1 1452
d05cfe06
S
1453
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, text strings are passed to quote() as UTF-8 bytes
    must_encode = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if must_encode:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1459
1460
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # Scheme and network location are kept; the other components are
    # escaped one by one
    escaped_path = escape_rfc3986(parts.path)
    escaped_params = escape_rfc3986(parts.params)
    escaped_query = escape_rfc3986(parts.query)
    escaped_fragment = escape_rfc3986(parts.fragment)
    rebuilt = parts._replace(
        path=escaped_path,
        params=escaped_params,
        query=escaped_query,
        fragment=escaped_fragment)
    return rebuilt.geturl()
1470
# struct_pack / struct_unpack accept a text format string on every
# supported Python version
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    # Text format strings work, so the plain functions can be used
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1487
1488
def read_batch_urls(batch_fd):
    """ Read the URLs of a batch file, one per line, and close the file.

    batch_fd is an iterable of lines (text or UTF-8 bytes). A byte order
    mark and surrounding whitespace are removed; empty lines and lines
    starting with '#', ';' or ']' are skipped. """
    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 byte order mark is U+FEFF once it has been decoded
        # properly, and the three characters \xef\xbb\xbf when its raw
        # bytes were read as Latin-1
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1503
1504
def urlencode_postdata(*args, **kargs):
    """ URL-encode the arguments with urlencode and return the result as
    ASCII bytes """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1507
1508
0990305d
PH
# etree_iter(node) iterates over the elements of an ElementTree node
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1513
1514
bcf89ce6
PH
def parse_xml(s):
    """ Parse the XML text s, ignoring doctype declarations, and return its
    root element """
    etree = xml.etree.ElementTree

    class TreeBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=TreeBuilder())
    # The custom parser is only passed on Python 2.7 and newer
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    tree = etree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None and not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')
    return tree
e68301af
PH
1530
1531
a1a530b0
PH
# Age limit associated with each US rating label; parse_age_limit falls
# back to this table for values that are not plain numbers
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
1539
1540
146c80e2
S
def parse_age_limit(s):
    """ Parse an age limit given as one or two digits with an optional
    trailing '+', or as a US rating; returns None if s is None or unknown """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
146c80e2
S
1546
1547
def strip_jsonp(code):
    """ Return the argument of a JSONP call such as 'cb({...});' with the
    surrounding call removed; other input is returned unchanged """
    jsonp_call = re.compile(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$')
    return jsonp_call.sub(r'\1', code)
478c2c61
PH
1551
1552
e05f6939
PH
def js_to_json(code):
    """ Convert a JavaScript object literal into JSON text.

    Single-quoted strings and bare identifiers are turned into double-quoted
    strings (true, false and null are kept), and a comma directly in front
    of a closing square bracket is removed. """
    # How escapes inside a single-quoted string are carried over into the
    # double-quoted one
    requote = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: requote[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    quoted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), quoted)
1576
1577
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # The position in the list is the quality; unknown ids rank lowest
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1586
acd69589
PH
1587
# Default output filename template, in %-style mapping format
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1589
a020a0dc
PH
1590
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    # The ellipses count towards the allowed length
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
1599
1600
def version_tuple(v):
    """ Split a version string on '.' and '-' into a tuple of integers """
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
48844745
PH
1603
1604
def is_outdated_version(version, limit, assume_new=True):
    """ Return whether version is older than limit.

    When version is empty or one of the versions cannot be parsed, the
    answer is the opposite of assume_new. """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
732ea2f0
PH
1612
1613
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updateable when this module was loaded through zipimport or when the
    # interpreter is marked as frozen
    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
7d4111ed
PH
1619
1620
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
2ccd1b10
PH
1624
1625
c460bdd5
PH
def mimetype2ext(mt):
    """ Return the file extension for the MIME type mt.

    This is the subtype of mt, with a few known subtypes mapped to their
    usual extension. Parameters such as '; charset=utf-8' are ignored.
    Returns None if mt is None (e.g. a missing Content-Type header). """
    if mt is None:
        return None

    # Drop the parameters before looking at the subtype
    mime_type = mt.partition(';')[0].strip()
    _, _, res = mime_type.rpartition('/')

    return {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
    }.get(res, res)
1633
1634
2ccd1b10
PH
def urlhandle_detect_ext(url_handle):
    """ Guess the file extension of a response: from the filename of an
    attachment Content-Disposition header if possible, else from the
    Content-Type header """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(h):
            return url_handle.headers[h]

    cd = getheader('Content-Disposition')
    m = None
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
    if m:
        e = determine_ext(m.group('filename'), default_ext=None)
        if e:
            return e

    return mimetype2ext(getheader('Content-Type'))
05900629
PH
1651
1652
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set by the user, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
1661
1662
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # The UTF-32 marks are listed before the UTF-16 ones, whose bytes
    # they start with
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Without a byte order mark the data is taken to be UTF-8
    encoding, offset = 'utf-8', 0
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, offset = enc, len(bom)
            break
    s = first_bytes[offset:].decode(encoding, 'replace')

    # The result is the match object (or None), to be used as a truth value
    return re.match(r'^\s*<', s)
a055469f
PH
1681
1682
def determine_protocol(info_dict):
    """ Return the protocol of a format: its 'protocol' entry if set, else
    a guess based on the start, the extension or the scheme of its URL """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    if ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
1703
1704
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Every column but the last is left-aligned and padded to the width of
    # its longest cell plus one
    column_widths = []
    for col in zip(*table):
        column_widths.append(max(len(compat_str(v)) for v in col))
    padded = ['%-' + compat_str(width + 1) + 's' for width in column_widths[:-1]]
    format_str = ' '.join(padded) + '%s'
    lines = [format_str % tuple(row) for row in table]
    return '\n'.join(lines)
347de493
PH
1711
1712
def _match_one(filter_part, dct):
    """ Evaluate a single filter expression against the dictionary dct.

    Supported are comparisons ("key OP value", where a '?' after the
    operator makes the test pass for missing keys) and presence tests
    ("key" / "!key"). Raises ValueError for anything else. """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    comparison_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = comparison_rex.search(filter_part)
    if m:
        symbol = m.group('op')
        compare = COMPARISON_OPERATORS[symbol]
        text_value = m.group('strval')
        if text_value is not None:
            if symbol not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % symbol)
            expected = text_value
        else:
            number_text = m.group('intval')
            try:
                expected = int(number_text)
            except ValueError:
                # Not a plain integer: try it as a file size, with and
                # without an implied trailing 'B'
                expected = parse_filesize(number_text)
                if expected is None:
                    expected = parse_filesize(number_text + 'B')
                if expected is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            number_text, filter_part))
        actual = dct.get(m.group('key'))
        if actual is None:
            # Truthy only if the operator was followed by '?'
            return m.group('none_inclusive')
        return compare(actual, expected)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    unary_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = unary_rex.search(filter_part)
    if m:
        check = UNARY_OPERATORS[m.group('op')]
        return check(dct.get(m.group('key')))

    raise ValueError('Invalid filter part %r' % filter_part)
1770
1771
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The parts are separated by '&' and all of them have to pass
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
1777
1778
def match_filter_func(filter_str):
    """ Build a function that checks an info dict against filter_str: it
    returns None if the dict passes and a message explaining the skip
    otherwise """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func