]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[smotri] Improve extraction (Closes #4698)
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
912b38b4 6import calendar
676eb3f2 7import codecs
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96
PH
10import datetime
11import email.utils
f45c185f 12import errno
be4a824d 13import functools
d77c3dfd 14import gzip
b7ab0590 15import itertools
03f9daab 16import io
f4bfd65f 17import json
d77c3dfd 18import locale
02dbf93f 19import math
d77c3dfd 20import os
4eb7f1d1 21import pipes
c496ca96 22import platform
d77c3dfd 23import re
13ebea79 24import ssl
c496ca96 25import socket
b53466e1 26import struct
1c088fa8 27import subprocess
d77c3dfd 28import sys
181c8655 29import tempfile
01951dda 30import traceback
bcf89ce6 31import xml.etree.ElementTree
d77c3dfd 32import zlib
d77c3dfd 33
8c25f81b
PH
34from .compat import (
35 compat_chr,
36 compat_getenv,
37 compat_html_entities,
be4a824d 38 compat_http_client,
8c25f81b 39 compat_parse_qs,
be4a824d 40 compat_socket_create_connection,
8c25f81b
PH
41 compat_str,
42 compat_urllib_error,
43 compat_urllib_parse,
44 compat_urllib_parse_urlparse,
45 compat_urllib_request,
46 compat_urlparse,
7d4111ed 47 shlex_quote,
8c25f81b 48)
4644ac55
S
49
50
468e2e92
FV
# The type of a compiled pattern has no stable public name across the
# supported Python versions, so derive it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Default request headers; YoutubeDLHandler.http_request adds each of these
# to an outgoing request that does not already carry it.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 61
5f6a1245 62
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the lookup
    fails, or the reported encoding cannot actually be used to encode text,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable name raises here.
        'TEST'.encode(pref)
    except Exception:
        # Best-effort fallback. Not a bare except, so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        pref = 'UTF-8'

    return pref
d77c3dfd 76
f4bfd65f 77
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)

    # The temporary file is created next to the target so that the final
    # os.rename stays on the same filesystem.
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        # os.path.basename/dirname return bytes objects here, but
        # NamedTemporaryFile fails on non-ASCII names unless it is given
        # unicode objects, so decode both.
        encoding = get_filesystem_encoding()
        tmp_prefix = os.path.basename(fn).decode(encoding) + '.'
        tmp_dir = os.path.dirname(fn).decode(encoding)
    else:
        tmp_prefix = os.path.basename(fn) + '.'
        tmp_dir = os.path.dirname(fn)

    args = {
        'suffix': '.tmp',
        'prefix': tmp_prefix,
        'dir': tmp_dir,
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args['mode'] = 'w'
        args['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**args)

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except:
        # Whatever went wrong, do not leave the temporary file behind;
        # the original error is re-raised unchanged.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
130
131
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression, so only a
        # conservative character set is accepted.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        return node.find(xpath + "[@%s='%s']" % (key, val))
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element under xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        # No attribute predicates are used here; filter the candidates by hand.
        return next(
            (el for el in node.findall(xpath) if el.attrib.get(key) == val),
            None)
150
d7e66d39
JMF
151# On python2.6 the xml.etree.ElementTree.Element methods don't support
152# the namespace parameter
5f6a1245
JW
153
154
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path into '{namespace-uri}tag' form.

    ns_map maps each prefix to its namespace URI. Steps without a prefix
    are left untouched.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        ns, tag = pieces
        return '{%s}%s' % (ns_map[ns], tag)

    return '/'.join([expand(step) for step in path.split('/')])
165
d77c3dfd 166
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath under node.

    When the element is missing or has no text, returns None, or raises
    ExtractorError if fatal is set. name, if given, replaces the xpath in
    the error message.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None and found.text is not None:
        return found.text

    if not fatal:
        return None
    label = xpath if name is None else name
    raise ExtractorError('Could not find XML element %s' % label)
179
180
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the passed HTML document"""
    # Thin convenience wrapper over the generic attribute lookup.
    return get_element_by_attribute('id', id, html)
184
12ea2f30 185
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document

    Returns None when no tag carries attribute=value. The content is
    returned with HTML entities unescaped.
    """

    # Verbose-mode pattern: an opening tag carrying the requested attribute
    # (with any other attributes before or after it), the content, and the
    # matching closing tag.
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    found = re.search(pattern, html)
    if found is None:
        return None

    content = found.group('content')
    # Content that begins with a quote character has its first and last
    # characters dropped.
    if content.startswith('"') or content.startswith("'"):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 207
9e6dd238
FV
208
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Source newlines carry no meaning in HTML; <br> and paragraph
    # boundaries are what become line breaks.
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Drop every remaining tag, then decode the entities.
    text = re.sub('<.*?>', '', text)
    return unescapeHTML(text).strip()
9e6dd238
FV
224
225
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, the characters that
    win32 forbids in file names are replaced by '#' in the final path
    component and the open is retried once under that name. If the name
    cannot be altered, or the failure was a permission error, the original
    exception is re-raised, like the standard open() function.

    The special name '-' selects standard output (its binary buffer where
    one exists).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A different file name will not cure a permission problem.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # Only the last path component is rewritten: the directory part
        # legitimately contains separators (and possibly a drive colon)
        # that must not be replaced.
        head, tail = os.path.split(filename)
        alt_tail = re.sub('[/<>:"\\|\\\\?\\*]', '#', tail)
        if alt_tail == tail:
            raise

        alt_filename = os.path.join(head, alt_tail)
        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
d77c3dfd
FV
259
260
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 268
5f6a1245 269
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        # Map one input character to its replacement (possibly empty).
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (
                char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    # Handle timestamps: keep 12:34:56 readable as 12_34_56 instead of
    # letting the generic ':' rule apply.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(ch) for ch in s)

    if not is_id:
        # Collapse runs of underscores and trim them from both ends.
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
d77c3dfd 303
5f6a1245 304
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, keeping first-seen order """
    # A list (not a set) holds the seen items so that unhashable elements
    # keep working.
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 312
912b38b4 313
4e408e47
PH
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities and decimal
    ('#38') or hexadecimal ('#x2F') numeric references are decoded;
    anything else is returned in its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hex references must accept the digits a-f as well; matching decimal
    # digits only would decode '#x2F' as character 2.
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside the valid range: fall through and keep
            # the reference as literal text.
            pass

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
4e408e47
PH
332
333
def unescapeHTML(s):
    """Replace every '&...;' entity in s with the character it stands for.

    None is passed through unchanged; any other input must be a compat_str.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def decode_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', decode_entity, s)
d77c3dfd 341
8bf48f23
PH
342
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Set when the name is handed to a subprocess
                          rather than to a file API
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up accept unicode strings directly through the
    # Unicode file APIs.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    has_unicode_api = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if has_unicode_api:
        if not for_subprocess:
            return s
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()

    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
369
f07b74fc
PH
370
def encodeArgument(s):
    """Encode a command-line argument the way encodeFilename does for subprocesses."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # TODO: once all post processors are fixed, replace this decode with
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, for_subprocess=True)
378
379
8271226a
PH
def decodeOption(optval):
    """Return an option value as compat_str, decoding bytes with the preferred encoding.

    None is passed through unchanged.
    """
    if optval is None:
        return optval

    decoded = optval
    if isinstance(decoded, bytes):
        decoded = decoded.decode(preferredencoding())

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 388
5f6a1245 389
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The shortest form that can represent the value is used.
    """
    # Inclusive bounds: exactly one hour is '1:00:00' (not '60:00') and
    # exactly one minute is '1:00' (not '60').
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
397
a0ddb8a2 398
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Build the YoutubeDLHTTPSHandler suited to the running Python version.

    params is the options dictionary; its 'nocheckcertificate' entry
    disables certificate verification. Extra keyword arguments are passed
    on to the handler.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        # We are the client authenticating a server, hence SERVER_AUTH.
        # (CLIENT_AUTH builds a server-side context that does not verify
        # the peer certificate.)
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname must be switched off before verify_mode may
            # be lowered to CERT_NONE.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Try TLSv1 first and fall back to the negotiating
                # SSLv23 method if that fails.
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        return YoutubeDLHTTPSHandler(params, https_conn_class=HTTPSConnectionV3, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 437
732ea2f0 438
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is prepended.
        """

        # Errors raised while handling a network or availability failure
        # are treated as expected.
        known_failures = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in known_failures:
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause

        if not expected:
            # Unexpected errors get bug-report instructions appended.
            if ytdl_is_updateable():
                update_cmd = 'type youtube-dl -U to update'
            else:
                update_cmd = 'see https://yt-dl.org/update on how to update'
            msg += '; please report this issue on https://yt-dl.org/bug .'
            msg += ' Make sure you are using the latest version; %s.' % update_cmd
            msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none."""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
01951dda 472
1c256f70 473
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected extraction error for a URL that is not supported."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
479
480
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
484
485
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
498
499
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
507
508
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Initialise the base class too, so that str(exc) and tracebacks
        # show the message instead of an empty string.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd 518
5f6a1245 519
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
523
524
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
d77c3dfd
FV
532
533
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Give the exception a readable message; without the base-class
        # call str(exc) would be empty.
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 548
5f6a1245 549
c5a59d93 550def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
be4a824d
PH
551 hc = http_class(*args, **kwargs)
552 source_address = ydl_handler._params.get('source_address')
553 if source_address is not None:
554 sa = (source_address, 0)
555 if hasattr(hc, 'source_address'): # Python 2.7+
556 hc.source_address = sa
557 else: # Python 2.6
558 def _hc_connect(self, *args, **kwargs):
559 sock = compat_socket_create_connection(
560 (self.host, self.port), self.timeout, sa)
561 if is_https:
562 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file)
563 else:
564 self.sock = sock
565 hc.connect = functools.partial(_hc_connect, hc)
566
567 return hc
568
569
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        # Connections are created through the shared factory so that the
        # source_address parameter is honoured.
        conn_factory = functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False)
        return self.do_open(conn_factory, req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a zlib-wrapped one second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl carrying code, whether or not the constructor accepts it."""
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        ret = addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Add every standard header the request does not carry yet.
        for name, value in std_headers.items():
            if name in req.headers:
                continue
            req.add_header(name, value)

        # Internal marker header: drop Accept-encoding and the marker itself.
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']

        # Internal marker header: its value becomes the User-agent.
        if 'Youtubedl-user-agent' in req.headers:
            req.headers.pop('User-agent', None)
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; if none of the
                # attempts decodes, report the first error.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same header and decompression treatment.
    https_request = http_request
    https_response = http_response
bf50b038 664
5de90176 665
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that creates its connections through _create_http_connection."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        # Fall back to the stock HTTPSConnection when no class is supplied.
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        conn_factory = functools.partial(
            _create_http_connection, self, self._https_conn_class, True)
        return self.do_open(conn_factory, req)
676
677
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date

    delimiter is the character separating the date from the time.
    Fractional seconds are discarded; a trailing 'Z' or '+hh:mm' / '-hhmm'
    offset is honoured. None is passed through.
    """

    if date_str is None:
        return None

    # Optional fractional seconds followed by 'Z' or a numeric UTC offset.
    tz_match = re.search(
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)

    offset = datetime.timedelta()
    if tz_match:
        # Cut the suffix off so that strptime sees a bare date and time.
        date_str = date_str[:-len(tz_match.group(0))]
        if tz_match.group('sign'):
            direction = 1 if tz_match.group('sign') == '+' else -1
            offset = datetime.timedelta(
                hours=direction * int(tz_match.group('hours')),
                minutes=direction * int(tz_match.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    # Local time minus its offset gives UTC.
    utc_time = datetime.datetime.strptime(date_str, date_format) - offset
    return calendar.timegm(utc_time.timetuple())
701
702
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    day_first selects how ambiguous numeric dates (e.g. 8/7/2009) are read.
    Returns None for None input or when no known format matches.
    """

    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)

    common_formats = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        ambiguous_formats = [
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ]
    else:
        ambiguous_formats = [
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ]

    # Every format is tried; if several match, the last one wins.
    upload_date = None
    for fmt in common_formats + ambiguous_formats:
        try:
            parsed = datetime.datetime.strptime(date_str, fmt)
        except ValueError:
            continue
        upload_date = parsed.strftime('%Y%m%d')

    if upload_date is None:
        # Last resort: RFC 2822 style dates.
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
763
5f6a1245 764
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url; return default_ext when there is no plausible one."""
    if url is None:
        return default_ext

    # Text after the last dot of the URL with its query string removed.
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if not re.match(r'^[A-Za-z0-9]+$', candidate):
        return default_ext
    return candidate
73e79f2a 773
5f6a1245 774
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
d4051a8e 777
5f6a1245 778
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    'now', 'today' and 'yesterday' are accepted on their own. Months and
    years are approximated as 30 and 365 days. A string in none of these
    forms makes strptime raise ValueError."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # timedelta has no month/year units; a bad approximation?
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        # timedelta keyword arguments are plural ('days', 'weeks').
        unit += 's'
        return today + datetime.timedelta(**{unit: amount})

    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
5f6a1245
JW
806
807
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
816
5f6a1245 817
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str

        A missing bound defaults to the earliest / latest representable date.
        """
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Strings are accepted as well and parsed first.
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
847
848
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Decode a bytes result with the preferred encoding.
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
857
858
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps file descriptors 1 and 2 to the ids passed to GetStdHandle.
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    kernel32 = ctypes.windll.kernel32

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(
        ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(
        (b"GetFileType", kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True unless the handle is a character device for which
        # GetConsoleMode succeeds.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        if (GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR:
            return True
        return GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none.
        for pos, ch in enumerate(s):
            if ord(ch) > 0xffff:
                return pos
        return len(s)

    while s:
        # Write at most 1024 characters at a time, stopping before any
        # non-BMP character; such a character is then written on its own
        # with a length of 2.
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
929
930
def write_string(s, out=None, encoding=None):
    """Write the text string s to out (sys.stderr when out is None) and flush.

    On Windows, when no explicit encoding is requested, the console API
    path (_windows_write_string) is tried first.  Otherwise the text is
    encoded (dropping unencodable characters) for byte-oriented streams,
    or written as-is to plain text streams.
    """
    assert type(s) == compat_str
    stream = sys.stderr if out is None else out

    try_console = (
        sys.platform == 'win32' and encoding is None and
        hasattr(stream, 'fileno'))
    if try_console and _windows_write_string(s, stream):
        return

    is_binary = 'b' in getattr(stream, 'mode', '')
    # Python 2 lies about mode of sys.stderr
    if is_binary or sys.version_info[0] < 3:
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        codec = (
            encoding or getattr(stream, 'encoding', None) or
            preferredencoding())
        stream.buffer.write(s.encode(codec, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
951
952
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the byte values of bs as a list of integers ([] when empty)."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Python 2: indexing a byte string yields 1-character strings
        return [ord(ch) for ch in bs]
    # Python 3: iterating bytes already yields integers
    return list(bs)
960
c257baff 961
def intlist_to_bytes(xs):
    """Pack a sequence of integers into a byte string, one unsigned byte each."""
    if xs:
        fmt = '%dB' % len(xs)
        return struct_pack(fmt, *xs)
    return b''
c38b1e77
PH
966
967
c1c9a79c
PH
# Cross-platform file locking
#
# Both branches define the same pair of helpers:
#   _lock_file(f, exclusive) - lock the open file object f
#   _unlock_file(f)          - release a lock taken by _lock_file
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # ctypes layout of the OVERLAPPED structure passed to
    # LockFileEx/UnlockFileEx.
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to (Un)LockFileEx; the same
    # values are used for locking and unlocking.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock f via LockFileEx; raise OSError if the call fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer on the file object so that _unlock_file can hand
        # the very same OVERLAPPED structure to UnlockFileEx.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Unlock f (previously locked with _lock_file) via UnlockFileEx."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared flock() lock on f."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock() lock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
1031
1032
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened in the constructor; entering the context takes the
    lock (shared for mode 'r', exclusive for 'a' and 'w'), leaving it
    releases the lock and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Anything but plain reading needs an exclusive lock
        needs_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, needs_exclusive)
        except IOError:
            # Do not leak the handle when the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Close the file even when unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1062
1063
4644ac55
S
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when it is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1067
1068
def shell_quote(args):
    """Return args joined into a single shell-escaped command line string.

    Byte-string arguments (e.g. filenames produced by 'encodeFilename')
    are decoded with the filesystem encoding before being quoted.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # Use the shlex_quote helper this module already imports (as
        # args_to_str does) instead of the undocumented pipes.quote alias;
        # the pipes module is no longer available in newer Python versions.
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1078
1079
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the first element for
    which pred is false before stopping """
    for item in seq:
        yield item
        if pred(item):
            continue
        # item was the first rejected element; it has been emitted already
        break
1087
1088
9d4660ca
PH
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialised as JSON, url-encoded under the key
    '__youtubedl_smuggle' and appended to url after a '#'. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    encoded = compat_urllib_parse.urlencode(payload)
    return url + '#' + encoded
9d4660ca
PH
1095
1096
def unsmuggle_url(smug_url, default=None):
    """ Split a URL produced by smuggle_url into (url, data).

    URLs without smuggled data are returned unchanged together with
    default. """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, fragment = smug_url.rpartition('#')
        payload = compat_parse_qs(fragment)['__youtubedl_smuggle'][0]
        return url, json.loads(payload)
    return smug_url, default
02dbf93f
PH
1104
1105
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as a human-readable string such as '1.50MiB'.

    Returns 'N/A' for None.  A string argument is converted with float().
    Values of 1024**9 and above are expressed in YiB, and values between
    0 and 1 in B, instead of indexing outside the suffix table.
    Negative values still raise ValueError (from math.log).
    """
    if bytes is None:
        return 'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the suffix table: a negative exponent (0 < bytes < 1)
        # would silently pick 'YiB' and a too large one raise IndexError.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
f53c966a 1118
1c088fa8 1119
be64b5b0
PH
def parse_filesize(s):
    """Parse a size such as '5 MiB' or '1,5KB' into a number of bytes.

    Returns None for None and for strings that do not start with a number
    followed by a known unit.  A comma in the number is treated as the
    decimal separator.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    multipliers = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary, decimal = 1024 ** power, 1000 ** power
        multipliers[prefix + 'iB'] = binary
        multipliers[prefix + 'B'] = decimal
        multipliers[prefix.lower() + 'B'] = binary
        multipliers[prefix + 'b'] = decimal

    unit_alternatives = '|'.join(re.escape(unit) for unit in multipliers)
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % unit_alternatives,
        s)
    if not found:
        return None

    number = float(found.group('num').replace(',', '.'))
    return int(number * multipliers[found.group('unit')])
be64b5b0
PH
1172
1173
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable takes precedence; otherwise the
    width is taken from the output of `stty size`.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        # `stty size` prints "<rows> <columns>"
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, not a tty, unparsable output...),
        # but do not swallow KeyboardInterrupt/SystemExit like a bare
        # except would.
        pass
    return None
caefb1de
PH
1188
1189
def month_by_name(name):
    """ Return the number (1-12) of a month given its (locale-independent)
    English name, or None if the name is not recognized """
    month_names = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December')
    if name in month_names:
        return month_names.index(name) + 1
    return None
18258362
JMF
1200
1201
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in XML by '&amp;'.

    Ampersands that already start one of the predefined entities
    (amp, lt, gt, apos, quot) or a numeric character reference are left
    untouched.
    """
    # Character references need at least one digit and may be longer than
    # four digits (code points go up to 0x10FFFF = 1114111), so allow
    # 1-6 hexadecimal or 1-7 decimal digits.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{1,6};|#[0-9]{1,7};)',
        '&amp;',
        xml_str)
e3946f98
PH
1208
1209
def setproctitle(title):
    """Set the process name to title via libc's prctl(); best effort.

    Silently does nothing when libc.so.6 cannot be loaded or does not
    provide prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one item larger than
    # the data, so the C string is NUL-terminated.  A buffer of exactly
    # len(title_bytes) items would have no terminator for prctl to stop at.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1223
1224
def remove_start(s, start):
    """Return s without the prefix start; s unchanged if it lacks it."""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1229
1230
2b9faf55
PH
def remove_end(s, end):
    """Return s without the suffix end; s unchanged if it lacks it."""
    # An empty suffix must be a no-op: s[:-0] would be the empty string.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1235
1236
def url_basename(url):
    """Return the last component of the path of url."""
    parsed = compat_urlparse.urlparse(url)
    # Leading/trailing slashes are dropped before splitting
    segments = parsed.path.strip('/').split('/')
    return segments[-1]
aa94a6d3
PH
1240
1241
class HEADRequest(compat_urllib_request.Request):
    """Request whose get_method() always reports HEAD."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
1245
1246
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, multiplied by invscale and floor-divided by scale.

    When get_attr is given, the attribute of that name is read from v
    first.  None and the empty string yield default.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1254
9572013d 1255
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1258
9732d77e
PH
1259
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: ',', '.' and '+'
    characters are removed before the conversion """
    if int_str is None:
        return None
    cleaned = re.sub(r'[,\.\+]', '', int_str)
    return int(cleaned)
608d11f5
PH
1266
1267
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, multiplied by invscale and divided by scale.

    None yields default.
    """
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1270
1271
def parse_duration(s):
    """Parse a duration such as '1:02:03', '3 min' or '9.5s' into seconds.

    Returns None when s is not a string or is not recognized.  The result
    is an int unless fractional parts are involved.
    """
    string_type = basestring if sys.version_info < (3, 0) else compat_str
    if not isinstance(s, string_type):
        return None

    m = re.match(
        r'''(?ix)T?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            (?:
                (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if not m:
        return None

    # "N minutes" / "N hours" forms carry a single, possibly fractional value
    only_mins = m.group('only_mins')
    if only_mins:
        return float_or_none(only_mins, invscale=60)
    only_hours = m.group('only_hours')
    if only_hours:
        return float_or_none(only_hours, invscale=60 * 60)

    total = 0
    for group, factor in (('secs', 1), ('mins', 60), ('hours', 60 * 60)):
        value = m.group(group)
        if value:
            total += int(value) * factor
    fraction = m.group('ms')
    if fraction:
        # The group includes the leading dot, e.g. '.5'
        total += float(fraction)
    return total
91d7d0b3
JMF
1306
1307
def prepend_extension(filename, ext):
    """Insert ext before the real extension: ('a.mp4', 'temp') -> 'a.temp.mp4'."""
    stem, original_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, original_ext)
d70ad093
PH
1311
1312
def check_executable(exe, args=[]):
    """ Checks if the given binary can be run (e.g. is installed somewhere
    in PATH) and returns its name, or False if it cannot be started.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1321
1322
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.

    The combined stdout/stderr output of running exe with args is handed
    to detect_exe_version together with version_re and unrecognized. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = process.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):
        # Pipe output arrives as a byte string; drop non-ASCII bytes
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1336
1337
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from the output of an executable.

    version_re must contain one group capturing the version; by default
    the token following the word "version" is used.  Returns unrecognized
    when the pattern is not found.
    """
    assert isinstance(output, compat_str)
    pattern = (
        r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re)
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1347
1348
class PagedList(object):
    """Base class for paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1353
9c44d242
PH
1354
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages one by one via pagefunc(pagenum).

    pagefunc is called with a page number and must return an iterable of
    that page's entries; pagesize is the number of entries on a full page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list.

        With end=None, pages are fetched until a page that is not full
        is encountered.
        """
        res = []
        # Start with the page that contains index `start`
        for pagenum in itertools.count(start // self._pagesize):
            # Index range [firstid, nextfirstid) covered by this page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of `start` within this page (0 unless start is on it)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset of `end` within this page (None unless end is on it)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1396
1397
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list whose number of pages is known in advance.

    pagefunc is called with a page number and must return an iterable of
    that page's entries; pagecount is the total number of pages and
    pagesize the number of entries on a full page.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list."""
        res = []
        start_page = start // self._pagesize
        # Never request pages beyond the known page count, even when
        # `end` lies past the last entry.
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        # Entries to drop from the front of the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Number of entries still wanted (None means "all remaining")
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the requested slice
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1425
1426
def uppercase_escape(s):
    r"""Replace every \UXXXXXXXX escape sequence in s by its character."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def _decode(match):
        # The decoder returns a (text, consumed length) pair
        text, _ = decode_escape(match.group(0))
        return text

    return re.sub(r'\\U[0-9a-fA-F]{8}', _decode, s)
b53466e1 1433
d05cfe06
S
1434
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    is_py2_unicode = sys.version_info < (3, 0) and isinstance(s, unicode)
    if is_py2_unicode:
        # Python 2 text strings are converted to UTF-8 bytes before quoting
        s = s.encode('utf-8')
    # The second argument lists the characters that are left unquoted
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1440
1441
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Only the path, params, query and fragment components are escaped."""
    parts = compat_urllib_parse_urlparse(url)
    escaped = {}
    for field in ('path', 'params', 'query', 'fragment'):
        escaped[field] = escape_rfc3986(getattr(parts, field))
    return parts._replace(**escaped).geturl()
1451
# struct_pack / struct_unpack: wrappers around struct.pack / struct.unpack
# that accept a text format string on every supported Python version.
# Probe whether struct accepts the string literals used in this module.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # struct handles the format strings natively; use it directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1468
1469
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file, one per line, and close batch_fd.

    Byte lines are decoded as UTF-8, a leading byte order mark and
    surrounding whitespace are removed, and empty lines as well as lines
    starting with '#', ';' or ']' are skipped.
    """
    # A UTF-8 BOM shows up as U+FEFF when the line was decoded correctly
    # (including by the decode below), and as the three characters
    # '\xef\xbb\xbf' when the bytes were decoded as Latin-1.
    boms = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in boms:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1484
1485
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode, returning ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1488
1489
0990305d
PH
# etree_iter(node): iterate over the elements of an ElementTree node.
# Uses Element.iter where available, otherwise a findall-based fallback.
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
1494
1495
bcf89ce6
PH
def parse_xml(s):
    """Parse the XML text s and return the root element.

    Doctype declarations are ignored.  On Python 2, element texts that
    come back as byte strings are decoded as UTF-8.
    """
    etree = xml.etree.ElementTree

    class _DoctypeIgnoringBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=_DoctypeIgnoringBuilder())
    # The parser argument is only passed on Python 2.7 and newer
    extra = {}
    if sys.version_info >= (2, 7):
        extra['parser'] = parser
    tree = etree.XML(s.encode('utf-8'), **extra)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            if node.text is None:
                continue
            if not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return tree
e68301af
PH
1511
1512
a1a530b0
PH
# Age limits for US rating labels; parse_age_limit falls back to this
# table for strings that are not plain numeric ages.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
1520
1521
146c80e2
S
def parse_age_limit(s):
    """Parse an age limit such as '18', '16+' or a US_RATINGS label.

    Returns the age as an int, or None for None and unknown strings.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
146c80e2
S
1527
1528
def strip_jsonp(code):
    """Strip a JSONP wrapper "callback(...);" and return the inner payload.

    Text that does not look like a JSONP call is returned unchanged.
    """
    jsonp_call = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_call, r'\1', code)
478c2c61
PH
1532
1533
e05f6939
PH
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    Single-quoted strings and bare identifiers are turned into
    double-quoted strings (true, false and null are kept), and a comma
    directly before a closing ']' is removed.
    """
    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null') or token.startswith('"'):
            # Literals and double-quoted strings are valid JSON already
            return token
        if token.startswith("'"):
            # Re-escape the body of a single-quoted string for double quotes
            replacements = {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: replacements[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    quoted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), quoted)
1557
1558
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its position in
    quality_ids, or to -1 when the id is not listed. """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1567
acd69589
PH
1568
# %-style mapping template built from the 'title', 'id' and 'ext' fields;
# presumably the output filename template used when none is configured
# (NOTE(review): confirm against the callers).
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1570
a020a0dc
PH
1571
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than length are cut so that the result, including the
    trailing '...', is length characters long; None is passed through. """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
1580
1581
def version_tuple(v):
    """Split a version string on '.' and '-' into a tuple of ints."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
48844745
PH
1584
1585
def is_outdated_version(version, limit, assume_new=True):
    """Return whether version is older than limit.

    A missing or unparsable version counts as up to date when assume_new
    is true and as outdated otherwise.
    """
    when_unknown = not assume_new
    if not version:
        return when_unknown
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return when_unknown
732ea2f0
PH
1593
1594
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U

    That is the case when this module was loaded by a zipimporter or when
    sys has a 'frozen' attribute. """
    from zipimport import zipimporter

    loader = globals().get('__loader__')
    if isinstance(loader, zipimporter):
        return True
    return hasattr(sys, 'frozen')
7d4111ed
PH
1600
1601
def args_to_str(args):
    """Return a short, shell-quoted string for a subprocess command."""
    quoted = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted)
2ccd1b10
PH
1605
1606
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the Content-Type header of a response.

    Returns the subtype of the media type, e.g. 'mp4' for 'video/mp4'.
    Parameters such as '; charset=utf-8' are ignored.
    """
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    # Drop media type parameters before taking the subtype, otherwise
    # they would end up in the returned extension.
    mimetype = getheader('Content-Type').split(';')[0].strip()
    return mimetype.split("/")[1]
05900629
PH
1615
1616
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # Nothing is blocked when no limit is set, or when the content is
    # available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit