]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[YoutubeDL/utils] Clarify rationale for URL escaping in comment, move escape routines...
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
912b38b4 4import calendar
676eb3f2 5import codecs
62e609ab 6import contextlib
e3946f98 7import ctypes
c496ca96
PH
8import datetime
9import email.utils
f45c185f 10import errno
e68301af 11import getpass
d77c3dfd 12import gzip
b7ab0590 13import itertools
03f9daab 14import io
f4bfd65f 15import json
d77c3dfd 16import locale
02dbf93f 17import math
d77c3dfd 18import os
4eb7f1d1 19import pipes
c496ca96 20import platform
d77c3dfd 21import re
13ebea79 22import ssl
c496ca96 23import socket
b53466e1 24import struct
1c088fa8 25import subprocess
d77c3dfd 26import sys
181c8655 27import tempfile
01951dda 28import traceback
bcf89ce6 29import xml.etree.ElementTree
d77c3dfd 30import zlib
d77c3dfd 31
01ba00ca 32try:
59ae15a5 33 import urllib.request as compat_urllib_request
01ba00ca 34except ImportError: # Python 2
59ae15a5 35 import urllib2 as compat_urllib_request
01ba00ca
PH
36
37try:
59ae15a5 38 import urllib.error as compat_urllib_error
01ba00ca 39except ImportError: # Python 2
59ae15a5 40 import urllib2 as compat_urllib_error
01ba00ca
PH
41
42try:
59ae15a5 43 import urllib.parse as compat_urllib_parse
01ba00ca 44except ImportError: # Python 2
59ae15a5 45 import urllib as compat_urllib_parse
01ba00ca 46
799c0763
PH
47try:
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
51
6543f0dc
JMF
52try:
53 import urllib.parse as compat_urlparse
54except ImportError: # Python 2
55 import urlparse as compat_urlparse
56
01ba00ca 57try:
59ae15a5 58 import http.cookiejar as compat_cookiejar
01ba00ca 59except ImportError: # Python 2
59ae15a5 60 import cookielib as compat_cookiejar
01ba00ca 61
3e669f36 62try:
59ae15a5 63 import html.entities as compat_html_entities
9f37a959 64except ImportError: # Python 2
59ae15a5 65 import htmlentitydefs as compat_html_entities
3e669f36 66
a8156c1d 67try:
59ae15a5 68 import html.parser as compat_html_parser
9f37a959 69except ImportError: # Python 2
59ae15a5 70 import HTMLParser as compat_html_parser
a8156c1d 71
348d0a7a 72try:
59ae15a5 73 import http.client as compat_http_client
9f37a959 74except ImportError: # Python 2
59ae15a5 75 import httplib as compat_http_client
348d0a7a 76
2eabb802 77try:
0e283428 78 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
79except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
81
e0df6211
PH
82try:
83 from urllib.request import urlretrieve as compat_urlretrieve
84except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
86
87
5910e210
PH
88try:
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
91except ImportError:
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
93
try:
    from urllib.parse import unquote as compat_urllib_parse_unquote
except ImportError:
    # Fallback implementation used when urllib.parse cannot be imported.
    def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string by the text they encode.

        Consecutive escapes are accumulated as raw bytes and decoded in one
        go with encoding/errors, so that a multi-byte character spread over
        several escapes is decoded as a unit. encoding and errors fall back
        to 'utf-8' and 'replace' when passed as None.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' in the input, nothing to unquote
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    # '%' directly followed by another '%' or by the end of
                    # the input: handled as a literal '%' below
                    raise ValueError
                # NOTE(review): relies on the 'hex' codec of byte strings;
                # confirm which exception it raises for non-hex input.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string
131
f1f725c6
PH
132
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Split the query string qs into a list of (name, value) pairs.

        Fields are separated by '&' or ';'. '+' is turned into a space and
        %xx escapes are resolved with compat_urllib_parse_unquote.
        Fields with an empty value are dropped unless keep_blank_values is
        set. With strict_parsing, a field without '=' raises ValueError
        instead of being skipped.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = compat_urllib_parse_unquote(
                    name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = compat_urllib_parse_unquote(
                    value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse the query string qs into a dict mapping each name to the
        list of its values, in order of appearance.

        All arguments are passed through to _parse_qsl.
        """
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
348d0a7a 179
# compat_str: the text string type (unicode where that builtin exists,
# str otherwise)
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# compat_chr: turns a code point into a one-character text string (unichr
# where that builtin exists, chr otherwise)
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
3e669f36 189
f7300c5c
JMF
190try:
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
194
8d31fa3c
PH
try:
    from shlex import quote as shlex_quote
except ImportError:  # Python < 3.3
    def shlex_quote(s):
        """Wrap s in single quotes so that a shell reads it as one word."""
        # A single quote cannot appear inside a single-quoted word, so close
        # the quotes, emit a double-quoted "'" and open them again.
        escaped = s.replace("'", "'\"'\"'")
        return "'" + escaped + "'"
200
201
b31756c1
FV
def compat_ord(c):
    """Return the ordinal of c; plain ints are handed back untouched.

    Indexing a byte string yields an int on Python 3 and a one-character
    string on Python 2; this accepts both.
    """
    return c if type(c) is int else ord(c)
205
468e2e92
FV
# This is not clearly defined otherwise
# (the type of compiled pattern objects, taken from an actual compiled pattern)
compiled_regex_type = type(re.compile(''))
208
# Default HTTP headers. YoutubeDLHandler.http_request adds each of them to
# outgoing requests.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 216
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or the encoding it names cannot actually be used to
    encode text, 'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: the locale may name an encoding Python lacks
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (any locale/codec failure), but no longer
        # swallows KeyboardInterrupt or SystemExit like a bare except did.
        pref = 'UTF-8'

    return pref
d77c3dfd 230
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string s (byte strings are rejected)."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print the text string s, encoded for the local terminal.

        Characters the preferred encoding cannot represent are written as
        XML character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
d77c3dfd 238
f4bfd65f 239
181c8655
PH
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically

    The JSON is first written to a temporary file created next to fn
    (same directory, name "<basename of fn>.<random>.tmp") and then moved
    over fn, so readers never observe a half-written file. If anything
    fails, the temporary file is removed and the exception is re-raised.
    """

    args = {
        'suffix': '.tmp',
        'prefix': os.path.basename(fn) + '.',
        'dir': os.path.dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**args)

    try:
        with tf:
            json.dump(obj, tf)
        if hasattr(os, 'replace'):
            # Overwrites an existing fn on every platform
            os.replace(tf.name, fn)
        else:
            if sys.platform == 'win32':
                # os.rename refuses to overwrite on Windows, so the old file
                # has to go first (this step is not atomic).
                try:
                    os.unlink(fn)
                except OSError:
                    pass
            os.rename(tf.name, fn)
    except:
        # Clean up on any failure (including interrupts); re-raised below
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
272
273
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """Return the first element matching xpath whose attribute key
        equals val, or None. Filters by hand instead of using an attribute
        predicate."""
        candidates = (el for el in node.findall(xpath)
                      if el.attrib.get(key) == val)
        return next(candidates, None)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are spliced into the expression, so only accept
        # characters that cannot break out of the predicate
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        return node.find(xpath + u"[@%s='%s']" % (key, val))
287
d7e66d39
JMF
288# On python2.6 the xml.etree.ElementTree.Element methods don't support
289# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand "prefix:tag" steps of path into "{namespace}tag".

    ns_map maps each prefix to its namespace. Steps without a colon are
    kept unchanged; an unknown prefix raises KeyError.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
300
d77c3dfd 301
# Replace the module-level locatestarttagend pattern of compat_html_parser
# with the one below, so every parser in this file uses it.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that keeps the document it was given in self.html."""

    def __init__(self):
        # This used to be spelled "__init", which made it a name-mangled
        # private method that was never called, leaving self.html unset
        # until loads() ran.
        compat_html_parser.HTMLParser.__init__(self)
        # Text passed to loads(); None until then
        self.html = None

    def loads(self, html):
        """Remember html, feed it to the parser and close the parser."""
        self.html = html
        self.feed(html)
        self.close()
312
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        # Attribute name/value a start tag must carry to be selected
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start position, end position] once fully recorded
        self.result = None
        # True from the selected start tag until its matching end tag
        self.started = False
        # Per tag name: start tags seen minus end tags seen while started
        self.depth = {}
        # True while the position right after the selected start tag is
        # still to be recorded
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by dropping the input up to the
        current line and resuming.

        Raises HTMLParseError after more than 10 errors, or when the error
        occurs inside the selected element.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        """Select the tag if it carries the wanted attribute value and
        track nesting while inside the selected element."""
        attrs = dict(attrs)
        if self.started:
            # A child start tag also marks the end of the selected start tag
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        """Record the end position once the selected tag's depth is back
        to zero."""
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any kind of content following the selected start tag fixes the start
    # position
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the text between the recorded start and end positions,
        stripped, or None if the element was not found or is incomplete."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Positions are (line, column) pairs; the -1 suggests that line
        # numbers are 1-based
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end on the same line: the end column has to be made
            # relative to the already cut-off start
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
3b024e17
PH
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    # Step over the literal text "</scr'+'ipt>" instead of handing it to the
    # regular end tag parser; everything else is parsed as usual.
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
9e6dd238
FV
378
def get_element_by_id(id, html):
    """Return the content of the tag whose "id" attribute equals id in the
    passed HTML document (see get_element_by_attribute)."""
    return get_element_by_attribute(attribute="id", value=id, html=html)
382
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag in the passed HTML document whose
    given attribute has the given value, or None if nothing was found."""
    attr_parser = AttrParser(attribute, value)
    try:
        attr_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was recorded before the parser gave up
        pass
    return attr_parser.get_result()
9e6dd238 391
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    HTMLParser that picks up the content attribute of the <meta> tag whose
    name attribute equals the requested name. If several tags match, the
    last one wins.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attributes = dict(attrs)
            if attributes.get('name') == self.name:
                self.result = attributes.get('content')

    def get_result(self):
        """Return the content found so far (None if there was no match)."""
        return self.result
412
def get_meta_content(name, html):
    """
    Return the content attribute of the meta tag in html whose name
    attribute equals name, or None if there is no such tag.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was found before the parser gave up
        pass
    return meta_parser.get_result()
423
9e6dd238
FV
424
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Line breaks in the markup are treated as spaces; <br> tags and
    paragraph boundaries become newlines. All remaining tags are removed,
    HTML entities are resolved and surrounding whitespace is stripped.
    None is passed through, so optional fields can be cleaned without a
    check at every call site.
    """
    if html is None:
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
9e6dd238
FV
436
437
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The filename u'-' stands for standard output (its binary buffer where
    there is one). Permission errors (EACCES) are re-raised right away,
    since renaming cannot help there.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # Only the file name itself is rewritten: applying the substitution
        # to the directory part would also replace its path separators (and
        # a drive colon) and thereby point to a different directory.
        directory, basename = os.path.split(filename)
        alt_filename = os.path.join(
            directory, re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', basename))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
471
472
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None if timestr cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 480
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped entirely
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        # Restricted mode: no shell metacharacters, whitespace or non-ASCII
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    result = u''.join(replace_insane(char) for char in s)
    if is_id:
        # IDs are only cleaned character by character, never shortened
        return result

    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
d77c3dfd
FV
512
def orderedSet(iterable):
    """ Return the elements of iterable as a list, without duplicates and
    in order of first appearance. Elements are compared by equality, so
    they do not have to be hashable. """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 520
912b38b4 521
4e408e47
PH
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities as well as
    decimal ("#47") and hexadecimal ("#x2F") character references are
    supported. Anything else, including references to code points that
    cannot be represented, is returned in its literal form "&entity;".
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # The whole entity has to be a number: hex references may contain a-f,
    # and a partial match (e.g. "#x2" out of "#x2F") must not be accepted.
    mobj = re.match(r'#([xX][0-9a-fA-F]+|[0-9]+)\Z', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Not a valid code point; fall through to the literal form
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
540
541
def unescapeHTML(s):
    """Replace every "&...;" entity in the text string s by the character
    it stands for. None is passed through."""
    if s is None:
        return None
    assert type(s) == compat_str

    def resolve(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', resolve, s)
d77c3dfd 549
8bf48f23
PH
550
def encodeFilename(s, for_subprocess=False):
    """
    Return the file name s in the form expected by the file APIs (or, with
    for_subprocess, by subprocess calls) of the running interpreter.

    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
577
f07b74fc
PH
578
def encodeArgument(s):
    """Encode the command line argument s for a subprocess call."""
    # Legacy code still passes byte strings; those are taken to be ASCII.
    # Once all post processors are fixed, this should become an assertion
    # that s is a compat_str.
    text = s if isinstance(s, compat_str) else s.decode('ascii')
    return encodeFilename(text, True)
586
587
8271226a
PH
def decodeOption(optval):
    """Return the option value optval as a text string.

    Byte strings are decoded with the preferred encoding; None is passed
    through.
    """
    if optval is not None:
        if isinstance(optval, bytes):
            optval = optval.decode(preferredencoding())
        assert isinstance(optval, compat_str)
    return optval
1c256f70 596
4539dd30
PH
def formatSeconds(secs):
    """Format a duration given in seconds as H:MM:SS, M:SS or S.

    The shortest form that fits is used: a full hour is written as
    '1:00:00' and a full minute as '1:00' (they used to come out as
    '60:00' and '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
604
a0ddb8a2
PH
605
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Create the HTTPS handler that suits the running Python version.

    opts_no_check_certificate: if true, server certificates are not
        verified (on interpreters where verification is available).
    kwargs: passed on to the handler's constructor.

    Returns an instance of compat_urllib_request.HTTPSHandler or of a
    subclass of it.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Try TLSv1 first and fall back to protocol negotiation
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
        # SERVER_AUTH is the purpose for client-side sockets: the context
        # verifies the *server's* certificate and host name. (CLIENT_AUTH
        # creates a server-side context that verifies nothing, which made
        # this branch skip certificate checks altogether.)
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
        if opts_no_check_certificate:
            # check_hostname has to be switched off before CERT_NONE can
            # be set
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
ea6d901e 645
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        video_id, if given, is put in front of the message.
        """
        active_exc = sys.exc_info()
        # Raised while handling one of these: not a bug in youtube-dl
        expected_causes = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if active_exc[0] in expected_causes:
            expected = True

        text = msg if video_id is None else video_id + ': ' + msg
        if not expected:
            text = text + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(text)

        self.traceback = tb
        self.exc_info = active_exc  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback as text, or None without one."""
        if self.traceback is not None:
            return u''.join(traceback.format_tb(self.traceback))
        return None
670
1c256f70 671
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Extraction error signalling that a regular expression did not match."""
675
676
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
688
689
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
697
698
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to indicate
    an error in the postprocessing task.
    """
    def __init__(self, msg):
        # The message is kept on the instance for handlers to read
        self.msg = msg
d77c3dfd
FV
707
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
d77c3dfd
FV
711
712
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
720
721
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Sizes in bytes: what was actually received ...
    downloaded = None
    # ... and what the server had announced
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 736
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Likewise, the header "Youtubedl-user-agent" replaces the User-Agent of
    a request and is removed before the request is sent.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress data, first as a raw deflate stream and, if that
        fails, as a zlib stream."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response around stream, setting the status
        code by hand where the constructor does not take it."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to req and resolve the internal
        Youtubedl-* headers. Returns req."""
        for h, v in std_headers.items():
            # Request.add_header stores header names capitalized
            # ("User-agent"), see http://bugs.python.org/issue2275 .
            # Compare in that form, otherwise the standard value would
            # replace a header the caller has set explicitly.
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        """Return resp, or a decompressed stand-in for it if the server
        sent the body gzipped or deflated."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes cut off
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same treatment
    https_request = http_request
    https_response = http_response
bf50b038 816
5de90176 817
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date

    date_str has the form YYYY-MM-DD<delimiter>HH:MM:SS, optionally
    followed by "Z" or a UTC offset like +01:00 / -0500. Without an
    offset the time is taken to be UTC. None is passed through.
    """
    if date_str is None:
        return None

    tz_match = re.search(
        r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
        date_str)
    offset = datetime.timedelta()
    if tz_match:
        # Cut the time zone designator off; strptime gets the rest
        date_str = date_str[:-len(tz_match.group(0))]
        if tz_match.group('sign'):
            direction = 1 if tz_match.group('sign') == '+' else -1
            offset = datetime.timedelta(
                hours=direction * int(tz_match.group('hours')),
                minutes=direction * int(tz_match.group('minutes')))

    local_time = datetime.datetime.strptime(
        date_str, '%Y-%m-%d{0}%H:%M:%S'.format(delimiter))
    return calendar.timegm((local_time - offset).timetuple())
841
842
bf50b038
JMF
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    A number of common date formats is tried, then RFC 2822 parsing.
    Returns None if date_str is None or cannot be interpreted.
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    cleaned = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', cleaned)
    known_formats = (
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    )
    result = None
    # Every format is tried; if several fit, the last one wins
    for fmt in known_formats:
        try:
            result = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue
    if result is None:
        fields = email.utils.parsedate_tz(cleaned)
        if fields:
            result = datetime.datetime(*fields[:6]).strftime('%Y%m%d')
    return result
888
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url.

    The extension is whatever follows the last dot before the query
    string. default_ext is returned if url is None or if that part is not
    purely alphanumeric.
    """
    if url is None:
        return default_ext
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
73e79f2a 897
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Return the subtitle file name for filename: its last extension is
    replaced by "<sub_lang>.<sub_format>"."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
900
def date_from_str(date_str):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Relative dates are based on today's date. Months count as 30 days and
    years as 365 days. Raises ValueError for strings in neither format."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    # Raw string: "\d" is not a valid escape in an ordinary string literal
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keywords (days=..., weeks=...)
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
926
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.
    Strings in any other format are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
935
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a bound left as None is replaced by the earliest /
        latest representable date. Raises ValueError if start > end."""
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a datetime.date or a date string) is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
c496ca96
PH
961
962
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Python 2 hands back a byte string; decode it with the locale encoding.
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
971
972
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    Writes s directly to the Windows console with WriteConsoleW when out
    is backed by file descriptor 1 or 2 and that descriptor is a real
    console. Raises OSError if WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps a file descriptor to the Win32 standard-handle id passed to
    # GetStdHandle (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE).
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    # NOTE(review): the handle argument is declared as DWORD here while the
    # other prototypes use HANDLE - confirm this is safe on 64-bit Windows.
    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True when the handle is invalid, is not a character device, or
        # GetConsoleMode fails on it (e.g. output redirected to a file).
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual
        # Plane, or len(s) if there is none.
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters at a time, stopping before the next
        # non-BMP character; count == 0 means s starts with such a character.
        count = min(next_nonbmp_pos(s), 1024)

        # A leading non-BMP character is written as 2 UTF-16 code units
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue with whatever the console did not accept yet
            s = s[written.value:]
    return True
1043
1044
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On Windows, when no explicit encoding is requested, the console API is
    tried first. Otherwise the text is encoded (ignoring unencodable
    characters) for byte-oriented streams, or written as-is when the
    stream neither is binary nor exposes an underlying buffer.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    try_console = (
        sys.platform == 'win32' and encoding is None
        and hasattr(out, 'fileno'))
    if try_console and _windows_write_string(s, out):
        return

    is_python2 = sys.version_info[0] < 3  # Python 2 lies about mode of sys.stderr
    if 'b' in getattr(out, 'mode', '') or is_python2:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream wrapping a byte stream: encode ourselves so that
        # unencodable characters are dropped instead of raising.
        target_encoding = (
            encoding or getattr(out, 'encoding', None) or preferredencoding())
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1065
1066
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Python 2: indexing a byte string yields 1-character strings
        return [ord(ch) for ch in bs]
    # Python 3: iterating bytes already yields ints
    return list(bs)
1074
c257baff 1075
def intlist_to_bytes(xs):
    """Convert a list of integer byte values into a byte string."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):  # Python 3
        return bytes(xs)
    # Python 2: chr() yields single-byte strings
    return ''.join([chr(value) for value in xs])
c38b1e77
PH
1083
1084
c1c9a79c
PH
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) using LockFileEx /
# UnlockFileEx on Windows and fcntl.flock everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # ctypes layout of the structure passed to LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to (Un)LockFileEx; chosen
    # large enough to cover the whole file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the file object f; raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so _unlock_file can pass the same
        # structure back to UnlockFileEx.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release a lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared flock() lock on the file object f."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock() lock held on the file object f."""
        fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
1148
1149
class locked_file(object):
    """File wrapper that holds a file lock while used as a context manager.

    The file is opened on construction; the lock is taken in __enter__
    (shared for mode 'r', exclusive otherwise) and released, together with
    the file itself, in __exit__.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Only pure readers may share the lock
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the open file when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1179
1180
def shell_quote(args):
    """Return a single shell-escaped command line built from args.

    args is an iterable of text or byte strings; byte strings (as produced
    by encodeFilename) are decoded with the file system encoding first.
    The quoted arguments are joined with single spaces.
    """
    # pipes.quote is an undocumented alias of shlex.quote, and the pipes
    # module is deprecated and removed in Python 3.13; prefer shlex and
    # fall back to pipes only where shlex.quote does not exist (Python 2).
    try:
        from shlex import quote as quote_arg
    except ImportError:
        from pipes import quote as quote_arg

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote_arg(a))
    return u' '.join(quoted_args)
9d4660ca
PH
1192
1193
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yields the element that ends
    the run (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1201
1202
9d4660ca
PH
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-serialized, URL-encoded under the key
    '__youtubedl_smuggle' and appended to url as its fragment. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    return url + u'#' + compat_urllib_parse.urlencode(payload)
1209
1210
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    URLs without smuggled data are returned unchanged together with
    default.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, encoded = smug_url.rpartition(u'#')
    payload = compat_parse_qs(encoded)[u'__youtubedl_smuggle'][0]
    return url, json.loads(payload)
02dbf93f
PH
1218
1219
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as a human-readable string with a binary suffix.

    bytes may be a number, a numeric str, or None (which yields u'N/A').
    The result has two decimals, e.g. u'1.50MiB'.
    """
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the available suffixes: huge values would otherwise
        # raise IndexError, and values below 1/1024 would produce a
        # negative index and wrongly pick the largest suffix.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
f53c966a 1232
1c088fa8 1233
1c088fa8
PH
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable is consulted first; if it is unset,
    empty or not a number, the output of `stty size` is used instead.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Garbage in COLUMNS; fall back to asking stty

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, no tty, unexpected output), but
        # do not swallow KeyboardInterrupt/SystemExit as a bare except would.
        pass
    return None
caefb1de
PH
1248
1249
def month_by_name(name):
    """ Return the number (1-12) of a month given its full English name,
    independently of the current locale; None for unknown names """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name not in ENGLISH_NAMES:
        return None
    return ENGLISH_NAMES.index(name) + 1
18258362
JMF
1260
1261
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' by '&amp;' in XML.

    Ampersands that already start one of the predefined entities
    (amp, lt, gt, apos, quot) or a numeric character reference are left
    untouched.
    """
    # Character references may name any code point up to U+10FFFF, i.e. up
    # to 6 hexadecimal or 7 decimal digits; a limit of 4 digits would
    # double-escape valid references to characters outside the BMP.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,6};|#[0-9]{,7};)',
        u'&amp;',
        xml_str)
e3946f98
PH
1268
1269
def setproctitle(title):
    """Set the process name via prctl(); silently do nothing if impossible.

    title must be a compat_str. The call is skipped when libc.so.6 cannot
    be loaded or does not provide prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialise the buffer from the bytes themselves so that ctypes
    # appends a terminating NUL; a buffer sized exactly len(title_bytes)
    # has no terminator and prctl could read past its end.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1283
1284
def remove_start(s, start):
    """Return s without the prefix start; s unchanged if it lacks the prefix."""
    if not s.startswith(start):
        return s
    return s[len(start):]
29eb5174
PH
1289
1290
2b9faf55
PH
def remove_end(s, end):
    """Return s without the suffix end; s unchanged if it lacks the suffix."""
    # An empty suffix must be a no-op: s[:-0] is s[:0], i.e. the empty
    # string, which would throw the whole input away.
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1295
1296
def url_basename(url):
    """Return the last non-empty segment of the path component of url."""
    segments = compat_urlparse.urlparse(url).path.strip(u'/').split(u'/')
    return segments[-1]
aa94a6d3
PH
1300
1301
class HEADRequest(compat_urllib_request.Request):
    """Request that reports HEAD as its HTTP method."""

    def get_method(self):
        """Return the HTTP method name for this request."""
        return "HEAD"
7217e148
PH
1305
1306
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, mapping None and '' to default.

    If get_attr is given, the attribute of that name is read from v first
    (a missing attribute counts as None). The integer is multiplied by
    invscale and then floor-divided by scale.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    return int(v) * invscale // scale
1314
9572013d 1315
40a90862
JMF
def str_or_none(v, default=None):
    """Convert v to compat_str, mapping None to default."""
    if v is None:
        return default
    return compat_str(v)
1318
9732d77e
PH
1319
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: ',', '.' and '+' characters
    are removed before the conversion; None is passed through """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', u'', int_str)
    return int(digits)
608d11f5
PH
1326
1327
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float scaled by invscale / scale; None maps to default."""
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1330
1331
608d11f5
PH
def parse_duration(s):
    """Parse a duration such as '1:02:03', '2m 3s' or '12.5 seconds'.

    Returns the number of seconds (an int, or a float when a fractional
    part is present), or None if s is None or cannot be parsed.
    """
    if s is None:
        return None

    m = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$',
        s.strip())
    if not m:
        return None

    total = int(m.group('secs'))
    for group_name, seconds_per_unit in (('mins', 60), ('hours', 60 * 60)):
        amount = m.group(group_name)
        if amount:
            total += int(amount) * seconds_per_unit
    fraction = m.group('ms')
    if fraction:
        # The group includes the leading '.', so it parses as a fraction
        total += float(fraction)
    return total
91d7d0b3
JMF
1350
1351
def prepend_extension(filename, ext):
    """Insert ext in front of the existing extension of filename."""
    stem, current_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(stem, ext, current_ext)
d70ad093
PH
1355
1356
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False if the binary could not be started. """
    # None instead of a shared mutable [] default
    command = [exe] + list(args or [])
    try:
        subprocess.Popen(
            command, stdout=subprocess.PIPE,
            stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1365
1366
class PagedList(object):
    """Lazily evaluated list whose items are fetched page by page.

    pagefunc(pagenum) must return an iterable with the items of the
    zero-based page pagenum; pagesize is the number of items on a full
    page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the items with indices in [start, end) as a list.

        With end=None, pages are fetched until one comes back short."""
        res = []
        # Begin with the page that contains index start
        for pagenum in itertools.count(start // self._pagesize):
            # Global indices of the first item on this and the next page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of start within this page (0 on every later page)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset of end within this page, or None if the slice runs
            # past this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1412
1413
def uppercase_escape(s):
    """Replace every literal \\UXXXXXXXX escape (8 hex digits) in s by the
    character it denotes."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns (text, length consumed); only the text matters
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
b53466e1 1420
d05cfe06
S
1421
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Python 2's quote() needs a byte string for non-ASCII input
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, unicode)
    if needs_encoding:
        s = s.encode('utf-8')
    # Reserved characters and '%' are kept so that existing structure and
    # already-escaped sequences survive unchanged.
    safe_chars = "%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1427
1428
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Only the path, params, query and fragment components are escaped."""
    parsed = compat_urllib_parse_urlparse(url)
    escaped_parts = dict(
        (component, escape_rfc3986(getattr(parsed, component)))
        for component in ('path', 'params', 'query', 'fragment'))
    return parsed._replace(**escaped_parts).geturl()
1438
b53466e1
PH
# struct_pack / struct_unpack: wrappers around struct.pack / struct.unpack
# that accept a text format specification on every supported Python.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        byte_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(byte_spec, *args)

    def struct_unpack(spec, *args):
        byte_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(byte_spec, *args)
else:
    # struct copes with text specs itself; use it directly
    struct_pack, struct_unpack = struct.pack, struct.unpack
62e609ab
PH
1455
1456
def read_batch_urls(batch_fd):
    """Read the URLs listed in a batch file and close the file.

    batch_fd is an iterable of lines (text or UTF-8 bytes). Surrounding
    whitespace and a leading byte order mark are removed; empty lines and
    lines starting with '#', ';' or ']' are skipped.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # The BOM shows up as u'\ufeff' when the data was decoded as UTF-8
        # and as u'\xef\xbb\xbf' when its raw bytes were decoded with a
        # single-byte encoding; strip() removes neither form.
        for bom in (u'\xef\xbb\xbf', u'\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1471
1472
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode() does and return the
    result as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1475
1476
0990305d
PH
# etree_iter(element): iterate over the nodes of an ElementTree element
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1481
1482
bcf89ce6
PH
def parse_xml(s):
    """Parse the XML document in the text string s and return its root
    element; doctype declarations are ignored."""
    class _DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    custom_parser = xml.etree.ElementTree.XMLParser(
        target=_DoctypeIgnoringBuilder())
    extra_args = {}
    if sys.version_info >= (2, 7):
        extra_args['parser'] = custom_parser
    root = xml.etree.ElementTree.XML(s.encode('utf-8'), **extra_args)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(root):
            text = node.text
            if text is not None and not isinstance(text, compat_str):
                node.text = text.decode('utf-8')
    return root
e68301af
PH
1498
1499
# compat_getpass: getpass.getpass that also accepts a text prompt on
# Python 2 under Windows, where the prompt has to be a byte string.
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    def compat_getpass(prompt, *args, **kwargs):
        encoded_prompt = prompt
        if isinstance(prompt, compat_str):
            encoded_prompt = prompt.encode(preferredencoding())
        return getpass.getpass(encoded_prompt, *args, **kwargs)
a1a530b0
PH
1507
1508
# Maps US film rating labels to a number.
# NOTE(review): the numbers look like minimum viewer ages - confirm
# against the code that consumes this table.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
1516
1517
def strip_jsonp(code):
    """Remove a JSONP wrapper such as 'callback(...);' and return the
    payload between the parentheses; other input is returned unchanged."""
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(jsonp_wrapper, r'\1', code)
478c2c61
PH
1520
1521
e05f6939
PH
def js_to_json(code):
    """Convert simple JavaScript object literals into JSON text.

    Keys become double-quoted, single-quoted string values become
    double-quoted, and a comma directly before a closing ']' is dropped.
    """
    def requote(token):
        # Turn a 'single-quoted' token into a "double-quoted" one
        assert token.endswith("'")
        assert '"' not in token
        return '"%s"' % token[1:-1]

    def fix_kv(m):
        key = m.group(2)
        if key.startswith("'"):
            key = requote(key)
        elif not key.startswith('"'):
            # Bare identifier
            key = '"%s"' % key

        value = m.group(4)
        if value.startswith("'"):
            value = requote(value)

        return m.group(1) + key + m.group(3) + value

    with_fixed_pairs = re.sub(r'''(?x)
        ([{,]\s*)
        ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
        (:\s*)
        ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), with_fixed_pairs)
1548
1549
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    Returns a function mapping a quality id to its index in quality_ids,
    or to -1 if the id is not in the list. """
    def q(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return q
1558
acd69589
PH
1559
# Default output file name template: a %-style format string with the
# named fields title, id and ext.
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68
PH
1561
# subprocess_check_output: subprocess.check_output, with a fallback for
# Python versions that lack it.
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    def subprocess_check_output(*args, **kwargs):
        """Run a command and return what it wrote to stdout.

        Raises subprocess.CalledProcessError (with the captured output in
        its output attribute) if the command exits with a non-zero status.
        """
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            # Popen objects have no .args attribute and CalledProcessError
            # takes no output argument on the old Pythons that need this
            # fallback, so recover the command from our own arguments and
            # attach the output afterwards.
            cmd = kwargs.get('args')
            if cmd is None:
                cmd = args[0]
            error = subprocess.CalledProcessError(ret, cmd)
            error.output = output
            raise error
        return output