]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[mixcloud] Use a HEAD request when checking if the url is valid
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
912b38b4 4import calendar
676eb3f2 5import codecs
62e609ab 6import contextlib
e3946f98 7import ctypes
c496ca96
PH
8import datetime
9import email.utils
f45c185f 10import errno
e68301af 11import getpass
d77c3dfd 12import gzip
b7ab0590 13import itertools
03f9daab 14import io
f4bfd65f 15import json
d77c3dfd 16import locale
02dbf93f 17import math
d77c3dfd 18import os
4eb7f1d1 19import pipes
c496ca96 20import platform
d77c3dfd 21import re
13ebea79 22import ssl
c496ca96 23import socket
b53466e1 24import struct
1c088fa8 25import subprocess
d77c3dfd 26import sys
181c8655 27import tempfile
01951dda 28import traceback
bcf89ce6 29import xml.etree.ElementTree
d77c3dfd 30import zlib
d77c3dfd 31
01ba00ca 32try:
59ae15a5 33 import urllib.request as compat_urllib_request
01ba00ca 34except ImportError: # Python 2
59ae15a5 35 import urllib2 as compat_urllib_request
01ba00ca
PH
36
37try:
59ae15a5 38 import urllib.error as compat_urllib_error
01ba00ca 39except ImportError: # Python 2
59ae15a5 40 import urllib2 as compat_urllib_error
01ba00ca
PH
41
42try:
59ae15a5 43 import urllib.parse as compat_urllib_parse
01ba00ca 44except ImportError: # Python 2
59ae15a5 45 import urllib as compat_urllib_parse
01ba00ca 46
799c0763
PH
47try:
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
51
6543f0dc
JMF
52try:
53 import urllib.parse as compat_urlparse
54except ImportError: # Python 2
55 import urlparse as compat_urlparse
56
01ba00ca 57try:
59ae15a5 58 import http.cookiejar as compat_cookiejar
01ba00ca 59except ImportError: # Python 2
59ae15a5 60 import cookielib as compat_cookiejar
01ba00ca 61
3e669f36 62try:
59ae15a5 63 import html.entities as compat_html_entities
9f37a959 64except ImportError: # Python 2
59ae15a5 65 import htmlentitydefs as compat_html_entities
3e669f36 66
a8156c1d 67try:
59ae15a5 68 import html.parser as compat_html_parser
9f37a959 69except ImportError: # Python 2
59ae15a5 70 import HTMLParser as compat_html_parser
a8156c1d 71
348d0a7a 72try:
59ae15a5 73 import http.client as compat_http_client
9f37a959 74except ImportError: # Python 2
59ae15a5 75 import httplib as compat_http_client
348d0a7a 76
2eabb802 77try:
0e283428 78 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
79except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
81
e0df6211
PH
82try:
83 from urllib.request import urlretrieve as compat_urlretrieve
84except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
86
87
5910e210
PH
88try:
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
91except ImportError:
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
93
try:
    from urllib.parse import unquote as compat_urllib_parse_unquote
except ImportError:
    def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string by the characters they encode.

        Fallback used when urllib.parse.unquote cannot be imported.
        Consecutive percent-encoded bytes are collected and decoded together
        with the given encoding / error handler (defaults: 'utf-8', 'replace').
        """
        if string == '':
            return string
        pieces = string.split('%')
        if len(pieces) == 1:
            # No percent sign at all, nothing to decode
            return string
        encoding = 'utf-8' if encoding is None else encoding
        errors = 'replace' if errors is None else errors
        # pending: contiguous run of percent-encoded bytes not yet decoded
        pending = b''
        string = pieces[0]
        for piece in pieces[1:]:
            try:
                if not piece:
                    raise ValueError
                pending += piece[:2].decode('hex')
                tail = piece[2:]
                if not tail:
                    # Only an escape in this piece; it may belong to a
                    # multi-byte sequence, so keep collecting before decoding.
                    continue
            except ValueError:
                tail = '%' + piece
            # Plain characters follow: flush the collected bytes first
            string += pending.decode(encoding, errors) + tail
            pending = b''
        if pending:
            # Flush whatever is left at the end of the input
            string += pending.decode(encoding, errors)
        return string
131
f1f725c6
PH
132
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs."""
        to_text = unicode
        fields = [part for chunk in qs.split('&') for part in chunk.split(';')]
        parsed = []
        for field in fields:
            if not field and not strict_parsing:
                continue
            key_val = field.split('=', 1)
            if len(key_val) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # Handle case of a control-name with no equal sign
                if not keep_blank_values:
                    continue
                key_val.append('')
            if len(key_val[1]) or keep_blank_values:
                name = compat_urllib_parse_unquote(
                    key_val[0].replace('+', ' '),
                    encoding=encoding, errors=errors)
                value = compat_urllib_parse_unquote(
                    key_val[1].replace('+', ' '),
                    encoding=encoding, errors=errors)
                parsed.append((to_text(name), to_text(value)))
        return parsed

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values."""
        collected = {}
        for name, value in _parse_qsl(qs, keep_blank_values, strict_parsing,
                                      encoding=encoding, errors=errors):
            collected.setdefault(name, []).append(value)
        return collected
348d0a7a 179
3e669f36 180try:
59ae15a5 181 compat_str = unicode # Python 2
3e669f36 182except NameError:
59ae15a5 183 compat_str = str
3e669f36
PH
184
185try:
59ae15a5 186 compat_chr = unichr # Python 2
3e669f36 187except NameError:
59ae15a5 188 compat_chr = chr
3e669f36 189
f7300c5c
JMF
190try:
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
194
8d31fa3c
PH
try:
    from shlex import quote as shlex_quote
except ImportError:  # Python < 3.3
    def shlex_quote(s):
        """Wrap s in single quotes, escaping any single quote it contains."""
        escaped = s.replace("'", "'\"'\"'")
        return "'" + escaped + "'"
200
201
b31756c1
FV
def compat_ord(c):
    """Return c unchanged if it is already an int, otherwise ord(c)."""
    return c if type(c) is int else ord(c)
205
468e2e92
FV
206# This is not clearly defined otherwise
207compiled_regex_type = type(re.compile(''))
208
3e669f36 209std_headers = {
ae8f7871 210 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
59ae15a5
PH
211 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
212 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
213 'Accept-Encoding': 'gzip, deflate',
214 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 215}
f427df17 216
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the lookup
    fails, or the reported encoding cannot encode a test string, 'UTF-8'
    is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        u'TEST'.encode(pref)
    except Exception:
        # Any failure (locale error, unknown codec, ...) falls back to UTF-8.
        # KeyboardInterrupt / SystemExit are deliberately not swallowed.
        pref = 'UTF-8'

    return pref
d77c3dfd 230
def compat_print(s):
    """Print the text s to standard output.

    On Python 2 the text is first encoded with the preferred encoding,
    replacing unencodable characters by XML character references. On
    Python 3 s must be a text string and is printed as is.
    """
    if sys.version_info < (3, 0):
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
    else:
        assert type(s) == type(u'')
        print(s)
d77c3dfd 238
f4bfd65f 239
181c8655
PH
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically

    The data is first written to a temporary file created next to fn and
    that file is then renamed to fn. If anything fails, the temporary file
    is removed (best effort) and the error is re-raised.
    """
    tmp_kwargs = dict(
        suffix='.tmp',
        prefix=os.path.basename(fn) + '.',
        dir=os.path.dirname(fn),
        delete=False,
    )

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        tmp_kwargs['mode'] = 'wb'
    else:
        tmp_kwargs['mode'] = 'w'
        tmp_kwargs['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**tmp_kwargs)

    try:
        with tf:
            json.dump(obj, tf)
        os.rename(tf.name, fn)
    except:  # deliberate catch-all: clean up, then re-raise unchanged
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
272
273
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression, so only a
        # restricted character set is accepted.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Return the first element matching xpath whose attribute key equals val, or None """
        candidates = (el for el in node.findall(xpath)
                      if el.attrib.get(key) == val)
        return next(candidates, None)
287
d7e66d39
JMF
288# On python2.6 the xml.etree.ElementTree.Element methods don't support
289# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path to '{namespace}tag' using ns_map.

    Steps without a prefix are kept unchanged. An unknown prefix raises
    KeyError.
    """
    def expand(step):
        parts = step.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([expand(step) for step in path.split('/')])
300
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity name
    without the leading '&' and trailing ';'.

    Named entities, decimal references (#47) and hexadecimal references
    (#x2F) are decoded; anything else is returned as the literal '&name;'.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hexadecimal references need the full 0-9a-f digit set; matching only
    # decimal digits after 'x' would truncate e.g. '#x2F' to code point 2.
    mobj = re.match(r'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 325
a8156c1d 326compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document it was fed.

    The raw document passed to loads() is kept in self.html so that
    subclasses can refer back to it.
    """

    def __init__(self):
        # Was misspelled '__init', so it never ran as the constructor and
        # self.html stayed undefined until loads() was called.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Store html, feed it to the parser and close the parser."""
        self.html = html
        self.feed(html)
        self.close()
336
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""

    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # result grows to [tag, start_pos, end_pos]; positions come from getpos()
        self.result = None
        self.started = False
        # Number of currently open elements per tag name, counted once started
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Give up once the target was found or after too many errors;
        otherwise drop the input up to the current line and keep parsing."""
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        current_line = self.getpos()[0]
        self.rawdata = '\n'.join(self.html.split('\n')[current_line:])  # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attr_map = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attr_map and attr_map[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The target element is closed once its own tag count is back to zero
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None when no complete element was recorded."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        start_pos, end_pos = self.result[1], self.result[2]
        chunk = self.html.split('\n')[start_pos[0] - 1:end_pos[0]]
        chunk[0] = chunk[0][start_pos[1]:]
        if len(chunk) == 1:
            chunk[-1] = chunk[-1][:end_pos[1] - start_pos[1]]
        chunk[-1] = chunk[-1][:end_pos[1]]
        return '\n'.join(chunk).strip()
3b024e17
PH
396# Hack for https://github.com/rg3/youtube-dl/issues/662
397if sys.version_info < (2, 7, 3):
398 AttrParser.parse_endtag = (lambda self, i:
399 i + len("</scr'+'ipt>")
400 if self.rawdata[i:].startswith("</scr'+'ipt>")
401 else compat_html_parser.HTMLParser.parse_endtag(self, i))
9e6dd238
FV
402
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    content = get_element_by_attribute("id", id, html)
    return content
406
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was collected before the parse error
        pass
    return finder.get_result()
9e6dd238 415
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """

    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attr_map = dict(attrs)
            if attr_map.get('name') == self.name:
                # A later matching tag overwrites an earlier one
                self.result = attr_map.get('content')

    def get_result(self):
        """Return the content attribute found so far, or None."""
        return self.result
436
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    meta_finder = MetaParser(name)
    try:
        meta_finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: keep what was found before the parse error
        pass
    return meta_finder.get_result()
447
9e6dd238
FV
448
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Newlines become spaces, <br> tags and paragraph boundaries become
    newlines, all remaining tags are removed and HTML entities are decoded.
    Returns None when html is None.
    """
    if html is None:
        # Passing on "nothing found" beats failing with AttributeError
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
9e6dd238
FV
460
461
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The special filename '-' selects standard output (its binary buffer
    when available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # Only the final path component is rewritten: replacing separators
        # or a drive colon in the directory part would point to another path.
        head, tail = os.path.split(filename)
        alt_tail = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', tail)
        if alt_tail == tail:
            # Nothing to sanitize, report the original error
            raise
        alt_filename = os.path.join(head, alt_tail)
        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
d77c3dfd
FV
495
496
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 504
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace():
                return '_'
            if code > 127:
                return '_'
        return char

    result = u''.join([replace_insane(char) for char in s])
    if is_id:
        return result

    # Collapse runs of underscores and trim them from both ends
    result = re.sub(r'__+', '_', result)
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
d77c3dfd
FV
536
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list keeping the first occurrence of every element, in order.
    Membership is checked by equality, so unhashable elements work too.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 544
912b38b4 545
def unescapeHTML(s):
    """Decode the HTML entities in the text string s; None is passed through."""
    if s is None:
        return None
    assert type(s) == compat_str

    # Every '&...;' span is handed to htmlentity_transform for decoding
    return re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd 553
8bf48f23
PH
554
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Whether the result is meant for a subprocess call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up can use Unicode APIs.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        # Pass u'' directly to use the Unicode APIs
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
581
f07b74fc
PH
582
def encodeArgument(s):
    """Encode a command line argument with encodeFilename(s, True).

    Byte strings are still accepted and are decoded as ASCII first.
    """
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        text = s.decode('ascii')
    return encodeFilename(text, True)
590
591
8271226a
PH
def decodeOption(optval):
    """Return the option value as text.

    None is passed through; byte strings are decoded with the preferred
    encoding.
    """
    if optval is None:
        return optval

    decoded = optval
    if isinstance(decoded, bytes):
        decoded = decoded.decode(preferredencoding())

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 600
4539dd30
PH
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The hour and minute fields appear as soon as they are non-zero, so
    exactly one minute is '1:00' and exactly one hour is '1:00:00'.
    """
    # '>=' so that the boundaries 60 and 3600 roll over to the next unit
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
608
a0ddb8a2
PH
609
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Create the HTTPS handler for the URL opener.

    opts_no_check_certificate disables certificate verification when true
    (only honoured on Python >= 3.2, where an SSLContext is available).
    Remaining keyword arguments are passed to the handler constructor.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except (ssl.SSLError, AttributeError):
                    # AttributeError: this ssl build has no PROTOCOL_SSLv3
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:
        # PROTOCOL_SSLv23 negotiates the best protocol both sides support.
        # PROTOCOL_SSLv3 is missing from current ssl builds and would limit
        # the connection to SSLv3.
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
ea6d901e 643
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        video_id, if given, is prepended to the message.
        """
        # Network errors and unavailable videos currently being handled are
        # never treated as bugs.
        active_exc_type = sys.exc_info()[0]
        if active_exc_type in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None."""
        if self.traceback is None:
            return None
        frames = traceback.format_tb(self.traceback)
        return u''.join(frames)
668
1c256f70 669
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
673
674
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
686
687
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
695
696
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # The message is also kept on the instance as .msg
        setattr(self, 'msg', msg)
d77c3dfd
FV
705
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
709
710
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
d77c3dfd
FV
718
719
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 734
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress data as a raw deflate stream, falling back to a
        zlib-wrapped stream."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code, whether
        or not addinfourl accepts it in its constructor."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        wrapped = compat_urllib_request.addinfourl(stream, headers, url)
        wrapped.code = code
        return wrapped

    def http_request(self, req):
        # Add every standard header the request does not set itself
        for header, value in std_headers.items():
            if header not in req.headers:
                req.add_header(header, value)
        # Internal marker: ask for an uncompressed response
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']
        # Internal marker: override the User-Agent header
        if 'Youtubedl-user-agent' in req.headers:
            req.headers.pop('User-agent', None)
            req.headers['User-agent'] = req.headers.pop('Youtubedl-user-agent')
        return req

    def http_response(self, req, resp):
        original = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            payload = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(payload), mode='rb')
            try:
                unpacked = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off
                for cut in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(payload[:-cut]), mode='rb')
                        unpacked = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(unpacked, original.headers, original.url, original.code)
            resp.msg = original.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, original.headers, original.url, original.code)
            resp.msg = original.msg
        return resp

    https_request = http_request
    https_response = http_response
bf50b038 814
5de90176 815
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date

    date_str has the form YYYY-MM-DD<delimiter>HH:MM:SS, optionally followed
    by 'Z' or a numeric UTC offset such as +0100 or -05:00. None is passed
    through.
    """
    if date_str is None:
        return None

    tz_match = re.search(
        r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
        date_str)
    utc_offset = datetime.timedelta()
    if tz_match:
        # Cut the timezone suffix off before parsing the date itself
        date_str = date_str[:-len(tz_match.group(0))]
        if tz_match.group('sign'):
            direction = 1 if tz_match.group('sign') == '+' else -1
            utc_offset = datetime.timedelta(
                hours=direction * int(tz_match.group('hours')),
                minutes=direction * int(tz_match.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    local_dt = datetime.datetime.strptime(date_str, date_format)
    # Subtracting the offset converts the local time to UTC
    return calendar.timegm((local_dt - utc_offset).timetuple())
839
840
bf50b038
JMF
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    Returns None if date_str is None or matches none of the known formats.
    """
    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    known_formats = (
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    )
    upload_date = None
    # Every format is tried; if several match, the last one wins
    for fmt in known_formats:
        try:
            upload_date = datetime.datetime.strptime(date_str, fmt).strftime('%Y%m%d')
        except ValueError:
            continue
    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
886
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url.

    The text after the last dot of the part before any '?' is returned if
    it is purely alphanumeric; otherwise (or when url is None) default_ext.
    """
    if url is None:
        return default_ext
    before_query = url.partition(u'?')[0]
    candidate = before_query.rpartition(u'.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
73e79f2a 895
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '.<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
898
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months count as 30 days and years as 365 days. A string in neither
    form raises ValueError (from strptime)."""
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' in a plain string literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword names (days, weeks)
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
924
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not in that format are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
933
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A missing bound defaults to the earliest / latest representable date.
        Raises ValueError if start is after end."""
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not a date object is parsed as a date string
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
c496ca96
PH
959
960
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # A byte-string result is decoded with the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
969
970
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is WriteConsoleW from kernel32; it is only used when
    out's file descriptor is 1 or 2 and the matching handle is a console.
    Raises OSError if WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (presumably STD_OUTPUT_HANDLE / STD_ERROR_HANDLE - confirm)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fd = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fd not in WIN_OUTPUT_IDS:
        return False

    wintypes = ctypes.wintypes
    kernel32 = ctypes.windll.kernel32

    GetStdHandle = ctypes.WINFUNCTYPE(
        wintypes.HANDLE, wintypes.DWORD)(("GetStdHandle", kernel32))
    console = GetStdHandle(WIN_OUTPUT_IDS[fd])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        wintypes.BOOL, wintypes.HANDLE, wintypes.LPWSTR,
        wintypes.DWORD, ctypes.POINTER(wintypes.DWORD),
        wintypes.LPVOID)(("WriteConsoleW", kernel32))
    written = wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(
        wintypes.DWORD, wintypes.DWORD)(("GetFileType", kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        wintypes.BOOL, wintypes.HANDLE,
        ctypes.POINTER(wintypes.DWORD))(("GetConsoleMode", kernel32))
    INVALID_HANDLE_VALUE = wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        # The REMOTE bit is masked out before comparing the file type
        if (GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR:
            return True
        return GetConsoleMode(handle, ctypes.byref(wintypes.DWORD())) == 0

    if not_a_console(console):
        return False

    def next_nonbmp_pos(text):
        # Index of the first character above U+FFFF, or len(text) if none
        for idx, ch in enumerate(text):
            if ord(ch) > 0xffff:
                return idx
        return len(text)

    while s:
        # Write at most 1024 characters, stopping before a non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            console, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1041
1042
def write_string(s, out=None, encoding=None):
    """ Write the text s to out (sys.stderr when out is None) and flush it.

    On win32, when no encoding is given and out has a fileno, the string is
    first offered to _windows_write_string; if that reports success nothing
    else is done. Otherwise s is encoded (dropping unencodable characters)
    for binary-mode streams, on Python 2, and for streams exposing a
    .buffer; any other stream receives s unchanged. """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    use_console_api = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if use_console_api and _windows_write_string(s, out):
        return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1063
1064
48ea9cea
PH
def bytes_to_intlist(bs):
    """ Convert a byte string to a list of integer byte values;
    a false-y input gives an empty list. """
    if not bs:
        return []
    # Indexing yields ints on Python 3 and 1-character strings otherwise
    if not isinstance(bs[0], int):
        return [ord(c) for c in bs]
    return list(bs)
1072
c257baff 1073
def intlist_to_bytes(xs):
    """ Convert a list of integer byte values to a byte string;
    a false-y input gives b''. """
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):  # Python 3
        return bytes(xs)
    # Python 2: chr() already produces one-byte strings
    return ''.join([chr(x) for x in xs])
c38b1e77
PH
1081
1082
def get_cachedir(params=None):
    """ Return the cache directory to use.

    params is an options mapping (None is treated as empty). Its 'cachedir'
    entry wins when present; otherwise the result is 'youtube-dl' inside
    $XDG_CACHE_HOME, or inside ~/.cache when that variable is unset. """
    if params is None:
        # Avoid a shared mutable default argument
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
c1c9a79c
PH
1087
1088
1089# Cross-platform file locking
1090if sys.platform == 'win32':
1091 import ctypes.wintypes
1092 import msvcrt
1093
1094 class OVERLAPPED(ctypes.Structure):
1095 _fields_ = [
1096 ('Internal', ctypes.wintypes.LPVOID),
1097 ('InternalHigh', ctypes.wintypes.LPVOID),
1098 ('Offset', ctypes.wintypes.DWORD),
1099 ('OffsetHigh', ctypes.wintypes.DWORD),
1100 ('hEvent', ctypes.wintypes.HANDLE),
1101 ]
1102
1103 kernel32 = ctypes.windll.kernel32
1104 LockFileEx = kernel32.LockFileEx
1105 LockFileEx.argtypes = [
1106 ctypes.wintypes.HANDLE, # hFile
1107 ctypes.wintypes.DWORD, # dwFlags
1108 ctypes.wintypes.DWORD, # dwReserved
1109 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1110 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1111 ctypes.POINTER(OVERLAPPED) # Overlapped
1112 ]
1113 LockFileEx.restype = ctypes.wintypes.BOOL
1114 UnlockFileEx = kernel32.UnlockFileEx
1115 UnlockFileEx.argtypes = [
1116 ctypes.wintypes.HANDLE, # hFile
1117 ctypes.wintypes.DWORD, # dwReserved
1118 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1119 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1120 ctypes.POINTER(OVERLAPPED) # Overlapped
1121 ]
1122 UnlockFileEx.restype = ctypes.wintypes.BOOL
1123 whole_low = 0xffffffff
1124 whole_high = 0x7fffffff
1125
1126 def _lock_file(f, exclusive):
1127 overlapped = OVERLAPPED()
1128 overlapped.Offset = 0
1129 overlapped.OffsetHigh = 0
1130 overlapped.hEvent = 0
1131 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1132 handle = msvcrt.get_osfhandle(f.fileno())
1133 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1134 whole_low, whole_high, f._lock_file_overlapped_p):
1135 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1136
1137 def _unlock_file(f):
1138 assert f._lock_file_overlapped_p
1139 handle = msvcrt.get_osfhandle(f.fileno())
1140 if not UnlockFileEx(handle, 0,
1141 whole_low, whole_high, f._lock_file_overlapped_p):
1142 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1143
1144else:
1145 import fcntl
1146
1147 def _lock_file(f, exclusive):
1148 fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1149
1150 def _unlock_file(f):
1151 fcntl.lockf(f, fcntl.LOCK_UN)
1152
1153
class locked_file(object):
    """ File opened with io.open that is locked while used as a context
    manager: a shared lock for mode 'r', an exclusive one for 'a' and 'w'.
    The file is closed when the context exits. """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ('r', 'a', 'w')
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            # Only readers share the lock
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leave the file open when locking failed
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1183
1184
def shell_quote(args):
    """ Return the arguments in args as one shell-escaped, space-separated
    text string. Byte-string arguments are decoded with the file system
    encoding (utf-8 if that is unknown) before quoting. """
    # shlex.quote is the documented API; pipes.quote is only a fallback for
    # interpreters that lack it
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
9d4660ca
PH
1196
1197
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
    for item in seq:
        yield item
        if pred(item):
            continue
        # item was the first one failing pred; it has been yielded, so stop
        break
1205
1206
9d4660ca
PH
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialized as JSON and appended to url as a URL-encoded
    '#__youtubedl_smuggle=...' fragment. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
1213
1214
def unsmuggle_url(smug_url, default=None):
    """ Split a URL carrying a '#__youtubedl_smuggle' fragment into
    (url, data), with data decoded from JSON. URLs without such a fragment
    are returned unchanged together with default. """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition(u'#')
    encoded = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
02dbf93f
PH
1222
1223
02dbf93f
PH
def format_bytes(bytes):
    """ Format a byte count with a binary unit suffix, e.g. u'1.50KiB'.

    bytes may be a number, a numeric str, or None (formatted as u'N/A').
    Values smaller than one byte use the 'B' suffix and values beyond the
    largest known suffix are expressed in 'YiB'. """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    magnitude = abs(bytes)
    if magnitude < 1.0:
        # Covers zero and avoids the negative exponents produced by the
        # logarithm of a fraction, which would index suffixes from the end
        exponent = 0
    else:
        # Clamp so that huge values do not run past the suffix table
        exponent = min(int(math.log(magnitude, 1024.0)), len(suffixes) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
f53c966a 1236
1c088fa8 1237
1c088fa8
PH
def get_term_width():
    """ Return the terminal width in columns, or None if it is unknown.

    A non-empty COLUMNS environment variable is used when it holds an
    integer; otherwise the output of 'stty size' is consulted. """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Malformed COLUMNS value, ask stty instead

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        # The column count is the second field of the output
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, no tty, unexpected output), but
        # unlike a bare except this lets KeyboardInterrupt/SystemExit through
        return None
caefb1de
PH
1252
1253
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name,
    or None if name is not a full English month name """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    try:
        position = ENGLISH_NAMES.index(name)
    except ValueError:
        return None
    # Month numbers are 1-based
    return position + 1
18258362
JMF
1264
1265
def fix_xml_ampersands(xml_str):
    """Replace by '&amp;' every '&' in XML that does not already start one of
    the predefined entities or a numeric character reference"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, u'&amp;', xml_str)
e3946f98
PH
1272
1273
def setproctitle(title):
    """ Pass title (UTF-8 encoded) to prctl() from libc.so.6.

    Returns silently when that library cannot be loaded or does not expose
    prctl. """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    encoded = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(encoded))
    buf.value = encoded
    try:
        # NOTE(review): option 15 is presumably PR_SET_NAME - confirm
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1287
1288
def remove_start(s, start):
    """ Return s without the prefix start; s is returned unchanged when it
    does not begin with start. """
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1293
1294
2b9faf55
PH
def remove_end(s, end):
    """ Return s without the suffix end; s is returned unchanged when it
    does not finish with end or when end is empty. """
    # An empty suffix must be special-cased: s[:-0] is s[:0], i.e. ''
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1299
1300
def url_basename(url):
    """ Return the last segment of the path component of url, ignoring
    leading and trailing slashes (u'' when the path is empty). """
    path = compat_urlparse.urlparse(url).path
    return path.strip(u'/').rsplit(u'/', 1)[-1]
aa94a6d3
PH
1304
1305
class HEADRequest(compat_urllib_request.Request):
    """ Request whose HTTP method is always HEAD """

    def get_method(self):
        return "HEAD"
7217e148
PH
1309
1310
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to an int, multiplied by invscale and floor-divided by
    scale; return default when v is None or the empty string.

    When get_attr is given, the attribute of that name is read from v first
    (a missing attribute counts as None). """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1318
9572013d 1319
40a90862
JMF
def str_or_none(v, default=None):
    """ Return v converted with compat_str, or default when v is None. """
    if v is None:
        return default
    return compat_str(v)
1322
9732d77e
PH
1323
def str_to_int(int_str):
    """ Parse an integer from a string after removing every ',' and '.'
    from it; None is passed through. """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.]', u'', int_str)
    return int(digits)
608d11f5
PH
1329
1330
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to a float, multiplied by invscale and divided by scale;
    return default when v is None. """
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1333
1334
608d11f5
PH
def parse_duration(s):
    """ Parse a duration such as '1:02:03', '3h11m53s' or '12.5' into a
    number of seconds (a float when a fractional part is present).
    Returns None when s is None or does not match. """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?(?P<ms>\.[0-9]+)?$', s)
    if not m:
        return None
    total = int(m.group('secs'))
    # Add the optional larger units, converted to seconds
    for group_name, factor in (('mins', 60), ('hours', 60 * 60)):
        if m.group(group_name):
            total += int(m.group(group_name)) * factor
    fraction = m.group('ms')
    if fraction:
        total += float(fraction)
    return total
91d7d0b3
JMF
1351
1352
def prepend_extension(filename, ext):
    """ Insert ext in front of filename's existing extension, e.g.
    'a.mp4' with 'temp' gives 'a.temp.mp4'. """
    stem, original_ext = os.path.splitext(filename)
    template = u'{0}.{1}{2}'
    return template.format(stem, ext, original_ext)
d70ad093
PH
1356
1357
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).

    The binary is actually run with args; False is returned when starting
    it raises OSError. """
    # args=None avoids a shared mutable default; list() also accepts tuples
    cmd = [exe] + list(args or [])
    try:
        subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1366
1367
class PagedList(object):
    """ List-like view over results that are fetched page by page.

    pagefunc(pagenum) must return an iterable with the entries of the
    (0-based) page pagenum; every page but the last is expected to hold
    exactly pagesize entries. """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """ Return the entries with index in [start, end) as a list,
        fetching only the pages needed; end=None means up to the end. """
        size = self._pagesize
        collected = []
        for pagenum in itertools.count(start // size):
            first = pagenum * size
            next_first = first + size
            if start >= next_first:
                continue

            page = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            if first <= start < next_first:
                lo = start % size
            else:
                lo = 0

            # Offset just past the last wanted entry, if the slice ends here
            if end is not None and first <= end <= next_first:
                hi = ((end - 1) % size) + 1
            else:
                hi = None

            if lo != 0 or hi is not None:
                page = page[lo:hi]
            collected.extend(page)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page) + lo < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == next_first:
                break
        return collected
81c2f20b
PH
1413
1414
def uppercase_escape(s):
    """ Replace every literal \\UXXXXXXXX escape (8 hex digits) found in s by
    the character it denotes. """
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns (text, consumed_length)
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
b53466e1
PH
1421
# struct_pack / struct_unpack: struct.pack / struct.unpack that also accept a
# text format specification where struct itself rejects one
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1438
1439
def read_batch_urls(batch_fd):
    """ Read URLs, one per line, from the file-like object batch_fd and
    close it afterwards.

    Byte lines are decoded as UTF-8, a leading byte order mark and
    surrounding whitespace are removed, and empty lines as well as lines
    starting with '#', ';' or ']' are skipped. Returns a list of URLs. """
    # A UTF-8 BOM reads as U+FEFF when the data was decoded as UTF-8, and as
    # the three characters \xef\xbb\xbf when it was decoded as latin-1
    BOMS = (u'\xef\xbb\xbf', u'\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1454
1455
def urlencode_postdata(*args, **kargs):
    """ URL-encode the arguments like urlencode does and return the result
    as ASCII bytes. """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1458
1459
0990305d
PH
# etree_iter(node): iterate over the elements of an ElementTree node
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1464
1465
bcf89ce6
PH
def parse_xml(s):
    """ Parse the XML document in the text string s and return its root
    element; doctype declarations are ignored. """
    class _DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=_DoctypeIgnoringBuilder())
    # The custom parser is only handed over on Python 2.7 and newer
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None and not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')
    return tree
e68301af
PH
1481
1482
# compat_getpass: getpass.getpass, except on Python 2 under win32 where a
# text prompt is first encoded with the preferred encoding
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
a1a530b0
PH
1490
1491
# Maps US rating labels to integers
# (presumably minimum viewer ages - confirm against callers)
US_RATINGS = {
    'G': 0, 'PG': 10, 'PG-13': 13,
    'R': 16, 'NC': 18,
}
fac55558
PH
1499
1500
def strip_jsonp(code):
    """ Remove a JSONP wrapper such as 'callback(...);' from code and return
    what was between the parentheses; text that is not wrapped that way is
    returned unchanged. """
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(jsonp_wrapper, r'\1', code)
478c2c61
PH
1503
1504
e05f6939
PH
def js_to_json(code):
    """ Convert a JavaScript object literal to JSON text on a best-effort
    basis: keys that follow '{' or ',' get double quotes, single-quoted keys
    and values are re-quoted with double quotes, and a comma directly before
    a closing ']' is dropped. """
    def requote(token):
        # token is known to start with a single quote
        assert token.endswith("'")
        assert '"' not in token
        return '"%s"' % token[1:-1]

    def fix_kv(m):
        key = m.group(2)
        if key.startswith("'"):
            key = requote(key)
        elif not key.startswith('"'):
            key = '"%s"' % key

        value = m.group(4)
        if value.startswith("'"):
            value = requote(value)

        return m.group(1) + key + m.group(3) + value

    res = re.sub(r'''(?x)
        ([{,]\s*)
        ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
        (:\s*)
        ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
    ''', fix_kv, code)
    # Drop the comma in front of a closing bracket
    return re.sub(r',(\s*\])', r'\1', res)
1531
1532
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values.

    Returns a function mapping a quality id to its index in quality_ids,
    or to -1 when the id is not in the list. """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            position = -1
        return position
    return q
1541
acd69589
PH
1542
# Default output file name template; a %-style format string with the named
# fields title, id and ext
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68
PH
1544
# subprocess_check_output: subprocess.check_output where it exists, otherwise
# an equivalent built on Popen
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    def subprocess_check_output(*args, **kwargs):
        """ Run a command and return what it wrote to stdout.

        Arguments are those of subprocess.Popen, except stdout. Raises
        subprocess.CalledProcessError (carrying the output in .output) if
        the command exits with a non-zero status. """
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            # This fallback runs on interpreters without check_output, where
            # Popen has no .args attribute and CalledProcessError takes no
            # output argument; recover the command from the call arguments
            # and attach the output afterwards.
            cmd = kwargs.get('args')
            if cmd is None:
                cmd = args[0]
            error = subprocess.CalledProcessError(ret, cmd)
            error.output = output
            raise error
        return output