]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
renamed for consistency
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
912b38b4 4import calendar
676eb3f2 5import codecs
62e609ab 6import contextlib
e3946f98 7import ctypes
c496ca96
PH
8import datetime
9import email.utils
f45c185f 10import errno
e68301af 11import getpass
d77c3dfd 12import gzip
b7ab0590 13import itertools
03f9daab 14import io
f4bfd65f 15import json
d77c3dfd 16import locale
02dbf93f 17import math
d77c3dfd 18import os
4eb7f1d1 19import pipes
c496ca96 20import platform
d77c3dfd 21import re
13ebea79 22import ssl
c496ca96 23import socket
b53466e1 24import struct
1c088fa8 25import subprocess
d77c3dfd 26import sys
181c8655 27import tempfile
01951dda 28import traceback
bcf89ce6 29import xml.etree.ElementTree
d77c3dfd 30import zlib
d77c3dfd 31
01ba00ca 32try:
59ae15a5 33 import urllib.request as compat_urllib_request
01ba00ca 34except ImportError: # Python 2
59ae15a5 35 import urllib2 as compat_urllib_request
01ba00ca
PH
36
37try:
59ae15a5 38 import urllib.error as compat_urllib_error
01ba00ca 39except ImportError: # Python 2
59ae15a5 40 import urllib2 as compat_urllib_error
01ba00ca
PH
41
42try:
59ae15a5 43 import urllib.parse as compat_urllib_parse
01ba00ca 44except ImportError: # Python 2
59ae15a5 45 import urllib as compat_urllib_parse
01ba00ca 46
799c0763
PH
47try:
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
51
6543f0dc
JMF
52try:
53 import urllib.parse as compat_urlparse
54except ImportError: # Python 2
55 import urlparse as compat_urlparse
56
01ba00ca 57try:
59ae15a5 58 import http.cookiejar as compat_cookiejar
01ba00ca 59except ImportError: # Python 2
59ae15a5 60 import cookielib as compat_cookiejar
01ba00ca 61
3e669f36 62try:
59ae15a5 63 import html.entities as compat_html_entities
9f37a959 64except ImportError: # Python 2
59ae15a5 65 import htmlentitydefs as compat_html_entities
3e669f36 66
a8156c1d 67try:
59ae15a5 68 import html.parser as compat_html_parser
9f37a959 69except ImportError: # Python 2
59ae15a5 70 import HTMLParser as compat_html_parser
a8156c1d 71
348d0a7a 72try:
59ae15a5 73 import http.client as compat_http_client
9f37a959 74except ImportError: # Python 2
59ae15a5 75 import httplib as compat_http_client
348d0a7a 76
2eabb802 77try:
0e283428 78 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
79except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
81
e0df6211
PH
82try:
83 from urllib.request import urlretrieve as compat_urlretrieve
84except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
86
87
5910e210
PH
88try:
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
91except ImportError:
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
93
try:
    from urllib.parse import unquote as compat_urllib_parse_unquote
except ImportError:  # Python 2
    def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string by the characters they encode.

        Python 2 stand-in for Python 3's urllib.parse.unquote.  Runs of
        consecutive percent-encoded bytes are decoded together with
        `encoding`/`errors` so multi-byte characters survive.  Malformed
        escapes are left in the output verbatim.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except (TypeError, ValueError):
                # Python 2's hex codec reports bad or odd-length input with
                # TypeError, so both types must be caught to keep a malformed
                # escape as literal text instead of crashing.
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string
131
f1f725c6
PH
132
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Split a query string into a list of (name, value) unicode pairs."""

        def _decode(component):
            # '+' means space in query strings; then undo percent-encoding.
            component = component.replace('+', ' ')
            component = compat_urllib_parse_unquote(
                component, encoding=encoding, errors=errors)
            return unicode(component)

        result = []
        for chunk in qs.split('&'):
            for field in chunk.split(';'):
                if not field and not strict_parsing:
                    continue
                parts = field.split('=', 1)
                if len(parts) != 2:
                    if strict_parsing:
                        raise ValueError("bad query field: %r" % (field,))
                    # Handle case of a control-name with no equal sign
                    if not keep_blank_values:
                        continue
                    parts.append('')
                if keep_blank_values or len(parts[1]):
                    decoded_name = _decode(parts[0])
                    decoded_value = _decode(parts[1])
                    result.append((decoded_name, decoded_value))
        return result

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to its list of values."""
        grouped = {}
        for name, value in _parse_qsl(qs, keep_blank_values, strict_parsing,
                                      encoding=encoding, errors=errors):
            grouped.setdefault(name, []).append(value)
        return grouped
348d0a7a 179
3e669f36 180try:
59ae15a5 181 compat_str = unicode # Python 2
3e669f36 182except NameError:
59ae15a5 183 compat_str = str
3e669f36
PH
184
185try:
59ae15a5 186 compat_chr = unichr # Python 2
3e669f36 187except NameError:
59ae15a5 188 compat_chr = chr
3e669f36 189
f7300c5c
JMF
190try:
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
194
b31756c1
FV
def compat_ord(c):
    """Return the integer code of c.

    Values that are already plain ints (e.g. items of a Python 3 bytes
    object) are returned unchanged; anything else is passed to ord().
    """
    return c if type(c) is int else ord(c)
198
468e2e92
FV
199# This is not clearly defined otherwise
200compiled_regex_type = type(re.compile(''))
201
3e669f36 202std_headers = {
ae8f7871 203 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
59ae15a5
PH
204 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
205 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
206 'Accept-Encoding': 'gzip, deflate',
207 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 208}
f427df17 209
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or the reported encoding cannot actually encode text,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec; an unknown name raises LookupError here.
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (any failure means "fall back"), but no longer a
        # bare except, so KeyboardInterrupt/SystemExit still propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 223
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the unicode string s (Python 3: stdout handles encoding)."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print the unicode string s, encoded for the local terminal.

        Characters the preferred encoding cannot represent are emitted as
        XML character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
d77c3dfd 231
f4bfd65f 232
181c8655
PH
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically.

    The data is first written to a temporary file created next to fn and
    then moved over fn, so readers never observe a partially written file.
    If anything fails, the temporary file is removed and the error is
    re-raised.
    """

    args = {
        'suffix': '.tmp',
        'prefix': os.path.basename(fn) + '.',
        'dir': os.path.dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**args)

    try:
        with tf:
            json.dump(obj, tf)
        # os.rename refuses to overwrite an existing destination on Windows;
        # os.replace (Python 3.3+) overwrites on every platform.  Fall back to
        # os.rename where os.replace does not exist.
        move = getattr(os, 'replace', os.rename)
        move(tf.name, fn)
    except BaseException:
        # Best-effort cleanup of the temporary file, then propagate.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
265
266
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        # ElementTree on 2.6 lacks attribute predicates, so filter by hand.
        candidates = (el for el in node.findall(xpath)
                      if el.attrib.get(key) == val)
        return next(candidates, None)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # Only let through keys/values that are safe to splice into the
        # predicate below.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        return node.find(u"%s[@%s='%s']" % (xpath, key, val))
280
d7e66d39
JMF
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path into '{uri}tag' using ns_map.

    Steps without a prefix are left untouched.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([expand(step) for step in path.split('/')])
293
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.  Group 1 of the match must hold the entity body
    (the text between '&' and ';').

    Named entities, decimal references (#47) and hexadecimal references
    (#x2F / #X2f) are decoded; anything else is returned in its literal
    '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hex references may contain a-f, which a plain \d+ would silently
    # truncate (e.g. '#x2F' used to be read as 0x2).
    mobj = re.match(r'#([xX][0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            codepoint = int(numstr[1:], 16)
        else:
            codepoint = int(numstr, 10)
        try:
            return compat_chr(codepoint)
        except (ValueError, OverflowError):
            # Not a valid code point: fall through to the literal form.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 318
a8156c1d 319compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document it was asked to parse."""

    def __init__(self):
        # Was misspelled '__init', which Python name-mangles into an ordinary
        # private method that is never invoked as the constructor.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Store html on the instance, then feed it through the parser and close."""
        self.html = html
        self.feed(html)
        self.close()
329
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""

    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # result grows to [tag, start_pos, end_pos] as parsing proceeds
        self.result = None
        self.started = False
        # open-tag counters per tag name, tracked while inside the target
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Tolerate a limited number of parse errors before the target tag is found."""
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        remaining_lines = self.html.split('\n')[self.getpos()[0]:]
        self.rawdata = '\n'.join(remaining_lines)  # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The target element is closed once its own tag count is back to zero.
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end positions, or None."""
        if self.result is None or len(self.result) != 3:
            return None
        _tag, (start_line, start_col), (end_line, end_col) = self.result
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            # Start and end share a line: the end column must be shifted by
            # what was just cut off the front.
            lines[-1] = lines[-1][:end_col - start_col]
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
3b024e17
PH
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    def _attrparser_parse_endtag(self, i):
        # Step over the literal "</scr'+'ipt>" sequence instead of handing it
        # to the stock end-tag parser.
        marker = "</scr'+'ipt>"
        if self.rawdata[i:].startswith(marker):
            return i + len(marker)
        return compat_html_parser.HTMLParser.parse_endtag(self, i)

    AttrParser.parse_endtag = _attrparser_parse_endtag
9e6dd238
FV
395
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the passed HTML document"""
    content = get_element_by_attribute("id", id, html)
    return content
399
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag carrying attribute=value in the passed HTML document.

    Parse errors are ignored; whatever the parser collected up to that
    point is returned (None when nothing complete was found).
    """
    attr_parser = AttrParser(attribute, value)
    try:
        attr_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: fall through with the partial state.
        pass
    return attr_parser.get_result()
9e6dd238 408
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """

    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attributes = dict(attrs)
            if attributes.get('name') == self.name:
                # A later matching <meta> overwrites an earlier one.
                self.result = attributes.get('content')

    def get_result(self):
        """Return the content attribute recorded by handle_starttag, or None."""
        return self.result
429
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.

    Parse errors are ignored; the value found so far (possibly None) is
    returned.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: keep whatever was found before the error.
        pass
    return meta_parser.get_result()
440
9e6dd238
FV
441
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Source newlines carry no meaning in HTML; real line breaks come from
    # <br> and from paragraph boundaries.
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
453
454
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly (replacing characters that are forbidden on
    win32 with '#') and opens that instead; if that fails too, the
    exception propagates like with the standard open() function.

    The special name u'-' selects standard output (its binary buffer when
    available).  Permission errors (EACCES) are never retried.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # os.path.join takes the components as separate arguments, so the
        # sanitized parts must be unpacked (passing a generator raised
        # TypeError and hid the original error).
        # NOTE(review): the directory part is sanitized as one string, so
        # separators inside it are replaced as well - confirm this is wanted.
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)
        ])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller.
            # Open the sanitized name; reopening `filename` would just fail
            # again in the same way.
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
488
489
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 497
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    result = u''.join([replace_insane(char) for char in s])
    if is_id:
        # IDs are kept exactly as produced by the per-character pass.
        return result

    # Collapse runs of underscores and trim them from both ends.
    result = re.sub(r'_{2,}', '_', result).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
d77c3dfd
FV
529
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, keeping first occurrences in order """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 537
912b38b4 538
def unescapeHTML(s):
    """Replace the HTML entities in the text string s; None is passed through."""
    if s is None:
        return None
    assert type(s) == compat_str

    # Every '&...;' run is handed to htmlentity_transform for decoding.
    return re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd 546
8bf48f23
PH
547
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Whether the result is meant for a subprocess call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
574
f07b74fc
PH
575
def encodeArgument(s):
    """Encode the command-line argument s via encodeFilename in subprocess mode."""
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        text = s.decode('ascii')
    return encodeFilename(text, True)
583
584
8271226a
PH
def decodeOption(optval):
    """Return the option value optval as text; None is passed through.

    Byte strings are decoded with the preferred encoding.
    """
    if optval is not None:
        if isinstance(optval, bytes):
            optval = optval.decode(preferredencoding())

        assert isinstance(optval, compat_str)
    return optval
1c256f70 593
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that fits is used.  Exactly one hour is '1:00:00'
    and exactly one minute is '1:00' (the thresholds are inclusive).
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
601
a0ddb8a2
PH
602
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build the urllib HTTPS handler.

    opts_no_check_certificate -- when true, server certificates are not
        verified (only honoured on Python >= 3.2, where an SSLContext is
        available).
    kwargs -- passed through to the HTTPSHandler constructor.

    Returns an HTTPSHandler instance.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Legacy path kept as-is: try SSLv3, then let the library
                # negotiate.
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:
        # PROTOCOL_SSLv23 negotiates the highest protocol version both sides
        # support.  Pinning PROTOCOL_SSLv3 forced a broken protocol and fails
        # outright on ssl builds that no longer define that constant.
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
ea6d901e 636
1c256f70
PH
637class ExtractorError(Exception):
638 """Error during info extraction."""
d11271dd 639 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
9a82b238
PH
640 """ tb, if given, is the original traceback (so that it can be printed out).
641 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
642 """
643
644 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
645 expected = True
d11271dd
PH
646 if video_id is not None:
647 msg = video_id + ': ' + msg
9a82b238 648 if not expected:
298f833b 649 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
1c256f70 650 super(ExtractorError, self).__init__(msg)
d5979c5d 651
1c256f70 652 self.traceback = tb
8cc83b8d 653 self.exc_info = sys.exc_info() # preserve original exception
2eabb802 654 self.cause = cause
d11271dd 655 self.video_id = video_id
1c256f70 656
01951dda
PH
657 def format_traceback(self):
658 if self.traceback is None:
659 return None
660 return u''.join(traceback.format_tb(self.traceback))
661
1c256f70 662
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
666
667
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept so callers can inspect or re-raise the underlying error.
        self.exc_info = exc_info
d77c3dfd
FV
679
680
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
688
689
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        """ msg is the error description; it is also available as self.msg. """
        # Initialise the base class so str(e) and e.args carry the message
        # (they were empty before).
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd
FV
698
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
702
703
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
d77c3dfd
FV
711
712
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Byte counts: what was actually received vs. what was announced.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 727
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress data, trying a raw deflate stream first and a zlib-wrapped one second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Create an addinfourl response carrying a status code.

        When addinfourl has no getcode attribute, the constructor is called
        without the code and the code is set afterwards.
        """
        response_cls = compat_urllib_request.addinfourl
        if hasattr(response_cls, 'getcode'):
            return response_cls(stream, headers, url, code)
        wrapped = response_cls(stream, headers, url)
        wrapped.code = code
        return wrapped

    def http_request(self, req):
        # Standard headers always win over whatever the request carried.
        for name, value in std_headers.items():
            req.headers.pop(name, None)
            req.add_header(name, value)
        # Internal marker header: drop Accept-encoding, then the marker.
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']
        # Internal marker header: its value becomes the real User-agent.
        if 'Youtubedl-user-agent' in req.headers:
            req.headers.pop('User-agent', None)
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        original = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            payload = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(payload), mode='rb')
            try:
                decoded = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off.
                for trim in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(payload[:-trim]), mode='rb')
                        decoded = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(decoded, original.headers, original.url, original.code)
            resp.msg = original.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, original.headers, original.url, original.code)
            resp.msg = original.msg
        return resp

    https_request = http_request
    https_response = http_response
bf50b038 808
5de90176 809
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date

    date_str is '<date><delimiter><time>' optionally followed by 'Z' or a
    '+HH:MM' / '-HHMM' style UTC offset.  None is passed through.
    """
    if date_str is None:
        return None

    tz_match = re.search(
        r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
        date_str)
    utc_offset = datetime.timedelta()
    if tz_match:
        # Cut the timezone suffix off before handing the rest to strptime.
        date_str = date_str[:-len(tz_match.group(0))]
        if tz_match.group('sign'):
            direction = 1 if tz_match.group('sign') == '+' else -1
            utc_offset = datetime.timedelta(
                hours=direction * int(tz_match.group('hours')),
                minutes=direction * int(tz_match.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    local_dt = datetime.datetime.strptime(date_str, date_format)
    return calendar.timegm((local_dt - utc_offset).timetuple())
833
834
bf50b038
JMF
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str is None or matches no known format.
    """
    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    cleaned = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', cleaned)
    known_formats = (
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    )
    upload_date = None
    # Every format is tried; the last one that parses determines the result.
    for fmt in known_formats:
        try:
            upload_date = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue
    if upload_date is not None:
        return upload_date

    # Last resort: RFC 2822 style dates.
    timetuple = email.utils.parsedate_tz(cleaned)
    if timetuple:
        upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
880
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url.

    The guess is whatever follows the last '.' before any '?'.  It is
    returned only if purely alphanumeric; otherwise (or when url is None)
    default_ext is returned.
    """
    if url is None:
        return default_ext
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
73e79f2a 889
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '.<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
892
def date_from_str(date_str):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days.  Raises
    ValueError if the string matches neither form.
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword names (days=, weeks=).
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
918
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
927
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str.

        A missing bound defaults to the earliest / latest representable date.
        Raises ValueError if start lies after end.
        """
        self.start = (date_from_str(start) if start is not None
                      else datetime.datetime.min.date())
        self.end = (date_from_str(end) if end is not None
                    else datetime.datetime.max.date())
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Strings are accepted too and converted first.
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
c496ca96
PH
953
954
def platform_name():
    """ Return platform.platform() as a compat_str, decoding a bytes result
    with the preferred encoding """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
963
964
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> id handed to GetStdHandle (-11 and -12 are the
    # Win32 STD_OUTPUT_HANDLE and STD_ERROR_HANDLE values)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of units WriteConsoleW reports as written
    written = ctypes.wintypes.DWORD(0)

    # NOTE(review): the handle argument is declared as DWORD here while the
    # other prototypes use HANDLE - confirm this is safe on 64-bit Windows
    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for a missing/invalid handle, for a handle whose file type
        # (ignoring the REMOTE bit) is not CHAR, or when GetConsoleMode fails
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call, stopping before the next
        # character outside the BMP
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; 2 units are
        # requested for it (presumably its UTF-16 surrogate pair)
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Drop only what was reported as written; the loop retries the rest
            s = s[written.value:]
    return True
1035
1036
def write_string(s, out=None, encoding=None):
    """Write the compat_str s to out (sys.stderr by default) and flush.

    On win32, when no encoding is given and out has a fileno, the console
    writer is tried first.  Otherwise s is encoded (dropping unencodable
    characters) for byte-mode streams and on Python 2, written through
    out.buffer when present, or written as text.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    try_console = (
        sys.platform == 'win32' and encoding is None
        and hasattr(out, 'fileno'))
    if try_console and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr, hence the version check
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        codec = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(codec, 'ignore'))
    else:
        out.write(s)
    out.flush()
1057
1058
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Items are one-character strings (Python 2 str)
        return [ord(ch) for ch in bs]
    # Python 3: indexing bytes already yields ints
    return list(bs)
1066
c257baff 1067
def intlist_to_bytes(xs):
    """Convert a list of integer byte values into a byte string."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() already produces one-byte strings
    return ''.join([chr(value) for value in xs])
c38b1e77
PH
1075
1076
def get_cachedir(params={}):
    """Return params['cachedir'] when given, else the 'youtube-dl' directory
    under $XDG_CACHE_HOME (falling back to ~/.cache)."""
    xdg_root = os.environ.get(
        'XDG_CACHE_HOME', os.path.expanduser('~/.cache'))
    fallback = os.path.join(xdg_root, 'youtube-dl')
    return params.get('cachedir', fallback)
c1c9a79c
PH
1081
1082
1083# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx through ctypes on win32, fcntl.lockf
# everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # ctypes layout of the structure passed as the last argument of
        # LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count to lock, starting at offset 0
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raises OSError when LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so _unlock_file can pass the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 is the Win32 LOCKFILE_EXCLUSIVE_LOCK flag; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive (LOCK_EX) or shared (LOCK_SH) lock on f."""
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the lock held on f."""
        fcntl.lockf(f, fcntl.LOCK_UN)
1146
1147
class locked_file(object):
    """File wrapper that holds a lock on the file while used as a context
    manager: shared for mode 'r', exclusive for 'a' and 'w'."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        wants_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, wants_exclusive)
        except IOError:
            # Do not leak the open file when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # The file is closed even if unlocking raises
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1177
1178
def shell_quote(args):
    """Return the arguments joined into one shell-escaped command line.

    args is an iterable of text or bytes items; bytes are decoded with the
    filesystem encoding (UTF-8 when that is unknown) before quoting.
    """
    # The pipes module is gone from Python 3.13; shlex.quote is the
    # documented equivalent wherever it exists.
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
9d4660ca
PH
1190
1191
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yields the element that ended the
    run (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1199
1200
9d4660ca
PH
def smuggle_url(url, data):
    """ Append data, JSON-serialized and urlencoded, to url as a
    '#__youtubedl_smuggle=...' fragment for internal use. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
1207
1208
def unsmuggle_url(smug_url, default=None):
    """ Split a URL produced by smuggle_url into (url, data); a URL that
    carries no smuggled data comes back as (smug_url, default). """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, fragment = smug_url.rpartition(u'#')
        encoded = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
        return url, json.loads(encoded)
    return smug_url, default
02dbf93f
PH
1216
1217
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix, e.g. u'1.50KiB'.

    bytes may be a number, a numeric str, or None (formatted as u'N/A').
    Values beyond the largest unit are expressed in YiB, and negative values
    keep their sign.
    """
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    magnitude = abs(bytes)
    if magnitude < 1.0:
        # Covers 0 (log undefined) and fractions, whose negative exponent
        # would otherwise index the suffix list from the end
        exponent = 0
    else:
        # Clamp so huge values use the last suffix instead of raising IndexError
        exponent = min(int(math.log(magnitude, 1024.0)), len(suffixes) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
f53c966a 1230
1c088fa8 1231
1c088fa8
PH
def get_term_width():
    """Return the terminal width in columns, or None when it is unknown.

    The COLUMNS environment variable is used when it holds an integer;
    otherwise the second field of `stty size` output is used.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Unusable COLUMNS value; fall back to asking stty

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except (OSError, ValueError, IndexError):
        # stty missing, not attached to a terminal, or unexpected output.
        # Deliberately not a bare except, so KeyboardInterrupt/SystemExit
        # still propagate.
        return None
caefb1de
PH
1246
1247
def month_by_name(name):
    """ Return the 1-based number of the month with the given full English
    name (independent of the locale), or None for any other name """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name in ENGLISH_NAMES:
        return ENGLISH_NAMES.index(name) + 1
    return None
18258362
JMF
1258
1259
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an entity by '&amp;'.

    The five predefined XML entities and numeric character references
    (decimal or hexadecimal, including code points above U+FFFF) are left
    untouched.
    """
    # Code points go up to 0x10FFFF: at most 6 hex or 7 decimal digits, and
    # at least one digit is required.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{1,6};|#[0-9]{1,7};)',
        u'&amp;',
        xml_str)
e3946f98
PH
1266
1267
def setproctitle(title):
    """Set the process name through libc's prctl; a silent no-op when
    libc.so.6 cannot be loaded or exposes no prctl.

    title must be a compat_str; it is passed on encoded as UTF-8.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Initialising the buffer from the bytes makes it one byte longer than
    # the title, so the string handed to prctl is NUL-terminated.
    buf = ctypes.create_string_buffer(title.encode('utf-8'))
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME in linux/prctl.h
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1281
1282
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged when it
    does not begin with start."""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1287
1288
2b9faf55
PH
def remove_end(s, end):
    """Return s without the suffix end; s is returned unchanged when it
    does not finish with end (or when end is empty)."""
    # An empty suffix must be skipped explicitly: s[:-0] is s[:0], i.e. ''
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1293
1294
def url_basename(url):
    """Return the last '/'-separated segment of the URL's path, ignoring
    leading and trailing slashes."""
    segments = compat_urlparse.urlparse(url).path.strip(u'/').split(u'/')
    return segments[-1]
aa94a6d3
PH
1298
1299
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
1303
1304
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Return int(v) * invscale // scale, or default when v is None or ''.

    With get_attr set, the attribute of that name is read from v first (a
    missing attribute counts as None).
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    return int(v) * invscale // scale
1312
9572013d 1313
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1316
9732d77e
PH
1317
def str_to_int(int_str):
    """Parse an integer written with ',' or '.' separators (all of which are
    dropped); None is passed through."""
    if int_str is None:
        return None
    digits = re.sub(r'[,\.]', u'', int_str)
    return int(digits)
608d11f5
PH
1323
1324
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1327
1328
608d11f5
PH
def parse_duration(s):
    """Convert a duration such as '1:02:03', '2h3m4s' or '90' into a number
    of seconds; returns None for None or for text that does not match."""
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
    if not m:
        return None
    hours, mins, secs = m.group('hours', 'mins', 'secs')
    total = int(secs)
    if mins:
        total += int(mins) * 60
    if hours:
        total += int(hours) * 60 * 60
    return total
91d7d0b3
JMF
1343
1344
def prepend_extension(filename, ext):
    """Insert ext in front of filename's own extension:
    ('abc.ext', 'temp') gives 'abc.temp.ext'."""
    stem, suffix = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(stem, ext, suffix)
d70ad093
PH
1348
1349
def check_executable(exe, args=[]):
    """ Try to run the given binary and return its name, or False when it
    cannot be started.
    args can be a list of arguments for a short output (like -version) """
    cmd = [exe] + args
    try:
        proc = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1358
1359
class PagedList(object):
    """List-like view over results that are fetched one page at a time.

    pagefunc(pagenum) returns an iterable with the entries of the given
    (0-based) page; pagesize is the number of entries on a full page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the entries from index start up to end as a list,
        fetching only the pages needed (all remaining ones when end is None)."""
        res = []
        # Begin with the page that contains index `start`
        for pagenum in itertools.count(start // self._pagesize):
            # Global indices of this page's first entry and of the next page's
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of `start` inside this page (0 on every later page)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset of `end` inside this page when the slice stops here
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1405
1406
def uppercase_escape(s):
    r"""Replace every \UXXXXXXXX escape (8 hex digits) in s by the character
    it denotes; the rest of s is left as it is."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(m):
        return decode(m.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
b53466e1
PH
1413
# struct_pack/struct_unpack accept a text format spec on every Python version
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        bspec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(bspec, *args)

    def struct_unpack(spec, *args):
        bspec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(bspec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1430
1431
def read_batch_urls(batch_fd):
    """Read the URLs listed in a batch file object, then close it.

    Every line is decoded from UTF-8 when it is bytes, stripped of a leading
    byte order mark and of surrounding whitespace; empty lines and lines
    starting with '#', ';' or ']' are skipped.
    """
    # A correctly decoded BOM is u'\ufeff'; u'\xef\xbb\xbf' is what its three
    # UTF-8 bytes look like after a latin-1 style decoding.
    BOMS = (u'\xef\xbb\xbf', u'\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1446
1447
def urlencode_postdata(*args, **kargs):
    """urlencode the given arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1450
1451
def parse_xml(s):
    """Parse the XML document in the text s and return its root element;
    doctype declarations are ignored."""
    etree = xml.etree.ElementTree

    class TreeBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=TreeBuilder())
    # The custom parser is only passed on from Python 2.7 onwards
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    return etree.XML(s.encode('utf-8'), **kwargs)
e68301af
PH
1460
1461
# getpass.getpass, except on Python 2 for win32 where a text prompt is first
# encoded with the preferred encoding
if sys.version_info < (3, 0) and sys.platform == 'win32':
    def compat_getpass(prompt, *args, **kwargs):
        encoded_prompt = (
            prompt.encode(preferredencoding())
            if isinstance(prompt, compat_str) else prompt)
        return getpass.getpass(encoded_prompt, *args, **kwargs)
else:
    compat_getpass = getpass.getpass
a1a530b0
PH
1469
1470
# US content rating label -> number (presumably the minimum viewer age the
# rating corresponds to - confirm against the code that reads this table)
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
1478
1479
def strip_jsonp(code):
    """Strip a JSONP wrapper of the form 'callback( ... );' and return what
    is between the parentheses; other input is returned unchanged."""
    wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(wrapper, r'\1', code)
478c2c61
PH
1482
1483
e05f6939
PH
def js_to_json(code):
    """Rewrite a JavaScript object literal towards JSON.

    For each 'key: value' pair that follows '{' or ',', a bare or
    single-quoted key becomes double-quoted and a single-quoted value becomes
    double-quoted.  A comma directly before a closing ']' is removed.
    """
    def requote(token):
        # 'text' -> "text"; any other token is returned as it is
        if not token.startswith("'"):
            return token
        assert token.endswith("'")
        assert '"' not in token
        return '"%s"' % token[1:-1]

    def fix_kv(m):
        lead, key, colon, value = m.groups()
        key = requote(key)
        if not key.startswith('"'):
            # Bare identifier or number used as key
            key = '"%s"' % key
        return lead + key + colon + requote(value)

    res = re.sub(r'''(?x)
            ([{,]\s*)
            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
            (:\s*)
            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), res)
1510
1511
478c2c61
PH
def qualities(quality_ids):
    """ Return a function mapping a quality id to its position in
    quality_ids, or to -1 when the id is not listed """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1520
acd69589
PH
1521
# Default output file name template, filled in with %-style named fields
# (title, id, ext)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68
PH
1523
1524try:
1525 subprocess_check_output = subprocess.check_output
1526except AttributeError:
1527 def subprocess_check_output(*args, **kwargs):
1528 assert 'input' not in kwargs
1529 p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
1530 output, _ = p.communicate()
1531 ret = p.poll()
1532 if ret:
1533 raise subprocess.CalledProcessError(ret, p.args, output=output)
1534 return output