]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[utils] Improve and test js_to_json
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
912b38b4 4import calendar
676eb3f2 5import codecs
62e609ab 6import contextlib
e3946f98 7import ctypes
c496ca96
PH
8import datetime
9import email.utils
f45c185f 10import errno
e68301af 11import getpass
d77c3dfd 12import gzip
b7ab0590 13import itertools
03f9daab 14import io
f4bfd65f 15import json
d77c3dfd 16import locale
02dbf93f 17import math
d77c3dfd 18import os
4eb7f1d1 19import pipes
c496ca96 20import platform
d77c3dfd 21import re
13ebea79 22import ssl
c496ca96 23import socket
b53466e1 24import struct
1c088fa8 25import subprocess
d77c3dfd 26import sys
181c8655 27import tempfile
01951dda 28import traceback
bcf89ce6 29import xml.etree.ElementTree
d77c3dfd 30import zlib
d77c3dfd 31
01ba00ca 32try:
59ae15a5 33 import urllib.request as compat_urllib_request
01ba00ca 34except ImportError: # Python 2
59ae15a5 35 import urllib2 as compat_urllib_request
01ba00ca
PH
36
37try:
59ae15a5 38 import urllib.error as compat_urllib_error
01ba00ca 39except ImportError: # Python 2
59ae15a5 40 import urllib2 as compat_urllib_error
01ba00ca
PH
41
42try:
59ae15a5 43 import urllib.parse as compat_urllib_parse
01ba00ca 44except ImportError: # Python 2
59ae15a5 45 import urllib as compat_urllib_parse
01ba00ca 46
799c0763
PH
47try:
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
51
6543f0dc
JMF
52try:
53 import urllib.parse as compat_urlparse
54except ImportError: # Python 2
55 import urlparse as compat_urlparse
56
01ba00ca 57try:
59ae15a5 58 import http.cookiejar as compat_cookiejar
01ba00ca 59except ImportError: # Python 2
59ae15a5 60 import cookielib as compat_cookiejar
01ba00ca 61
3e669f36 62try:
59ae15a5 63 import html.entities as compat_html_entities
9f37a959 64except ImportError: # Python 2
59ae15a5 65 import htmlentitydefs as compat_html_entities
3e669f36 66
a8156c1d 67try:
59ae15a5 68 import html.parser as compat_html_parser
9f37a959 69except ImportError: # Python 2
59ae15a5 70 import HTMLParser as compat_html_parser
a8156c1d 71
348d0a7a 72try:
59ae15a5 73 import http.client as compat_http_client
9f37a959 74except ImportError: # Python 2
59ae15a5 75 import httplib as compat_http_client
348d0a7a 76
2eabb802 77try:
0e283428 78 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
79except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
81
e0df6211
PH
82try:
83 from urllib.request import urlretrieve as compat_urlretrieve
84except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
86
87
5910e210
PH
# Uniform accessor for a "discard everything" target usable with subprocess calls.
try:
    from subprocess import DEVNULL
except ImportError:
    # subprocess has no DEVNULL here: hand out a fresh writable handle on
    # the platform's null device on every call.
    def compat_subprocess_get_DEVNULL():
        return open(os.path.devnull, 'w')
else:
    def compat_subprocess_get_DEVNULL():
        return DEVNULL
93
try:
    from urllib.parse import unquote as compat_urllib_parse_unquote
except ImportError:
    def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
        """Fallback used when urllib.parse.unquote cannot be imported.

        Replaces %xx escapes in `string` by the characters they encode.
        Runs of consecutive escapes are collected as bytes and decoded
        together with `encoding`/`errors` (None selects 'utf-8'/'replace').
        A '%' that is not followed by anything is kept literally.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' at all: nothing to unquote
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # NOTE(review): presumably .decode('hex') may raise something
                # other than ValueError for malformed hex digits on Python 2;
                # confirm that such input is really handled by the except below.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string
131
f1f725c6
PH
132
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Split the query string `qs` into a list of (name, value) pairs.

        Fields are separated by '&' or ';'. '+' is turned into a space and
        %xx escapes are resolved through compat_urllib_parse_unquote.
        Fields with an empty value are dropped unless keep_blank_values is
        set; with strict_parsing a field without '=' raises ValueError.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = compat_urllib_parse_unquote(
                    name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = compat_urllib_parse_unquote(
                    value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse the query string `qs` into a dict mapping each name to the
        list of its values, in order of appearance."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
348d0a7a 179
# compat_str: the text string type (unicode where that name exists, else str).
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

# compat_chr: code point -> one-character text string (unichr where that
# name exists, else chr).
try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr
3e669f36 189
f7300c5c
JMF
190try:
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
194
8d31fa3c
PH
try:
    from shlex import quote as shlex_quote
except ImportError:  # Python < 3.3
    def shlex_quote(s):
        """Wrap s in single quotes, escaping any single quote it contains."""
        # A single quote cannot appear inside a single-quoted shell word, so
        # close the quoting, emit "'" in double quotes, and reopen it.
        escaped = s.replace("'", "'\"'\"'")
        return "'" + escaped + "'"
200
201
b31756c1
FV
def compat_ord(c):
    """Return c unchanged if it is exactly an int, otherwise ord(c)."""
    return c if type(c) is int else ord(c)
205
468e2e92
FV
# The type of a compiled regular expression object, obtained by compiling an
# empty pattern (this is not clearly defined otherwise).
compiled_regex_type = type(re.compile(''))
208
# Default HTTP headers. YoutubeDLHandler.http_request adds each of these to
# every outgoing request that does not already carry that header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 216
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or the reported encoding cannot actually be used to
    encode text, 'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported codec really exists and is usable
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (unknown codec, broken locale, ...), but no
        # longer a bare except, so KeyboardInterrupt/SystemExit propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 230
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string s."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print s encoded with the preferred encoding; characters that
        cannot be encoded are written as XML character references."""
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
d77c3dfd 238
f4bfd65f 239
181c8655
PH
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically

    The data is first written to a temporary file created next to fn
    (named "<basename of fn>.<random>.tmp") and then renamed over fn.
    If anything fails, the temporary file is removed and the error is
    re-raised.
    """
    tmp_kwargs = {
        'suffix': '.tmp',
        'prefix': os.path.basename(fn) + '.',
        'dir': os.path.dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        tmp_kwargs['mode'] = 'wb'
    else:
        tmp_kwargs['mode'] = 'w'
        tmp_kwargs['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**tmp_kwargs)

    try:
        with tf:
            json.dump(obj, tf)
        os.rename(tf.name, fn)
    except:
        # Clean up on any failure (the exception is always re-raised below)
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
272
273
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        candidates = (el for el in node.findall(xpath)
                      if el.attrib.get(key) == val)
        return next(candidates, None)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are pasted into the expression, so restrict them to
        # characters that cannot break out of the predicate
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        return node.find(xpath + u"[@%s='%s']" % (key, val))
292
d7e66d39
JMF
293# On python2.6 the xml.etree.ElementTree.Element methods don't support
294# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand every "prefix:tag" step of path to "{namespace}tag".

    ns_map maps prefixes to namespace names; steps without a prefix are
    left untouched.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        ns, tag = pieces
        return '{%s}%s' % (ns_map[ns], tag)

    return '/'.join([expand(step) for step in path.split('/')])
305
d77c3dfd 306
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element below node matching xpath.

    If nothing matches, return None, or raise ExtractorError when fatal
    is set (name, defaulting to the xpath, is used in the message).
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None:
        return found.text
    if not fatal:
        return None
    if name is None:
        name = xpath
    raise ExtractorError('Could not find XML element %s' % name)
319
320
# Replace the module-level `locatestarttagend` pattern of the HTML parser
# module with a fixed version (backport of a bugfix).
# NOTE(review): presumably HTMLParser consults this pattern to find where a
# start tag ends - confirm against the stdlib version in use.
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that keeps a reference to the document it was asked to parse."""

    def __init__(self):
        # This method used to be misspelled "__init", so it was never run
        # as the constructor and self.html stayed undefined until loads().
        compat_html_parser.HTMLParser.__init__(self)
        # The complete document passed to loads(), or None before that
        self.html = None

    def loads(self, html):
        """Remember html, parse all of it and finish the parser."""
        self.html = html
        self.feed(html)
        self.close()
331
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start position, end position] once the element has
        # been seen completely; positions are the tuples returned by getpos()
        self.result = None
        # True while parsing inside the matched element
        self.started = False
        # Number of currently open tags per tag name, counted from the
        # matched element on
        self.depth = {}
        # True right after the matching start tag: the next parser event
        # records the start position of the content
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        # Tolerate a limited number of parse errors before the element has
        # been found; give up otherwise
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The matched element is closed once the count for its own tag
            # name drops back to zero
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other kind of parser event may be the first one inside the
    # matched element, so they all record the start position
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None unless both positions were recorded."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Line numbers in the positions are 1-based
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end on the same line: the end offset has to be
            # shifted by what was just cut off at the front
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
3b024e17
PH
# Hack for https://github.com/rg3/youtube-dl/issues/662
# On interpreters older than 2.7.3, when the data at position i starts with
# the literal text </scr'+'ipt>, skip over it (return the index right after
# it) instead of handing it to the regular end tag parsing.
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
9e6dd238
FV
397
def get_element_by_id(id, html):
    """Return the content of the element of the HTML document html whose
    "id" attribute equals id (see get_element_by_attribute)."""
    attribute = "id"
    return get_element_by_attribute(attribute, id, html)
401
def get_element_by_attribute(attribute, value, html):
    """Return the content of the element of the HTML document html whose
    attribute `attribute` equals value, as isolated by AttrParser."""
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was collected before the parse error
        pass
    return finder.get_result()
9e6dd238 410
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    HTMLParser variant that picks out the "content" attribute of the meta
    tag whose "name" attribute has the requested value.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attr_map = dict(attrs)
            if attr_map.get('name') == self.name:
                # A later matching tag overrides an earlier one
                self.result = attr_map.get('content')

    def get_result(self):
        """Return the content found so far (None if there was no match)."""
        return self.result
431
def get_meta_content(name, html):
    """
    Return the "content" attribute of the meta tag in html whose "name"
    attribute equals name, as found by MetaParser.
    """
    meta_finder = MetaParser(name)
    try:
        meta_finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was found before the parse error
        pass
    return meta_finder.get_result()
442
9e6dd238
FV
443
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newlines in the markup are just whitespace ...
    text = html.replace('\n', ' ')
    # ... real line breaks come from <br> and from paragraph boundaries
    for line_break in (r'\s*<\s*br\s*/?\s*>\s*', r'<\s*/\s*p\s*>\s*<\s*p[^>]*>'):
        text = re.sub(line_break, '\n', text)
    # Strip html tags
    text = re.sub('<.*?>', '', text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
455
456
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it retries with
    the characters that are forbidden on win32 replaced by '#' in the
    last path component. If that fails as well (or there was nothing to
    replace), the exception is raised, like the standard open() function
    would.

    The special filename u'-' selects standard output (its binary buffer
    where available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be cured by renaming
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # Only the last path component is rewritten: the directory part
        # legitimately contains path separators (and possibly a drive colon).
        # (This used to pass a generator to os.path.join(), which raises
        # TypeError, and then re-opened the unmodified filename.)
        head, tail = os.path.split(filename)
        alt_tail = re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', tail)
        if alt_tail == tail:
            raise
        alt_filename = os.path.join(head, alt_tail)
        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
d77c3dfd
FV
490
491
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None if the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 499
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped entirely
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            # Shell metacharacters, whitespace and non-ASCII are replaced too
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    result = u''.join([replace_insane(ch) for ch in s])
    if is_id:
        # IDs are kept as close to the original as possible
        return result

    # Collapse runs of underscores and trim them at both ends
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if not result:
        result = '_'
    return result
d77c3dfd
FV
531
def orderedSet(iterable):
    """ Return a list of the elements of iterable without duplicates,
    keeping the order of first appearance """
    unique = []
    for item in iterable:
        # List membership (==) is used on purpose: elements need not be hashable
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 539
912b38b4 540
4e408e47
PH
541def _htmlentity_transform(entity):
542 """Transforms an HTML entity to a character."""
543 # Known non-numeric HTML entity
544 if entity in compat_html_entities.name2codepoint:
545 return compat_chr(compat_html_entities.name2codepoint[entity])
546
547 mobj = re.match(r'#(x?[0-9]+)', entity)
548 if mobj is not None:
549 numstr = mobj.group(1)
550 if numstr.startswith(u'x'):
551 base = 16
552 numstr = u'0%s' % numstr
553 else:
554 base = 10
555 return compat_chr(int(numstr, base))
556
557 # Unknown entity in name, return its literal representation
558 return (u'&%s;' % entity)
559
560
def unescapeHTML(s):
    """Replace every "&...;" entity in the text string s by the result of
    _htmlentity_transform. None is passed through unchanged."""
    if s is None:
        return None
    assert type(s) == compat_str

    def _replace(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', _replace, s)
d77c3dfd 568
8bf48f23
PH
569
def encodeFilename(s, for_subprocess=False):
    """
    Return the file name s in the form expected by the file APIs of the
    running interpreter (or by subprocess calls if for_subprocess is set).

    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up offer Unicode APIs
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        # Pass u'' directly to use the Unicode APIs
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
596
f07b74fc
PH
597
def encodeArgument(s):
    """Encode s for use as a subprocess argument (encodeFilename with
    for_subprocess set); byte strings are taken to be ASCII."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
605
606
8271226a
PH
def decodeOption(optval):
    """Return the option value optval as a text string.

    None is passed through; byte strings are decoded with the preferred
    encoding.
    """
    if optval is None:
        return optval
    decoded = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 615
4539dd30
PH
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that fits is used: hours appear from 3600 seconds
    on, minutes from 60 seconds on.
    """
    # >= rather than >: exactly one hour used to come out as '60:00' and
    # exactly one minute as '60'.
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
623
a0ddb8a2
PH
624
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Create the HTTPS handler to install in the URL opener.

    opts_no_check_certificate disables verification of the server
    certificate (not applicable to the Python < 3.2 branch, which never
    verifies). All keyword arguments are passed on to the handler class.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Prefer TLSv1, fall back to protocol negotiation
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
        # We are the client and authenticate the *server*, which is what
        # Purpose.SERVER_AUTH selects. Purpose.CLIENT_AUTH builds a context
        # for server-side sockets that does not verify the peer at all.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
        if opts_no_check_certificate:
            # Hostname checking has to be switched off first; CERT_NONE is
            # rejected while it is enabled
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
ea6d901e 664
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        video_id, if given, is put in front of the message; cause, if given, is appended to it.
        """

        # Network problems and unavailable videos currently being handled
        # are never bugs in youtube-dl
        active_exc_type = sys.exc_info()[0]
        if active_exc_type in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += u' (caused by %r)' % cause
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback as one string, or None if there is none."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
691
1c256f70 692
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """ExtractorError for a regular expression that did not match"""
696
697
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception if they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept so that the original problem can still be inspected
        self.exc_info = exc_info
d77c3dfd
FV
709
710
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
718
719
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # The error description, available to handlers as .msg
        self.msg = msg
d77c3dfd
FV
728
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
d77c3dfd
FV
732
733
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
741
742
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Number of bytes received and number of bytes announced
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 757
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    In the same way, the header "Youtubedl-user-agent" overrides the
    User-Agent of a single request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream (no zlib header)
        first and a regular zlib stream second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code, whether
        or not the constructor accepts the code as an argument."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to req and apply the Youtubedl-*
        pseudo headers. Returns the modified request."""
        for h, v in std_headers.items():
            if h not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp, replaced by a response over the decompressed body
        if its Content-encoding is gzip or deflate."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; if none of the
                # attempts works, report the original error
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets exactly the same treatment
    https_request = http_request
    https_response = http_response
bf50b038 843
5de90176 844
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date

    date_str has the form YYYY-MM-DD<delimiter>HH:MM:SS, optionally
    followed by "Z" or by a UTC offset such as "+01:00" or "-0530".
    None is passed through.
    """

    if date_str is None:
        return None

    tz_match = re.search(
        r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
        date_str)
    # No suffix and "Z" both mean: no offset
    offset = datetime.timedelta()
    if tz_match:
        date_str = date_str[:-len(tz_match.group(0))]
        if tz_match.group('sign'):
            direction = 1 if tz_match.group('sign') == '+' else -1
            offset = datetime.timedelta(
                hours=direction * int(tz_match.group('hours')),
                minutes=direction * int(tz_match.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    local_time = datetime.datetime.strptime(date_str, date_format)
    return calendar.timegm((local_time - offset).timetuple())
868
869
bf50b038
JMF
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    Returns None if date_str is None or is not understood.
    """

    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    format_expressions = (
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%Y/%m/%d %H:%M:%S',
        '%d/%m/%Y %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    )
    upload_date = None
    # All formats are tried; if several fit, the last one wins
    for expression in format_expressions:
        try:
            formatted = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            continue
        upload_date = formatted
    if upload_date is None:
        # Last resort: the email date parser
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
916
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url: whatever follows the last dot
    before the query string, provided it is purely alphanumeric.
    Otherwise (or if url is None) default_ext is returned."""
    if url is None:
        return default_ext
    without_query = url.partition(u'?')[0]
    guess = without_query.rpartition(u'.')[2]
    return guess if re.match(r'^[A-Za-z0-9]+$', guess) else default_ext
73e79f2a 925
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by
    "<sub_lang>.<sub_format>"."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
928
def date_from_str(date_str):
    """
    Return a date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months count as 30 days and years as 365 days."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today

    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        # Not a relative date: it has to be an absolute one
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()

    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # A bad approximation?
    days_per_unit = {'month': 30, 'year': 365}
    if unit in days_per_unit:
        amount *= days_per_unit[unit]
        unit = 'day'
    # timedelta takes the plural form as keyword
    return today + datetime.timedelta(**{unit + 's': amount})
954
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.
    Any string that is not exactly eight digits is returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    year, month, day = parts.groups()
    return '-'.join((year, month, day))
963
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates (both ends inclusive)"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; None leaves that side of the range unbounded"""
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        candidate = (
            date if isinstance(date, datetime.date)
            else date_from_str(date))
        return self.start <= candidate <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
c496ca96
PH
989
990
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Byte strings are decoded with the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
999
1000
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    s is the text to write; out is the target stream. Only streams whose
    fileno() is 1 or 2 and whose handle passes the console checks below are
    written here. Raises OSError when WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps file descriptors 1 and 2 to the ids passed to GetStdHandle
    # (presumably STD_OUTPUT_HANDLE / STD_ERROR_HANDLE -- confirm against
    # the Win32 documentation)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    # Receives the count reported back by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for a missing/invalid handle, a handle whose file type
        # (ignoring the REMOTE bit) is not CHAR, or one for which
        # GetConsoleMode fails
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call, stopping before the next
        # non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character; a length of 2
        # is passed for it instead
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1071
1072
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On win32, when no encoding is given and the stream has a fileno, the
    console writer is tried first. Otherwise the text is encoded and written
    as bytes for binary streams (and always on Python 2), written to the
    underlying buffer when the stream has one, or written as text.
    Characters that cannot be encoded are dropped.
    """
    stream = sys.stderr if out is None else out
    assert type(s) == compat_str

    try_console = (
        sys.platform == 'win32' and encoding is None
        and hasattr(stream, 'fileno'))
    if try_console and _windows_write_string(s, stream):
        return

    if 'b' in getattr(stream, 'mode', '') or sys.version_info[0] < 3:
        # Python 2 lies about mode of sys.stderr
        stream.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(stream, 'buffer'):
        target_encoding = (
            encoding or getattr(stream, 'encoding', None)
            or preferredencoding())
        stream.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        stream.write(s)
    stream.flush()
1093
1094
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    # Indexing yields ints on Python 3 but one-character strings on Python 2
    if not isinstance(bs[0], int):
        return [ord(ch) for ch in bs]
    return list(bs)
1102
c257baff 1103
def intlist_to_bytes(xs):
    """Convert a list of integer byte values into a byte string."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() already produces byte strings
    return ''.join(map(chr, xs))
c38b1e77
PH
1111
1112
c1c9a79c
PH
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f) for the current
# platform: LockFileEx/UnlockFileEx through ctypes on win32, fcntl.flock
# everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # ctypes layout of the structure handed to LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low/high halves of the byte count passed to both calls, sized to
    # cover the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raises OSError when LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can pass the same
        # structure back
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # Flag 0x2 is passed for an exclusive lock, 0x0 otherwise
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release a lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock the open file f with flock (LOCK_EX if exclusive, else LOCK_SH)."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock lock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
1176
1177
class locked_file(object):
    """File wrapper that holds a lock on the file while used as a context
    manager. The file is opened on construction and closed on exit."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ('r', 'a', 'w')
        self.mode = mode
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        # Only reading ('r') uses a non-exclusive lock
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # The file is closed even when unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        """Forward to the underlying file's write()."""
        return self.f.write(*args)

    def read(self, *args):
        """Forward to the underlying file's read()."""
        return self.f.read(*args)
4eb7f1d1
JMF
1207
1208
def shell_quote(args):
    """Return the arguments quoted for a shell and joined with spaces.

    Byte-string arguments are decoded with the filesystem encoding
    (utf-8 when that is unknown) before quoting.
    """
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(fs_encoding) if isinstance(arg, bytes) else arg

    return u' '.join([pipes.quote(as_text(arg)) for arg in args])
9d4660ca
PH
1220
1221
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yields the element that ends the
    run (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1229
1230
9d4660ca
PH
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialized as JSON and appended to url as a URL-encoded
    '__youtubedl_smuggle' fragment. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
1237
1238
def unsmuggle_url(smug_url, default=None):
    """Split a URL produced by smuggle_url into (url, data).

    URLs without smuggled data are returned as (smug_url, default).
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition(u'#')
    payload = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return url, json.loads(payload)
02dbf93f
PH
1246
1247
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as a string with a binary (1024-based) unit.

    bytes may be a number, a numeric string, or None. None yields u'N/A';
    everything else yields two decimals followed by a suffix from B to YiB,
    e.g. u'1.00KiB'.
    """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the available suffixes: tiny values used to produce a
        # negative index (wrapping around to 'YiB') and huge values an
        # IndexError
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
f53c966a 1260
1c088fa8 1261
1c088fa8
PH
def get_term_width():
    """Return the terminal width in columns, or None if it cannot be found.

    A non-empty COLUMNS environment variable takes precedence; otherwise the
    second field printed by `stty size` is used.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, no tty, unexpected output), but do
        # not swallow KeyboardInterrupt/SystemExit like a bare except would
        pass
    return None
caefb1de
PH
1276
1277
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name,
    or None when name is not a full English month name """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    for number, english_name in enumerate(ENGLISH_NAMES, 1):
        if english_name == name:
            return number
    return None
18258362
JMF
1288
1289
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML, leaving alone those that
    already start one of the predefined entities or a numeric character
    reference"""
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub(u'&amp;', xml_str)
e3946f98
PH
1296
1297
def setproctitle(title):
    """Try to set the process title through libc's prctl.

    title must be a compat_str. Returns None in every case; nothing is done
    when libc.so.6 cannot be loaded or has no prctl symbol.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        # No libc.so.6 on this system: skip silently
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # NOTE(review): option 15 is presumably PR_SET_NAME -- confirm
        # against prctl(2)
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1311
1312
def remove_start(s, start):
    """Return s without the prefix start; s is returned unchanged when it
    does not begin with start."""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1317
1318
2b9faf55
PH
def remove_end(s, end):
    """Return s without the suffix end; s is returned unchanged when it
    does not finish with end (or when end is empty)."""
    # An empty suffix must be special-cased: s[:-0] is s[:0], i.e. ''
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1323
1324
def url_basename(url):
    """Return the last segment of the URL's path, ignoring leading and
    trailing slashes (u'' when the path is empty)."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip(u'/')
    return trimmed_path.rpartition(u'/')[2]
aa94a6d3
PH
1328
1329
class HEADRequest(compat_urllib_request.Request):
    """Request subclass whose get_method() reports HEAD."""

    def get_method(self):
        """Return the HTTP method name for this request."""
        return "HEAD"
7217e148
PH
1333
1334
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, or return default when there is no value.

    When get_attr is given, the value is read from that attribute of v
    (a missing attribute counts as no value). None and the empty string
    yield default; anything else yields int(v) * invscale // scale.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1342
9572013d 1343
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1346
9732d77e
PH
1347
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    are removed before converting; None is passed through """
    if int_str is None:
        return None
    return int(re.sub(r'[,\.\+]', u'', int_str))
608d11f5
PH
1354
1355
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1358
1359
608d11f5
PH
def parse_duration(s):
    """Parse a duration string such as '1:23:45', '3 min 5 s' or '12.5s'
    into a number of seconds.

    Returns None when s is None or is not recognized. The result is an int
    unless the seconds carry a fractional part.
    """
    if s is None:
        return None

    m = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$',
        s.strip())
    if m is None:
        return None

    total = int(m.group('secs'))
    for unit, seconds_per_unit in (('mins', 60), ('hours', 60 * 60)):
        amount = m.group(unit)
        if amount:
            total += int(amount) * seconds_per_unit
    fraction = m.group('ms')
    if fraction:
        total += float(fraction)
    return total
91d7d0b3
JMF
1378
1379
def prepend_extension(filename, ext):
    """Insert ext in front of the existing extension of filename,
    e.g. ('abc.ext', 'temp') -> 'abc.temp.ext'."""
    base, original_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(base, ext, original_ext)
d70ad093
PH
1383
1384
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False when the binary cannot be started. """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    else:
        return exe
b7ab0590
PH
1393
1394
class PagedList(object):
    """Base class for paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1399
9c44d242
PH
1400
class OnDemandPagedList(PagedList):
    """Paged list that calls pagefunc(pagenum) only for the pages needed by
    the requested slice."""

    def __init__(self, pagefunc, pagesize):
        # pagefunc(pagenum) returns an iterable with the entries of that page
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries from index start up to (not including) end as a
        list; with end=None pages are fetched until a short page is seen."""
        res = []
        # Begin at the page that contains index start
        for pagenum in itertools.count(start // self._pagesize):
            # Index range [firstid, nextfirstid) covered by this page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset inside this page at which the slice starts
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset inside this page at which the slice ends, or None when
            # the end lies beyond this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1442
1443
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list for sources whose number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries from index start up to (not including) end as a
        list, fetching only the pages that can contain them."""
        collected = []
        first_page = start // self._pagesize
        if end is None:
            last_page = self._pagecount
            remaining = None
        else:
            last_page = end // self._pagesize + 1
            remaining = end - start
        # Entries to drop from the front of the first fetched page
        to_skip = start - first_page * self._pagesize
        for page_index in range(first_page, last_page):
            entries = list(self._pagefunc(page_index))
            if to_skip:
                entries = entries[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(entries) >= remaining:
                    # This page completes the requested slice
                    collected.extend(entries[:remaining])
                    break
                remaining -= len(entries)
            collected.extend(entries)
        return collected
1471
1472
def uppercase_escape(s):
    """Replace every literal \\UXXXXXXXX escape (8 hex digits) in s with the
    character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
b53466e1 1479
d05cfe06
S
1480
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Characters passed to quote() as safe, i.e. left unescaped
    keep_unescaped = "%/;:@&=+$,!~*'()?#[]"
    if sys.version_info < (3, 0) and isinstance(s, unicode):
        # Python 2: unicode strings are encoded to UTF-8 before quoting
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, keep_unescaped)
d05cfe06
S
1486
1487
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # Only these components are escaped; the others are kept as parsed
    escaped = dict(
        (field, escape_rfc3986(getattr(parts, field)))
        for field in ('path', 'params', 'query', 'fragment'))
    return parts._replace(**escaped).geturl()
1497
b53466e1
PH
# struct_pack / struct_unpack: versions of struct.pack / struct.unpack that
# accept a text format spec. The probe below decides whether wrappers are
# needed on the running interpreter.
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """struct.pack with a text spec encoded to ASCII first."""
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        """struct.unpack with a text spec encoded to ASCII first."""
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Text specs work natively: use the library functions directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1514
1515
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file and close it.

    batch_fd is an iterable of lines (text or UTF-8 encoded bytes) with a
    close() method. Surrounding whitespace and a leading byte order mark
    are removed; blank lines and lines starting with '#', ';' or ']' are
    skipped. Returns the list of remaining URLs.
    """
    # A UTF-8 BOM reads as u'\ufeff' once decoded as UTF-8, and as
    # u'\xef\xbb\xbf' when the bytes were decoded with a single-byte codec
    BOM_MARKERS = (u'\ufeff', u'\xef\xbb\xbf')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOM_MARKERS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1530
1531
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode() and return the result as
    ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1534
1535
0990305d
PH
# etree_iter(node): iterate over the elements of a tree
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1540
1541
bcf89ce6
PH
def parse_xml(s):
    """Parse the XML text s, ignoring any doctype, and return the root
    element. On Python 2, byte-string element texts are decoded to text."""
    class DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=DoctypeIgnoringBuilder())
    extra = {}
    if sys.version_info >= (2, 7):
        extra['parser'] = parser
    root = xml.etree.ElementTree.XML(s.encode('utf-8'), **extra)
    if sys.version_info >= (3, 0):
        return root
    # Fix up XML parser in Python 2.x
    for node in etree_iter(root):
        text = node.text
        if text is not None and not isinstance(text, compat_str):
            node.text = text.decode('utf-8')
    return root
e68301af
PH
1557
1558
# compat_getpass: getpass.getpass, wrapped on Python 2 under win32 so that a
# text prompt is encoded with the preferred encoding before being passed on.
if sys.version_info < (3, 0) and sys.platform == 'win32':
    def compat_getpass(prompt, *args, **kwargs):
        """Call getpass.getpass after encoding a compat_str prompt."""
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
else:
    compat_getpass = getpass.getpass
a1a530b0
PH
1566
1567
# Maps US rating labels to integers
# (presumably the minimum viewer age in years -- confirm against the callers)
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
1575
1576
def strip_jsonp(code):
    """Strip a JSONP wrapper: 'callback(<payload>);' becomes '<payload>'.
    Text that does not look like such a call is returned unchanged."""
    jsonp_call = re.compile(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$')
    return jsonp_call.sub(r'\1', code)
478c2c61
PH
1579
1580
e05f6939
PH
def js_to_json(code):
    """Convert a JavaScript object/array literal into JSON text.

    Bare identifiers are wrapped in double quotes (true, false and null are
    kept as they are), single-quoted strings are rewritten as double-quoted
    ones, and a trailing comma before a closing ']' or '}' is removed.
    Returns the converted text.
    """
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            # Already a JSON string
            return v
        if v.startswith("'"):
            v = v[1:-1]
            # Re-escape for double quotes: \' needs no escape anymore,
            # while a bare " now does
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    # JavaScript tolerates a trailing comma in array and object literals;
    # JSON accepts it in neither
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
    return res
1604
1605
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values.

    Returns a function that maps a quality id to its position in
    quality_ids, or to -1 when the id is not listed. """
    def q(qid):
        try:
            position = quality_ids.index(qid)
        except ValueError:
            return -1
        else:
            return position
    return q
1614
acd69589
PH
1615
# Default output template, written with %-style named fields (title, id, ext)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68
PH
1617
# subprocess_check_output: subprocess.check_output where available, with a
# fallback for Pythons that lack it
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    def subprocess_check_output(*args, **kwargs):
        """Run a command and return what it wrote to stdout.

        Takes the same arguments as subprocess.Popen, except stdout.
        Raises subprocess.CalledProcessError, with the captured output in
        its output attribute, when the command exits with a non-zero status.
        """
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            # The interpreters that need this fallback have neither
            # Popen.args nor the output= argument of CalledProcessError,
            # so recover the command from the call arguments and attach
            # the output afterwards
            cmd = kwargs.get('args')
            if cmd is None:
                cmd = args[0]
            error = subprocess.CalledProcessError(ret, cmd)
            error.output = output
            raise error
        return output
a020a0dc
PH
1629
1630
def limit_length(s, length):
    """ Add ellipses to overly long strings: anything longer than length is
    cut so that the result, ellipses included, is length characters long """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    kept = s[:length - len(ELLIPSES)]
    return kept + ELLIPSES