]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[godtube] Fix on Python 2.6
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
912b38b4 4import calendar
676eb3f2 5import codecs
62e609ab 6import contextlib
e3946f98 7import ctypes
c496ca96
PH
8import datetime
9import email.utils
f45c185f 10import errno
e68301af 11import getpass
d77c3dfd 12import gzip
b7ab0590 13import itertools
03f9daab 14import io
f4bfd65f 15import json
d77c3dfd 16import locale
02dbf93f 17import math
d77c3dfd 18import os
4eb7f1d1 19import pipes
c496ca96 20import platform
d77c3dfd 21import re
13ebea79 22import ssl
c496ca96 23import socket
b53466e1 24import struct
1c088fa8 25import subprocess
d77c3dfd 26import sys
181c8655 27import tempfile
01951dda 28import traceback
bcf89ce6 29import xml.etree.ElementTree
d77c3dfd 30import zlib
d77c3dfd 31
01ba00ca 32try:
59ae15a5 33 import urllib.request as compat_urllib_request
01ba00ca 34except ImportError: # Python 2
59ae15a5 35 import urllib2 as compat_urllib_request
01ba00ca
PH
36
37try:
59ae15a5 38 import urllib.error as compat_urllib_error
01ba00ca 39except ImportError: # Python 2
59ae15a5 40 import urllib2 as compat_urllib_error
01ba00ca
PH
41
42try:
59ae15a5 43 import urllib.parse as compat_urllib_parse
01ba00ca 44except ImportError: # Python 2
59ae15a5 45 import urllib as compat_urllib_parse
01ba00ca 46
799c0763
PH
47try:
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
51
6543f0dc
JMF
52try:
53 import urllib.parse as compat_urlparse
54except ImportError: # Python 2
55 import urlparse as compat_urlparse
56
01ba00ca 57try:
59ae15a5 58 import http.cookiejar as compat_cookiejar
01ba00ca 59except ImportError: # Python 2
59ae15a5 60 import cookielib as compat_cookiejar
01ba00ca 61
3e669f36 62try:
59ae15a5 63 import html.entities as compat_html_entities
9f37a959 64except ImportError: # Python 2
59ae15a5 65 import htmlentitydefs as compat_html_entities
3e669f36 66
a8156c1d 67try:
59ae15a5 68 import html.parser as compat_html_parser
9f37a959 69except ImportError: # Python 2
59ae15a5 70 import HTMLParser as compat_html_parser
a8156c1d 71
348d0a7a 72try:
59ae15a5 73 import http.client as compat_http_client
9f37a959 74except ImportError: # Python 2
59ae15a5 75 import httplib as compat_http_client
348d0a7a 76
2eabb802 77try:
0e283428 78 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
79except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
81
e0df6211
PH
82try:
83 from urllib.request import urlretrieve as compat_urlretrieve
84except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
86
87
5910e210
PH
try:
    from subprocess import DEVNULL

    def compat_subprocess_get_DEVNULL():
        """Return the null-device sentinel provided by the subprocess module."""
        return DEVNULL
except ImportError:
    def compat_subprocess_get_DEVNULL():
        """Return a newly opened, writable file object on the OS null device."""
        return open(os.path.devnull, 'w')
93
try:
    from urllib.parse import unquote as compat_urllib_parse_unquote
except ImportError:
    def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
        """Fallback percent-decoder used when urllib.parse is unavailable.

        Replaces every ``%XX`` escape in *string* with the byte it encodes and
        decodes runs of adjacent escapes together using *encoding* / *errors*
        (``None`` selects ``'utf-8'`` / ``'replace'``). A ``%`` that is not
        followed by anything is kept literally.
        """
        pieces = string.split('%')
        if len(pieces) == 1:
            # Nothing to decode (this also covers the empty string).
            return string
        encoding = 'utf-8' if encoding is None else encoding
        errors = 'replace' if errors is None else errors

        decoded = pieces[0]
        # Bytes collected from consecutive escapes, not yet decoded.
        pending = b''
        for piece in pieces[1:]:
            try:
                if not piece:
                    raise ValueError
                pending += piece[:2].decode('hex')
                tail = piece[2:]
                if not tail:
                    # Only an escape in this piece: it may belong to a
                    # multi-byte sequence, so keep collecting.
                    continue
            except ValueError:
                tail = '%' + piece
            # Literal text follows: flush whatever has been collected.
            decoded += pending.decode(encoding, errors) + tail
            pending = b''
        if pending:
            # Flush the escapes that ended the string.
            decoded += pending.decode(encoding, errors)
        return decoded
131
f1f725c6
PH
132
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Split a query string into a list of decoded (name, value) pairs."""
        _coerce_result = unicode
        decoded_pairs = []
        # Fields may be separated by either '&' or ';'.
        fields = [field for chunk in qs.split('&') for field in chunk.split(';')]
        for field in fields:
            if not field and not strict_parsing:
                continue
            parts = field.split('=', 1)
            if len(parts) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                if not keep_blank_values:
                    continue
                # A control name without an equal sign gets an empty value.
                parts.append('')
            if not (len(parts[1]) or keep_blank_values):
                continue
            name = compat_urllib_parse_unquote(
                parts[0].replace('+', ' '), encoding=encoding, errors=errors)
            value = compat_urllib_parse_unquote(
                parts[1].replace('+', ' '), encoding=encoding, errors=errors)
            decoded_pairs.append((_coerce_result(name), _coerce_result(value)))
        return decoded_pairs

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to its list of values."""
        grouped = {}
        for name, value in _parse_qsl(qs, keep_blank_values, strict_parsing,
                                      encoding=encoding, errors=errors):
            grouped.setdefault(name, []).append(value)
        return grouped
348d0a7a 179
3e669f36 180try:
59ae15a5 181 compat_str = unicode # Python 2
3e669f36 182except NameError:
59ae15a5 183 compat_str = str
3e669f36
PH
184
185try:
59ae15a5 186 compat_chr = unichr # Python 2
3e669f36 187except NameError:
59ae15a5 188 compat_chr = chr
3e669f36 189
f7300c5c
JMF
190try:
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
194
8d31fa3c
PH
try:
    from shlex import quote as shlex_quote
except ImportError:  # Python < 3.3
    def shlex_quote(s):
        """Wrap *s* in single quotes, escaping any embedded single quote."""
        escaped = s.replace("'", "'\"'\"'")
        return "'" + escaped + "'"
200
201
b31756c1
FV
def compat_ord(c):
    """Return *c* unchanged if it is already an int, otherwise ``ord(c)``."""
    return c if type(c) is int else ord(c)
205
468e2e92
FV
206# This is not clearly defined otherwise
207compiled_regex_type = type(re.compile(''))
208
3e669f36 209std_headers = {
ae8f7871 210 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
59ae15a5
PH
211 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
212 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
213 'Accept-Encoding': 'gzip, deflate',
214 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 215}
f427df17 216
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or the reported encoding cannot actually encode text,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable name raises here.
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (any failure means "fall back"), but no longer a
        # bare except, so KeyboardInterrupt/SystemExit still propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 230
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string *s* (must be a unicode string)."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print *s* encoded in the preferred encoding.

        Characters the encoding cannot represent are written as XML
        character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
d77c3dfd 238
f4bfd65f 239
181c8655
PH
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically """

    # The temporary file is created next to the target so that the final
    # rename stays within one directory.
    tmp_kwargs = dict(
        suffix='.tmp',
        prefix=os.path.basename(fn) + '.',
        dir=os.path.dirname(fn),
        delete=False,
    )

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info >= (3, 0):
        tmp_kwargs['mode'] = 'w'
        tmp_kwargs['encoding'] = 'utf-8'
    else:
        tmp_kwargs['mode'] = 'wb'

    tf = tempfile.NamedTemporaryFile(**tmp_kwargs)

    try:
        with tf:
            json.dump(obj, tf)
        os.rename(tf.name, fn)
    except:
        # Best-effort removal of the temporary file, then re-raise the
        # original error unchanged.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
272
273
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression, so restrict them
        # to characters that cannot break out of the predicate.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        candidates = (el for el in node.findall(xpath)
                      if el.attrib.get(key) == val)
        return next(candidates, None)
292
d7e66d39
JMF
293# On python2.6 the xml.etree.ElementTree.Element methods don't support
294# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand ``prefix:tag`` steps of *path* into ``{namespace}tag`` form.

    Each '/'-separated step that contains a ':' has its prefix looked up in
    *ns_map*; steps without a prefix are left untouched.
    """
    expanded = []
    for step in path.split('/'):
        pieces = step.split(':')
        if len(pieces) == 1:
            expanded.append(pieces[0])
            continue
        prefix, tag = pieces
        expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
305
d77c3dfd 306
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element under *node* matching *xpath*.

    When nothing matches, returns None, or raises ExtractorError if *fatal*
    is set (the message uses *name*, defaulting to the xpath itself).
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None:
        return found.text
    if not fatal:
        return None
    if name is None:
        name = xpath
    raise ExtractorError('Could not find XML element %s' % name)
319
320
a8156c1d 321compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that remembers the document it was asked to parse."""

    def __init__(self):
        # This method was previously spelled ``__init`` and therefore never
        # ran as a constructor, leaving ``self.html`` undefined until loads().
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Store *html* on the parser, feed it in one go and close the parser."""
        self.html = html
        self.feed(html)
        self.close()
331
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""

    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start_position, end_position] once the tag is found.
        self.result = None
        self.started = False
        # Number of currently open tags, per tag name, inside the match.
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Skip a line and resume parsing, unless too many errors occurred
        or the wanted tag is currently being read."""
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        current_line = self.getpos()[0]
        self.rawdata = '\n'.join(self.html.split('\n')[current_line:])  # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The match ends when every tag of the matched kind is closed again.
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())

    # Every other parser event only needs to record the start position.
    handle_entityref = find_startpos
    handle_charref = find_startpos
    handle_data = find_startpos
    handle_comment = find_startpos
    handle_decl = find_startpos
    handle_pi = find_startpos
    unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None if no complete match was recorded."""
        if self.result is None or len(self.result) != 3:
            return None
        _, start, end = self.result
        lines = self.html.split('\n')[start[0] - 1:end[0]]
        lines[0] = lines[0][start[1]:]
        if len(lines) == 1:
            lines[-1] = lines[-1][:end[1] - start[1]]
        lines[-1] = lines[-1][:end[1]]
        return '\n'.join(lines).strip()
3b024e17
PH
391# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    def _attrparser_parse_endtag(self, i):
        """Step over the literal "</scr'+'ipt>" text; defer to HTMLParser otherwise."""
        marker = "</scr'+'ipt>"
        if self.rawdata[i:].startswith(marker):
            return i + len(marker)
        return compat_html_parser.HTMLParser.parse_endtag(self, i)

    AttrParser.parse_endtag = _attrparser_parse_endtag
    del _attrparser_parse_endtag
9e6dd238
FV
397
def get_element_by_id(id, html):
    """Return the content of the tag whose ``id`` attribute equals *id* in *html*."""
    return get_element_by_attribute(attribute="id", value=id, html=html)
401
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag in *html* whose *attribute* equals *value*.

    Parse errors are ignored; whatever the parser managed to record (possibly
    None) is returned.
    """
    attr_parser = AttrParser(attribute, value)
    try:
        attr_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: fall through with what was collected so far.
        pass
    return attr_parser.get_result()
9e6dd238 410
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """

    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        # Only <meta name="..."> tags carrying the wanted name are of interest.
        if tag == 'meta':
            attr_map = dict(attrs)
            if attr_map.get('name') == self.name:
                self.result = attr_map.get('content')

    def get_result(self):
        """Return the content attribute of the last matching meta tag, or None."""
        return self.result
431
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: keep whatever was found before the parse error.
        pass
    return meta_parser.get_result()
442
9e6dd238
FV
443
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Source newlines carry no meaning; only markup produces line breaks.
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),           # <br> becomes a newline
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),     # </p><p> becomes a newline
        ('<.*?>', ''),                              # strip remaining tags
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
455
456
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The name u'-' selects standard output instead of a file.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # Only the final path component is rewritten: the directory part
        # legitimately contains separators (and a drive colon on Windows).
        # The previous code handed a generator to os.path.join(), which
        # never produced a usable alternative name.
        head, tail = os.path.split(filename)
        alt_filename = os.path.join(
            head, re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', tail))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller.
            # Open the sanitized name (the original was reopened before,
            # which could only fail again).
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
490
491
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Not a parseable date string.
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 499
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped entirely.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace():
                return '_'
            if code > 127:
                return '_'
        return char

    result = u''.join(replace_insane(char) for char in s)
    if is_id:
        # IDs are returned as mapped, without any collapsing or trimming.
        return result

    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if not result:
        result = '_'
    return result
d77c3dfd
FV
531
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # A list (not a set) keeps first-seen order and also accepts
    # unhashable elements.
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 539
912b38b4 540
4e408e47
PH
541def _htmlentity_transform(entity):
542 """Transforms an HTML entity to a character."""
543 # Known non-numeric HTML entity
544 if entity in compat_html_entities.name2codepoint:
545 return compat_chr(compat_html_entities.name2codepoint[entity])
546
547 mobj = re.match(r'#(x?[0-9]+)', entity)
548 if mobj is not None:
549 numstr = mobj.group(1)
550 if numstr.startswith(u'x'):
551 base = 16
552 numstr = u'0%s' % numstr
553 else:
554 base = 10
555 return compat_chr(int(numstr, base))
556
557 # Unknown entity in name, return its literal representation
558 return (u'&%s;' % entity)
559
560
def unescapeHTML(s):
    """Replace every ``&...;`` entity in the text *s*; None passes through."""
    if s is None:
        return None
    assert type(s) == compat_str

    def _decode_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', _decode_entity, s)
d77c3dfd 568
8bf48f23
PH
569
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Set when the name is going to be passed to a
                          subprocess rather than to a file API
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
596
f07b74fc
PH
597
def encodeArgument(s):
    """Encode a command-line argument the way encodeFilename does for subprocesses."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
605
606
8271226a
PH
def decodeOption(optval):
    """Return *optval* as text; byte strings are decoded with the preferred encoding."""
    if optval is not None:
        if isinstance(optval, bytes):
            optval = optval.decode(preferredencoding())
        assert isinstance(optval, compat_str)
    return optval
1c256f70 615
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The boundaries are inclusive, so exactly one hour is '1:00:00' and
    exactly one minute is '1:00' (previously '60:00' and '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
623
a0ddb8a2
PH
624
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build the urllib HTTPS handler appropriate for this Python version.

    opts_no_check_certificate -- when true, server certificates are not
                                 verified (where the branch supports it).
    kwargs                    -- passed through to the handler constructor.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Try TLSv1 first and fall back to SSLv23 negotiation.
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
        # SERVER_AUTH is the purpose for client-side sockets that authenticate
        # a server; CLIENT_AUTH (used before) configures a server-side
        # context that does not verify the peer.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
        if opts_no_check_certificate:
            # check_hostname must be disabled before verify_mode may be
            # lowered to CERT_NONE.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
ea6d901e 664
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Errors raised while handling a network error or an unavailable
        # video are always treated as expected.
        active_exc_type = sys.exc_info()[0]
        expected_types = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if active_exc_type in expected_types:
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if there is none."""
        if self.traceback is None:
            return None
        frames = traceback.format_tb(self.traceback)
        return u''.join(frames)
689
1c256f70 690
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Extraction error signalling that a regular expression did not match."""
694
695
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
707
708
class SameFileError(Exception):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
716
717
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Chain to Exception explicitly, as the other exception classes in
        # this module do, instead of relying on implicit argument capture.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd
FV
726
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
d77c3dfd
FV
730
731
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
739
740
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Class-level defaults; both values are byte counts.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 755
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress *data* as a raw deflate stream, falling back to zlib framing."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying *code*, whether or not the
        constructor of this Python version accepts it."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers and resolve the Youtubedl-* pseudo headers."""
        for h, v in std_headers.items():
            # add_header() stores names capitalized ('User-agent'), so the
            # lookup has to use the same form; otherwise a header supplied
            # by the caller is never seen and gets overwritten.
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        """Transparently decode gzip- or deflate-encoded response bodies."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No amount of trimming helped: report the first error.
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response
bf50b038 835
5de90176 836
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # A trailing 'Z' or a numeric '+HH:MM' / '-HHMM' offset, if present.
    tz_match = re.search(
        r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
        date_str)
    offset = datetime.timedelta()
    if tz_match:
        date_str = date_str[:-len(tz_match.group(0))]
        if tz_match.group('sign'):
            direction = 1 if tz_match.group('sign') == '+' else -1
            offset = datetime.timedelta(
                hours=direction * int(tz_match.group('hours')),
                minutes=direction * int(tz_match.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    # Subtracting the offset converts the local reading to UTC.
    utc_dt = datetime.datetime.strptime(date_str, date_format) - offset
    return calendar.timegm(utc_dt.timetuple())
860
861
bf50b038
JMF
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    cleaned = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', cleaned)
    format_expressions = (
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    )

    upload_date = None
    # Every format is tried; a later successful format replaces an earlier one.
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(cleaned, expression).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Last resort: the RFC 2822 date parser.
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
907
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess a file extension from *url*, or return *default_ext*.

    The guess is whatever follows the last '.' before any '?'; it is only
    accepted if it is purely alphanumeric.
    """
    if url is None:
        return default_ext
    without_query = url.partition(u'?')[0]
    guess = without_query.rpartition(u'.')[2]
    return guess if re.match(r'^[A-Za-z0-9]+$', guess) else default_ext
73e79f2a 916
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the extension of *filename* with ``.<sub_lang>.<sub_format>``."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
919
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today

    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        # Not a relative expression: must be an absolute YYYYMMDD date.
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()

    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # A bad approximation? Months and years are counted as fixed day spans.
    days_per_unit = {'month': 30, 'year': 365}
    if unit in days_per_unit:
        amount *= days_per_unit[unit]
        unit = 'day'
    return today + datetime.timedelta(**{unit + 's': amount})
945
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
954
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates (both ends inclusive)"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; None leaves that side of the range unbounded"""
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a datetime.date or a date string) is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
980
981
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Decode a byte string result with the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
990
991
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    kernel32 = ctypes.windll.kernel32

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(
        ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(
        ("GetFileType", kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        if (GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR:
            return True
        mode = ctypes.wintypes.DWORD()
        return GetConsoleMode(handle, ctypes.byref(mode)) == 0

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the BMP, or len(s) if none
        for pos, char in enumerate(s):
            if ord(char) > 0xffff:
                return pos
        return len(s)

    while s:
        # Write at most 1024 characters at once, stopping before the next
        # non-BMP character; such a character is written on its own as two
        # UTF-16 code units.
        count = min(next_nonbmp_pos(s), 1024)
        nchars = count if count else 2

        ret = WriteConsoleW(h, s, nchars, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if count:
            assert written.value > 0
            s = s[written.value:]
        else:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
    return True
1062
1063
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On win32, when no encoding is forced and out has a fileno attribute,
    _windows_write_string is tried first. Otherwise s is encoded (ignoring
    unencodable characters) for binary streams and on Python 2, written to
    out.buffer when available, or written as text.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if (sys.platform == 'win32' and encoding is None
            and hasattr(out, 'fileno')
            and _windows_write_string(s, out)):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        charset = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(charset, 'ignore'))
    else:
        out.write(s)
    out.flush()
1084
1085
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    if not isinstance(bs[0], int):  # Python 2: items are 1-char strings
        return [ord(c) for c in bs]
    return list(bs)  # Python 3: items already are ints
1093
c257baff 1094
def intlist_to_bytes(xs):
    """Convert a list of integer byte values into a byte string."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):  # Python 3
        return bytes(xs)
    # Python 2: chr() already yields a byte string
    return ''.join([chr(x) for x in xs])
c38b1e77
PH
1102
1103
c1c9a79c
PH
# Cross-platform file locking: defines _lock_file(f, exclusive) and
# _unlock_file(f) on top of flock() or, on Windows, LockFileEx/UnlockFileEx
if sys.platform != 'win32':
    import fcntl

    def _lock_file(f, exclusive):
        operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.flock(f, operation)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)

else:
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high DWORD of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object; _unlock_file passes the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        flags = 0x2 if exclusive else 0x0  # 0x2 is LOCKFILE_EXCLUSIVE_LOCK
        locked = LockFileEx(
            handle, flags, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(
            handle, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
c1c9a79c
PH
1167
1168
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened in the constructor; entering takes a shared lock
    for mode 'r' and an exclusive lock otherwise; exiting unlocks and
    closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leak the open file when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1198
1199
def shell_quote(args):
    """Return a single shell-escaped command line for the argument list.

    Byte-string arguments (e.g. filenames produced by 'encodeFilename')
    are decoded with the filesystem encoding, falling back to UTF-8 when
    that encoding is unknown. The result is a text string with the quoted
    arguments separated by spaces.
    """
    # pipes.quote is undocumented and the pipes module is gone in newer
    # Pythons; shlex.quote is the documented equivalent where available.
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
9d4660ca
PH
1211
1212
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the first element for
    which pred is false, and stop right after it """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1220
1221
9d4660ca
PH
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded and appended to url as a URL-encoded
    '__youtubedl_smuggle' fragment. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    fragment = compat_urllib_parse.urlencode(payload)
    return url + u'#' + fragment
1228
1229
def unsmuggle_url(smug_url, default=None):
    """Split a URL built by smuggle_url into (url, data).

    URLs without smuggled data are returned unchanged together with
    default.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, fragment = smug_url.rpartition(u'#')
    encoded = compat_parse_qs(fragment)[u'__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
02dbf93f
PH
1237
1238
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as a human-readable string with a binary suffix.

    Returns u'N/A' for None. Strings are converted with float() first.
    The unit is clamped to the range B..YiB, so values below one byte are
    reported in B and values beyond the largest unit are reported in YiB.
    """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Without clamping, a negative exponent would index the suffix
        # list from the end and a huge one would raise IndexError.
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
f53c966a 1251
1c088fa8 1252
1c088fa8
PH
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable is used when it holds an integer;
    otherwise the output of `stty size` is consulted.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Unusable COLUMNS value, ask stty instead

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort: stty may be missing or print nothing usable.
        # (Not a bare except, so KeyboardInterrupt/SystemExit propagate.)
        return None
caefb1de
PH
1267
1268
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name,
    or None if the name is not a full English month name """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name not in ENGLISH_NAMES:
        return None
    return ENGLISH_NAMES.index(name) + 1
18258362
JMF
1279
1280
def fix_xml_ampersands(xml_str):
    """Replace every '&' that does not start an entity by '&amp;'.

    The five predefined XML entities and numeric character references are
    left untouched. Character references may have up to 6 hexadecimal or
    7 decimal digits so that code points above U+FFFF are preserved too.
    """
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,6};|#[0-9]{,7};)',
        u'&amp;',
        xml_str)
e3946f98
PH
1287
1288
def setproctitle(title):
    """Try to set the process name to title through prctl() of libc.so.6.

    Does nothing when libc.so.6 cannot be loaded or does not expose
    prctl. title must be a compat_str.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one byte longer than
    # the title, so prctl always receives a NUL-terminated string.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1302
1303
def remove_start(s, start):
    """Return s without the prefix start, or s itself if it has no such prefix."""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1308
1309
2b9faf55
PH
def remove_end(s, end):
    """Return s without the suffix end, or s itself if it has no such suffix."""
    # An empty suffix must be skipped: s[:-0] is s[:0], i.e. the empty string
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1314
1315
def url_basename(url):
    """Return the last non-empty component of the path of a URL."""
    segments = compat_urlparse.urlparse(url).path.strip(u'/').split(u'/')
    return segments[-1]
aa94a6d3
PH
1319
1320
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is always HEAD."""

    def get_method(self):
        return "HEAD"
7217e148
PH
1324
1325
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, or return default for None and ''.

    When get_attr is given, the attribute of that name is read from v
    first (a missing attribute counts as None). The integer is multiplied
    by invscale and then floor-divided by scale.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    return int(v) * invscale // scale
1333
9572013d 1334
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted to compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1337
9732d77e
PH
1338
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: ',', '.' and '+'
    characters are dropped before the conversion """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', u'', int_str)
    return int(digits)
608d11f5
PH
1345
1346
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1349
1350
608d11f5
PH
def parse_duration(s):
    """Parse a duration such as '1:02:03', '2 min 3 s' or '3.5' into seconds.

    Returns None when s is None or does not look like a duration. The
    result is a float when a fractional part is present, else an int.
    """
    if s is None:
        return None

    m = re.match(
        r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$',
        s.strip())
    if not m:
        return None

    total = int(m.group('secs'))
    minutes = m.group('mins')
    if minutes:
        total += int(minutes) * 60
    hours = m.group('hours')
    if hours:
        total += int(hours) * 60 * 60
    fraction = m.group('ms')
    if fraction:
        total += float(fraction)
    return total
91d7d0b3
JMF
1369
1370
def prepend_extension(filename, ext):
    """Insert ext in front of the existing extension: 'a.mp4' -> 'a.<ext>.mp4'."""
    stem, original_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(stem, ext, original_ext)
d70ad093
PH
1374
1375
def check_executable(exe, args=()):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False when the binary cannot be started. """
    # args defaults to an immutable tuple rather than a shared mutable list
    try:
        subprocess.Popen(
            [exe] + list(args),
            stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1384
1385
class PagedList(object):
    """Base class for paged lists; subclasses provide getslice()."""

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
9c44d242
PH
1391
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages one by one until a short page is seen.

    pagefunc(pagenum) must return an iterable with the entries of that
    page; pagesize is the number of entries on a full page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with index in [start, end) as a list."""
        size = self._pagesize
        collected = []
        for pagenum in itertools.count(start // size):
            page_first = pagenum * size
            next_first = page_first + size
            if start >= next_first:
                continue

            entries = list(self._pagefunc(pagenum))

            # Offsets inside this page at which the slice begins and ends
            if page_first <= start < next_first:
                lo = start % size
            else:
                lo = 0
            if end is not None and page_first <= end <= next_first:
                hi = ((end - 1) % size) + 1
            else:
                hi = None

            if lo != 0 or hi is not None:
                entries = entries[lo:hi]
            collected.extend(entries)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(entries) + lo < size:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == next_first:
                break
        return collected
81c2f20b
PH
1433
1434
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list whose number of pages is known in advance.

    pagefunc(pagenum) must return an iterable with the entries of that
    page; pagecount is the number of pages and pagesize the number of
    entries per page.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with index in [start, end) as a list."""
        size = self._pagesize
        first_page = start // size
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // size + 1
            remaining = end - start
        # Entries to drop from the front of the first fetched page
        to_skip = start - first_page * size

        collected = []
        for pagenum in range(first_page, stop_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested slice
                    collected.extend(page[:remaining])
                    break
                remaining -= len(page)
            collected.extend(page)
        return collected
1462
1463
def uppercase_escape(s):
    """Replace every literal '\\UXXXXXXXX' escape in s by the character it denotes."""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
b53466e1 1470
d05cfe06
S
1471
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    on_python2 = sys.version_info < (3, 0)
    if on_python2 and isinstance(s, unicode):
        # Python 2: encode the text to UTF-8 bytes before quoting
        s = s.encode('utf-8')
    safe_chars = "%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
d05cfe06
S
1477
1478
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # Only these components are escaped; scheme and netloc stay as they are
    escaped = parts._replace(
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment)
    )
    return escaped.geturl()
1488
b53466e1
PH
# struct_pack / struct_unpack: wrappers around struct.pack / struct.unpack
# that accept a text format specification on every supported Python
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        return struct.pack(
            spec.encode('ascii') if isinstance(spec, compat_str) else spec,
            *args)

    def struct_unpack(spec, *args):
        return struct.unpack(
            spec.encode('ascii') if isinstance(spec, compat_str) else spec,
            *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1505
1506
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file, one per line, and close batch_fd.

    Byte lines are decoded as UTF-8. A leading byte order mark and
    surrounding whitespace are removed. Empty lines and lines starting
    with '#', ';' or ']' are skipped.
    """
    # The BOM shows up as u'\ufeff' in correctly decoded text, and as
    # u'\xef\xbb\xbf' when UTF-8 input was decoded as Latin-1.
    BOMS = (u'\xef\xbb\xbf', u'\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1521
1522
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode() and return ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1525
1526
0990305d
PH
# etree_iter(node): iterate over the elements of an ElementTree node
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1531
1532
bcf89ce6
PH
def parse_xml(s):
    """Parse the XML text s, ignoring doctypes, and return the root element."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            text = node.text
            if text is not None and not isinstance(text, compat_str):
                node.text = text.decode('utf-8')
    return tree
e68301af
PH
1548
1549
# compat_getpass: getpass.getpass, except on Python 2 on Windows where a
# text prompt is first encoded with the preferred encoding
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
a1a530b0
PH
1557
1558
# Numeric value for each US content rating label
# (presumably a minimum viewer age - confirm against the callers)
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
1566
1567
def strip_jsonp(code):
    """Remove a JSONP wrapper 'callback( ... );' and return its argument.

    Text that does not match the wrapper pattern is returned unchanged.
    """
    jsonp_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(jsonp_re, r'\1', code)
478c2c61
PH
1570
1571
e05f6939
PH
1572def js_to_json(code):
1573 def fix_kv(m):
1574 key = m.group(2)
1575 if key.startswith("'"):
1576 assert key.endswith("'")
1577 assert '"' not in key
1578 key = '"%s"' % key[1:-1]
1579 elif not key.startswith('"'):
1580 key = '"%s"' % key
1581
1582 value = m.group(4)
1583 if value.startswith("'"):
1584 assert value.endswith("'")
1585 assert '"' not in value
1586 value = '"%s"' % value[1:-1]
1587
1588 return m.group(1) + key + m.group(3) + value
1589
1590 res = re.sub(r'''(?x)
1591 ([{,]\s*)
1592 ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
1593 (:\s*)
1594 ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
1595 ''', fix_kv, code)
1596 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1597 return res
1598
1599
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values:
    the returned function maps an id to its index, unknown ids to -1 """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1608
acd69589
PH
1609
# Default output filename template: %-style format with named fields
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68
PH
1611
# subprocess_check_output: subprocess.check_output, with a fallback for
# Pythons that lack it
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    def subprocess_check_output(*args, **kwargs):
        """Run a command and return its standard output.

        Raises subprocess.CalledProcessError (with the captured output in
        its .output attribute) when the command exits with a non-zero code.
        """
        assert 'input' not in kwargs
        p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            # Pythons without check_output have neither Popen.args nor the
            # 'output' argument of CalledProcessError, so recover the
            # command from our own arguments and attach the output by hand.
            cmd = kwargs.get('args')
            if cmd is None:
                cmd = args[0]
            error = subprocess.CalledProcessError(ret, cmd)
            error.output = output
            raise error
        return output
a020a0dc
PH
1623
1624
def limit_length(s, length):
    """ Add ellipses to overly long strings: strings longer than length
    are cut so that the result, ellipses included, is length characters """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES