]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[viki] Modernize
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
62e609ab 4import contextlib
e3946f98 5import ctypes
c496ca96
PH
6import datetime
7import email.utils
f45c185f 8import errno
e68301af 9import getpass
d77c3dfd 10import gzip
b7ab0590 11import itertools
03f9daab 12import io
f4bfd65f 13import json
d77c3dfd 14import locale
02dbf93f 15import math
d77c3dfd 16import os
4eb7f1d1 17import pipes
c496ca96 18import platform
d77c3dfd 19import re
13ebea79 20import ssl
c496ca96 21import socket
b53466e1 22import struct
1c088fa8 23import subprocess
d77c3dfd 24import sys
01951dda 25import traceback
bcf89ce6 26import xml.etree.ElementTree
d77c3dfd 27import zlib
d77c3dfd 28
01ba00ca 29try:
59ae15a5 30 import urllib.request as compat_urllib_request
01ba00ca 31except ImportError: # Python 2
59ae15a5 32 import urllib2 as compat_urllib_request
01ba00ca
PH
33
34try:
59ae15a5 35 import urllib.error as compat_urllib_error
01ba00ca 36except ImportError: # Python 2
59ae15a5 37 import urllib2 as compat_urllib_error
01ba00ca
PH
38
39try:
59ae15a5 40 import urllib.parse as compat_urllib_parse
01ba00ca 41except ImportError: # Python 2
59ae15a5 42 import urllib as compat_urllib_parse
01ba00ca 43
799c0763
PH
44try:
45 from urllib.parse import urlparse as compat_urllib_parse_urlparse
46except ImportError: # Python 2
47 from urlparse import urlparse as compat_urllib_parse_urlparse
48
6543f0dc
JMF
49try:
50 import urllib.parse as compat_urlparse
51except ImportError: # Python 2
52 import urlparse as compat_urlparse
53
01ba00ca 54try:
59ae15a5 55 import http.cookiejar as compat_cookiejar
01ba00ca 56except ImportError: # Python 2
59ae15a5 57 import cookielib as compat_cookiejar
01ba00ca 58
3e669f36 59try:
59ae15a5 60 import html.entities as compat_html_entities
9f37a959 61except ImportError: # Python 2
59ae15a5 62 import htmlentitydefs as compat_html_entities
3e669f36 63
a8156c1d 64try:
59ae15a5 65 import html.parser as compat_html_parser
9f37a959 66except ImportError: # Python 2
59ae15a5 67 import HTMLParser as compat_html_parser
a8156c1d 68
348d0a7a 69try:
59ae15a5 70 import http.client as compat_http_client
9f37a959 71except ImportError: # Python 2
59ae15a5 72 import httplib as compat_http_client
348d0a7a 73
2eabb802 74try:
0e283428 75 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
76except ImportError: # Python 2
77 from urllib2 import HTTPError as compat_HTTPError
78
e0df6211
PH
79try:
80 from urllib.request import urlretrieve as compat_urlretrieve
81except ImportError: # Python 2
82 from urllib import urlretrieve as compat_urlretrieve
83
84
5910e210
PH
# subprocess.DEVNULL is not importable on older interpreters; there the
# getter hands out a freshly opened, writable handle on the null device.
try:
    from subprocess import DEVNULL
except ImportError:
    def compat_subprocess_get_DEVNULL():
        """Return a new writable file object opened on os.path.devnull."""
        return open(os.path.devnull, 'w')
else:
    def compat_subprocess_get_DEVNULL():
        """Return the subprocess.DEVNULL constant."""
        return DEVNULL
90
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string with the characters they encode.

        Runs of consecutive escapes are collected as bytes and decoded
        together with the given encoding/errors, so multi-byte characters
        survive. Malformed escapes are kept literally.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except (TypeError, ValueError):
                # Python 2's hex codec reports bad or odd-length digits with
                # TypeError, not ValueError; both mean "not a valid escape".
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) unicode pairs.

        Fields are separated by '&' or ';'. Fields without a value are
        dropped unless keep_blank_values is set; with strict_parsing a field
        lacking '=' raises ValueError.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
348d0a7a 169
# compat_str is the text type: `unicode` where that builtin exists
# (Python 2), plain `str` otherwise.
try:
    unicode
except NameError:
    compat_str = str
else:
    compat_str = unicode  # Python 2
3e669f36
PH
174
# compat_chr maps a code point to a text character: `unichr` where that
# builtin exists (Python 2), plain `chr` otherwise.
try:
    unichr
except NameError:
    compat_chr = chr
else:
    compat_chr = unichr  # Python 2
3e669f36 179
f7300c5c
JMF
180try:
181 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
182except ImportError: # Python 2.6
183 from xml.parsers.expat import ExpatError as compat_xml_parse_error
184
b31756c1
FV
def compat_ord(c):
    """Return c unchanged if it is exactly an int, else ord(c)."""
    return c if type(c) is int else ord(c)
188
468e2e92
FV
189# This is not clearly defined otherwise
190compiled_regex_type = type(re.compile(''))
191
3e669f36 192std_headers = {
ae8f7871 193 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
59ae15a5
PH
194 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
195 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
196 'Accept-Encoding': 'gzip, deflate',
197 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 198}
f427df17 199
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or the reported codec cannot encode text, 'UTF-8' is
    returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify that the reported codec exists and is usable
        u'TEST'.encode(pref)
    except Exception:
        # Not a bare except: KeyboardInterrupt/SystemExit must propagate
        pref = 'UTF-8'

    return pref
d77c3dfd 213
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string s."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print s encoded with the preferred encoding.

        Characters the encoding cannot represent are emitted as XML
        character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
d77c3dfd 221
f4bfd65f
PH
# json.dump writes to a character stream on Python 3.x,
# whereas on Python 2.x it expects a bytestream.
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (UTF-8 text)."""
        with open(fn, 'w', encoding='utf-8') as json_file:
            json.dump(obj, json_file)
else:
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (opened in binary mode)."""
        with open(fn, 'wb') as json_file:
            json.dump(obj, json_file)
232
59ae56fa
PH
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] by scanning all xpath matches """
        for candidate in node.findall(xpath):
            if candidate.attrib.get(key) == val:
                return candidate
        return None
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression, so only a
        # restricted character set is accepted
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
246
d7e66d39
JMF
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' path components into '{uri}tag' form.

    path is split on '/'; every component containing one ':' has its prefix
    looked up in ns_map. Components without a prefix are kept unchanged.
    """
    components = [step.split(':') for step in path.split('/')]

    def expand(parts):
        if len(parts) == 1:
            return parts[0]
        prefix, local_name = parts
        return '{%s}%s' % (ns_map[prefix], local_name)

    return '/'.join([expand(parts) for parts in components])
259
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'. Entities that cannot be resolved
    are returned in their literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric entity: decimal (#47) or hexadecimal (#x2F). Hex digits
    # include a-f, and the whole entity must be numeric.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|\\d+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Code point outside the valid range; fall through to literal
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 284
a8156c1d 285compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that keeps a reference to the document it parses."""

    def __init__(self):
        compat_html_parser.HTMLParser.__init__(self)
        # The document passed to loads(); None until loads() is called
        self.html = None

    def loads(self, html):
        """Remember html, feed it to the parser in one go and close it."""
        self.html = html
        self.feed(html)
        self.close()
295
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""

    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start_pos, end_pos] once the element is fully parsed
        self.result = None
        # True while inside the matched element
        self.started = False
        # Open-tag counter per tag name, maintained while started
        self.depth = {}
        # True until the position right after the opening tag is recorded
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Skip the offending line and resume, unless that is no longer safe.

        Raises HTMLParseError after more than 10 errors or when the error
        happens inside the element being extracted.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        current_line = self.getpos()[0]
        remaining_lines = self.html.split('\n')[current_line:]  # skip one line
        self.rawdata = '\n'.join(remaining_lines)
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attr_map = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attr_map and attr_map[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The element ends when its own tag is balanced again
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every kind of content following the opening tag records the start position
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded positions, or None."""
        if self.result is None or len(self.result) != 3:
            return None
        _tag, (start_line, start_col), (end_line, end_col) = self.result
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            # Start and end share a line: the end column shifts with the cut
            lines[-1] = lines[-1][:end_col - start_col]
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
3b024e17
PH
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    def _attrparser_parse_endtag(self, i):
        """Step over the literal </scr'+'ipt> instead of parsing it as a tag."""
        split_script_tag = "</scr'+'ipt>"
        if self.rawdata[i:].startswith(split_script_tag):
            return i + len(split_script_tag)
        return compat_html_parser.HTMLParser.parse_endtag(self, i)

    AttrParser.parse_endtag = _attrparser_parse_endtag
9e6dd238
FV
361
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    content = get_element_by_attribute("id", id, html)
    return content
365
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    attr_parser = AttrParser(attribute, value)
    try:
        attr_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was collected before the parse error
        pass
    return attr_parser.get_result()
9e6dd238 374
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attr_map = dict(attrs)
            if attr_map.get('name') == self.name:
                # A later matching meta tag overwrites an earlier one
                self.result = attr_map.get('content')

    def get_result(self):
        """Return the content attribute of the matched meta tag, or None."""
        return self.result
395
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was found before the parse error
        pass
    return meta_parser.get_result()
406
9e6dd238
FV
407
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Source newlines carry no meaning; only <br> and paragraph breaks do
    text = html.replace('\n', ' ')
    substitutions = (
        # Newline vs <br />
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
419
420
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails (for any reason other
    than EACCES), the final path component is rewritten with win32-forbidden
    characters replaced by '#' and the open is retried once under that name.
    The directory part is left untouched so that path separators and drive
    letters survive. If the retry fails, or no alternative name exists, the
    error propagates like from the standard open() function.

    The filename u'-' selects standard output (its binary buffer when
    available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        head, tail = os.path.split(filename)
        alt_filename = os.path.join(
            head, re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', tail))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
454
455
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Unparseable input yields no timestamp
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 463
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped entirely
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            # Shell-special characters, whitespace and non-ASCII
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    result = u''.join([replace_insane(char) for char in s])
    if is_id:
        # IDs are kept as close to the original as possible
        return result

    # Collapse runs of underscores and trim them from both ends
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
d77c3dfd
FV
495
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable, keeping first-seen order """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd
FV
503
def unescapeHTML(s):
    """
    Replace every '&...;' sequence in s via htmlentity_transform.

    @param s a string
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd 512
8bf48f23
PH
513
def encodeFilename(s, for_subprocess=False):
    """
    Return s in the form the running interpreter expects for file names.

    @param s The name of the file
    @param for_subprocess Set when the name is passed to a subprocess call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    on_unicode_windows = (
        sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)
    if on_unicode_windows:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if not for_subprocess:
            return s
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()

    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
540
d77c3dfd 541
8271226a
PH
def decodeOption(optval):
    """Return optval as text: bytes are decoded with the preferred encoding, None passes through."""
    if optval is None:
        return None
    text = optval
    if isinstance(text, bytes):
        text = text.decode(preferredencoding())

    assert isinstance(text, compat_str)
    return text
1c256f70 550
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The widest unit shown is the largest one that is non-zero, so exactly
    one hour renders as '1:00:00' and exactly one minute as '1:00'.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
558
a0ddb8a2
PH
559
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build the urllib HTTPS handler used for all HTTPS requests.

    opts_no_check_certificate disables certificate verification on
    Python >= 3.2 (older versions do not verify here at all). Any extra
    keyword arguments are forwarded to the handler constructor.

    SSLv3 is preferred when the ssl module offers it; otherwise
    PROTOCOL_SSLv23 is used.
    """
    # NOTE(review): SSLv3 is considered insecure; preference kept for
    # backward compatibility. The constant is absent when OpenSSL was
    # built without SSLv3, so it must not be referenced unconditionally.
    sslv3 = getattr(ssl, 'PROTOCOL_SSLv3', None)

    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                if sslv3 is not None:
                    try:
                        self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=sslv3)
                        return
                    except ssl.SSLError:
                        # Fall back to protocol negotiation below
                        pass
                self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:
        protocol = sslv3 if sslv3 is not None else ssl.PROTOCOL_SSLv23
        context = ssl.SSLContext(protocol)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
ea6d901e 593
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Errors raised while handling a network or availability problem
        # are never treated as bugs
        benign_exc_types = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in benign_exc_types:
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if there is none."""
        if self.traceback is None:
            return None
        formatted_lines = traceback.format_tb(self.traceback)
        return u''.join(formatted_lines)
615
1c256f70 616
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
620
621
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
633
634
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
642
643
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Initialise the base class so str(e) and e.args carry the message
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd
FV
652
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
656
657
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
d77c3dfd
FV
665
666
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Give the exception a readable message instead of an empty one
        super(ContentTooShortError, self).__init__(
            u'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 681
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first, then zlib-wrapped."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the given status code."""
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        # This addinfourl takes no code argument; attach it afterwards
        wrapped = addinfourl(stream, headers, url)
        wrapped.code = code
        return wrapped

    def http_request(self, req):
        # The standard headers always override what the request carries
        for name, value in std_headers.items():
            if name in req.headers:
                del req.headers[name]
            req.add_header(name, value)

        # Internal marker header: do not advertise compression support
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        # Internal marker header: per-request User-Agent override
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            try:
                gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off
                for trim in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-trim]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets exactly the same treatment
    https_request = http_request
    https_response = http_response
bf50b038 762
5de90176 763
bf50b038
JMF
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    Returns None when date_str matches none of the known formats and
    cannot be parsed as an RFC 2822 date either.
    """
    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2, so drop the offset
    cleaned = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', cleaned)
    format_expressions = (
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    )
    upload_date = None
    # Every format is tried; a later match replaces an earlier one
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(cleaned, expression).strftime('%Y%m%d')
        except ValueError:
            continue
    if upload_date is not None:
        return upload_date

    # Last resort: RFC 2822 style dates
    timetuple = email.utils.parsedate_tz(cleaned)
    if timetuple:
        upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
800
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url, ignoring any query string.

    Returns default_ext when the text after the last '.' is not purely
    alphanumeric.
    """
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
73e79f2a 807
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
810
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today

    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        # Not a relative expression: must be an absolute YYYYMMDD date
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()

    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # timedelta has no month/year units; approximate them in days
    days_per_unit = {'month': 30, 'year': 365}
    if unit in days_per_unit:
        amount *= days_per_unit[unit]
        unit = 'day'
    return today + datetime.timedelta(**{unit + 's': amount})
836
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.

    Strings that are not exactly eight digits are returned unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    return '-'.join(match.groups())
845
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str.

        A missing bound defaults to the earliest/latest representable date.
        Raises ValueError if start lies after end.
        """
        if start is None:
            self.start = datetime.datetime.min.date()
        else:
            self.start = date_from_str(start)
        if end is None:
            self.end = datetime.datetime.max.date()
        else:
            self.end = date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            # Anything that is not a date object is parsed as a date string
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
871
872
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte strings are decoded with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
881
882
7459e3a2
PH
def write_string(s, out=None):
    """Write the text string s to out (sys.stderr by default) and flush it."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    binary_stream = 'b' in getattr(out, 'mode', '')
    # Python 2 lies about mode of sys.stderr
    if binary_stream or sys.version_info[0] < 3:
        s = s.encode(preferredencoding(), 'ignore')
    try:
        out.write(s)
    except UnicodeEncodeError:
        # In Windows shells, this can fail even when the codec is just charmap!?
        # See https://wiki.python.org/moin/PrintFails#Issue
        if not (sys.platform == 'win32' and hasattr(out, 'encoding')):
            raise
        # Drop whatever the stream's own encoding cannot represent and retry
        s = s.encode(out.encoding, 'ignore').decode(out.encoding)
        out.write(s)

    out.flush()
903
904
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    # Python 3 bytes already index as ints; Python 2 yields 1-char strings
    items_are_ints = isinstance(bs[0], int)
    return list(bs) if items_are_ints else [ord(c) for c in bs]
912
c257baff 913
def intlist_to_bytes(xs):
    """Convert a list of integer byte values into a byte string."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() already produces byte strings
    return ''.join([chr(x) for x in xs])
c38b1e77
PH
921
922
def get_cachedir(params={}):
    """Return the cache directory.

    An explicit params['cachedir'] wins; otherwise the directory is
    'youtube-dl' below $XDG_CACHE_HOME, or below ~/.cache if that variable
    is unset.
    """
    home_cache = os.path.expanduser('~/.cache')
    cache_root = os.environ.get('XDG_CACHE_HOME', home_cache)
    default_dir = os.path.join(cache_root, 'youtube-dl')
    return params.get('cachedir', default_dir)
c1c9a79c
PH
927
928
# Cross-platform file locking
if sys.platform != 'win32':
    import fcntl

    def _lock_file(f, exclusive):
        """Lock ``f`` via fcntl.lockf: exclusive if ``exclusive``, else shared."""
        operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.lockf(f, operation)

    def _unlock_file(f):
        """Release the lock held on ``f``."""
        fcntl.lockf(f, fcntl.LOCK_UN)

else:
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        """ctypes layout of the structure handed to LockFileEx/UnlockFileEx."""
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32

    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL

    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL

    # Low/high halves of the byte count passed to both calls
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock ``f`` with LockFileEx; raise OSError when the call fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object so that _unlock_file can pass it again
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 is LOCKFILE_EXCLUSIVE_LOCK in the Win32 API; 0x0 is a shared lock
        flags = 0x2 if exclusive else 0x0
        succeeded = LockFileEx(handle, flags, 0,
                               whole_low, whole_high,
                               f._lock_file_overlapped_p)
        if not succeeded:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Unlock ``f`` with UnlockFileEx; raise OSError when the call fails."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        succeeded = UnlockFileEx(handle, 0,
                                 whole_low, whole_high,
                                 f._lock_file_overlapped_p)
        if not succeeded:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
992
993
class locked_file(object):
    """File object wrapper that holds a lock while used as a context manager.

    The file is opened in ``__init__``.  Entering the context locks it
    (shared for mode ``'r'``, exclusive for ``'a'`` and ``'w'``); leaving
    the context unlocks and closes it.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows _lock_file raises OSError, which is not an IOError
            # on Python 2; catch both so the file never stays open.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1023
1024
def shell_quote(args):
    """Return the arguments in ``args`` shell-quoted and joined by spaces.

    Byte-string arguments are decoded with the filesystem encoding
    (``'utf-8'`` when it is unknown) before quoting.
    """
    # pipes.quote is an undocumented alias and the pipes module is gone in
    # Python 3.13; shlex.quote is the supported spelling since Python 3.3.
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'

    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
9d4660ca
PH
1036
1037
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the first element for
    which pred(e) is false before stopping """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1045
1046
9d4660ca
PH
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The data travels JSON-encoded in a single query-style parameter that
    # is appended after a '#'.
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    return url + u'#' + compat_urllib_parse.urlencode(payload)
1053
1054
def unsmuggle_url(smug_url, default=None):
    """Split ``smug_url`` into ``(url, data)``.

    When the URL carries no smuggled data, it is returned unchanged
    together with ``default``.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # Everything after the last '#' is the urlencoded payload
    url, _, sdata = smug_url.rpartition(u'#')
    payload = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    return url, json.loads(payload)
02dbf93f
PH
1062
1063
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as text with a binary (1024-based) suffix.

    ``None`` gives ``u'N/A'``; a ``str`` argument is converted with
    ``float`` first.  The value is shown with two decimals, e.g.
    ``u'1.50KiB'``.
    """
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)

    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Clamp to the suffix table: huge values used to raise IndexError,
        # and tiny positive values indexed from the end (reporting "YiB").
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
f53c966a 1076
1c088fa8 1077
f53c966a
JMF
def str_to_int(int_str):
    """Convert ``int_str`` to an int after removing every ',' and '.'."""
    return int(re.sub(r'[,\.]', u'', int_str))
1c088fa8
PH
1081
1082
def get_term_width():
    """Return the terminal width in columns, or None if it is unknown.

    A numeric ``COLUMNS`` environment variable wins; otherwise the second
    field printed by ``stty size`` is used.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Malformed COLUMNS: fall back to asking stty

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only (stty missing, no terminal, unexpected output),
        # but do not swallow KeyboardInterrupt/SystemExit as a bare except did
        return None
caefb1de
PH
1097
1098
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    # Unknown names give None; known ones their 1-based position
    if name not in ENGLISH_NAMES:
        return None
    return ENGLISH_NAMES.index(name) + 1
18258362
JMF
1109
1110
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in XML by '&amp;'.

    An '&' that already starts one of the predefined entities (amp, lt, gt,
    apos, quot) or a numeric character reference is left untouched.
    """
    # Numeric references may have any number of digits (e.g. &#x1F600;), so
    # require one or more instead of capping at four, which double-escaped
    # valid references and let the empty "&#;" through.
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]+;|#[0-9]+;)',
        u'&amp;',
        xml_str)
e3946f98
PH
1117
1118
def setproctitle(title):
    """Try to set the process name to ``title`` via libc's prctl.

    ``title`` must be a ``compat_str``.  Nothing happens when
    ``libc.so.6`` cannot be loaded or has no ``prctl``.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    # Size the buffer from the encoded bytes, not the character count:
    # non-ASCII titles are longer once encoded and used to raise ValueError.
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes) + 1)
    buf.value = title_bytes
    try:
        # 15 is PR_SET_NAME in the Linux prctl(2) interface
        libc.prctl(15, ctypes.byref(buf), 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1132
1133
def remove_start(s, start):
    """Return ``s`` without the leading ``start``, or ``s`` itself if it
    does not begin with ``start``."""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1138
1139
def url_basename(url):
    """Return the last '/'-separated component of the path of ``url``,
    ignoring leading and trailing slashes of the path."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip(u'/')
    return trimmed_path.rsplit(u'/', 1)[-1]
aa94a6d3
PH
1143
1144
class HEADRequest(compat_urllib_request.Request):
    """Request whose reported method is always HEAD."""

    def get_method(self):
        """Return the fixed method name."""
        return "HEAD"
7217e148
PH
1148
1149
dd27fd17
PH
def int_or_none(v, scale=1):
    """Return ``int(v) // scale``, passing ``None`` through unchanged."""
    if v is None:
        return v
    return int(v) // scale
608d11f5
PH
1152
1153
def parse_duration(s):
    """Parse a duration such as '1:02:03', '5m10s' or '42' into seconds.

    Returns None for None and for text that does not match the pattern.
    """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?$', s)
    if not m:
        return None

    # Groups that did not take part in the match are None and count as 0
    hours, mins, secs = (
        int(m.group(part) or 0) for part in ('hours', 'mins', 'secs'))
    return secs + mins * 60 + hours * 60 * 60
91d7d0b3
JMF
1168
1169
def prepend_extension(filename, ext):
    """Insert ``ext`` in front of the existing extension of ``filename``."""
    parts = os.path.splitext(filename)
    stem = parts[0]
    real_ext = parts[1]
    return u'{0}.{1}{2}'.format(stem, ext, real_ext)
d70ad093
PH
1173
1174
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False when the binary cannot be started. """
    # None instead of a shared mutable [] default; list() also lets callers
    # pass a tuple, which used to fail on list concatenation.
    cmd = [exe] + list(args or [])
    try:
        subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1183
1184
class PagedList(object):
    """List-like view over results that are fetched one page at a time.

    ``pagefunc(pagenum)`` must return an iterable with the entries of the
    given (0-based) page; ``pagesize`` is the number of entries on a full
    page.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the entries from index ``start`` up to (not including)
        ``end`` as a list; ``end=None`` means up to the last entry."""
        collected = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry when `start` lies in this page
            startv = 0
            if firstid <= start < nextfirstid:
                startv = start % self._pagesize

            # Offset just past the last wanted entry when `end` lies in
            # this page
            endv = None
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % self._pagesize) + 1

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            collected.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return collected
81c2f20b
PH
1230
1231
def uppercase_escape(s):
    """Replace each ``\\UXXXXXXXX`` escape (eight hex digits) in ``s`` by
    the character with that code point."""
    def _decode(match):
        return compat_chr(int(match.group(1), base=16))

    return re.sub(r'\\U([0-9a-fA-F]{8})', _decode, s)
b53466e1
PH
1236
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """struct.pack that also accepts a text format string."""
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        """struct.unpack that also accepts a text format string."""
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    # Text format strings work natively: use the struct functions directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1253
1254
def read_batch_urls(batch_fd):
    """Read URLs from the file-like object ``batch_fd`` and close it.

    Each line is decoded as UTF-8 when it is not text already, stripped of
    a leading byte order mark and surrounding whitespace, and dropped when
    it is empty or starts with '#', ';' or ']'.  Returns the list of
    remaining URLs.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A correctly decoded BOM is U+FEFF; the three-character form only
        # appears when the UTF-8 BOM bytes were read as latin-1.  The old
        # code knew only the latter, so real BOMs stayed glued to the URL.
        BOM_UTF8 = (u'\xef\xbb\xbf', u'\ufeff')
        for bom in BOM_UTF8:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1269
1270
def urlencode_postdata(*args, **kargs):
    """urlencode the given arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1273
1274
def parse_xml(s):
    """Parse the text ``s`` as XML and return the root element.

    The text is encoded as UTF-8 before parsing; doctype declarations are
    ignored.
    """
    class _DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=_DoctypeIgnoringBuilder())
    # The custom parser is only handed over on Python 2.7 and newer
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    return xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
e68301af
PH
1283
1284
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    def compat_getpass(prompt, *args, **kwargs):
        """getpass.getpass wrapper for Python 2 on Windows: a text prompt is
        encoded with the preferred encoding before it is passed on."""
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)