]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[ntv] Move app guess outside formats loop
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
912b38b4 4import calendar
62e609ab 5import contextlib
e3946f98 6import ctypes
c496ca96
PH
7import datetime
8import email.utils
f45c185f 9import errno
e68301af 10import getpass
d77c3dfd 11import gzip
b7ab0590 12import itertools
03f9daab 13import io
f4bfd65f 14import json
d77c3dfd 15import locale
02dbf93f 16import math
d77c3dfd 17import os
4eb7f1d1 18import pipes
c496ca96 19import platform
d77c3dfd 20import re
13ebea79 21import ssl
c496ca96 22import socket
b53466e1 23import struct
1c088fa8 24import subprocess
d77c3dfd 25import sys
01951dda 26import traceback
bcf89ce6 27import xml.etree.ElementTree
d77c3dfd 28import zlib
d77c3dfd 29
01ba00ca 30try:
59ae15a5 31 import urllib.request as compat_urllib_request
01ba00ca 32except ImportError: # Python 2
59ae15a5 33 import urllib2 as compat_urllib_request
01ba00ca
PH
34
35try:
59ae15a5 36 import urllib.error as compat_urllib_error
01ba00ca 37except ImportError: # Python 2
59ae15a5 38 import urllib2 as compat_urllib_error
01ba00ca
PH
39
40try:
59ae15a5 41 import urllib.parse as compat_urllib_parse
01ba00ca 42except ImportError: # Python 2
59ae15a5 43 import urllib as compat_urllib_parse
01ba00ca 44
799c0763
PH
45try:
46 from urllib.parse import urlparse as compat_urllib_parse_urlparse
47except ImportError: # Python 2
48 from urlparse import urlparse as compat_urllib_parse_urlparse
49
6543f0dc
JMF
50try:
51 import urllib.parse as compat_urlparse
52except ImportError: # Python 2
53 import urlparse as compat_urlparse
54
01ba00ca 55try:
59ae15a5 56 import http.cookiejar as compat_cookiejar
01ba00ca 57except ImportError: # Python 2
59ae15a5 58 import cookielib as compat_cookiejar
01ba00ca 59
3e669f36 60try:
59ae15a5 61 import html.entities as compat_html_entities
9f37a959 62except ImportError: # Python 2
59ae15a5 63 import htmlentitydefs as compat_html_entities
3e669f36 64
a8156c1d 65try:
59ae15a5 66 import html.parser as compat_html_parser
9f37a959 67except ImportError: # Python 2
59ae15a5 68 import HTMLParser as compat_html_parser
a8156c1d 69
348d0a7a 70try:
59ae15a5 71 import http.client as compat_http_client
9f37a959 72except ImportError: # Python 2
59ae15a5 73 import httplib as compat_http_client
348d0a7a 74
2eabb802 75try:
0e283428 76 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
77except ImportError: # Python 2
78 from urllib2 import HTTPError as compat_HTTPError
79
e0df6211
PH
80try:
81 from urllib.request import urlretrieve as compat_urlretrieve
82except ImportError: # Python 2
83 from urllib import urlretrieve as compat_urlretrieve
84
85
5910e210
PH
86try:
87 from subprocess import DEVNULL
88 compat_subprocess_get_DEVNULL = lambda: DEVNULL
89except ImportError:
90 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
91
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string with the characters they encode.

        Consecutive escapes are collected as bytes and decoded together with
        the given encoding/errors, so multi-byte sequences decode correctly.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # NOTE(review): on Python 2 the 'hex' codec may raise TypeError
                # (not ValueError) for non-hex digits - confirm this is caught.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) unicode pairs.

        Fields are separated by '&' or ';'. Fields without a value are dropped
        unless keep_blank_values is set; with strict_parsing a field lacking
        '=' raises ValueError.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values.

        Values for a repeated name are appended in the order they appear.
        """
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                           encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
348d0a7a 170
3e669f36 171try:
59ae15a5 172 compat_str = unicode # Python 2
3e669f36 173except NameError:
59ae15a5 174 compat_str = str
3e669f36
PH
175
176try:
59ae15a5 177 compat_chr = unichr # Python 2
3e669f36 178except NameError:
59ae15a5 179 compat_chr = chr
3e669f36 180
f7300c5c
JMF
181try:
182 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
183except ImportError: # Python 2.6
184 from xml.parsers.expat import ExpatError as compat_xml_parse_error
185
b31756c1
FV
def compat_ord(c):
    """Return the integer code of c; values that are already ints pass through.

    Lets callers handle both one-character strings and the ints obtained by
    indexing a bytes object on Python 3.
    """
    return c if type(c) is int else ord(c)
189
468e2e92
FV
190# This is not clearly defined otherwise
191compiled_regex_type = type(re.compile(''))
192
3e669f36 193std_headers = {
ae8f7871 194 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
59ae15a5
PH
195 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
196 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
197 'Accept-Encoding': 'gzip, deflate',
198 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 199}
f427df17 200
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails or names a codec that cannot encode text, 'UTF-8' is
    returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec so an unknown or unusable name is never returned.
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (locale and codec failures vary by platform), but
        # no longer a bare except: KeyboardInterrupt/SystemExit propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 214
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the unicode string s to standard output."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print s encoded with the preferred encoding.

        Characters the encoding cannot represent are written as XML
        character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
d77c3dfd 222
f4bfd65f
PH
223# In Python 2.x, json.dump expects a bytestream.
224# In Python 3.x, it writes to a character stream
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file fn (UTF-8 text stream)."""
        with open(fn, 'w', encoding='utf-8') as stream:
            json.dump(obj, stream)
else:
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file fn (byte stream)."""
        with open(fn, 'wb') as stream:
            json.dump(obj, stream)
233
59ae56fa
PH
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """Return the first element matching xpath whose attribute key equals val.

        Filters the matches manually instead of using an attribute
        predicate. Returns None when nothing matches.
        """
        for candidate in node.findall(xpath):
            if candidate.attrib.get(key) == val:
                return candidate
        return None
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression, so only a
        # restricted character set is accepted.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
247
d7e66d39
JMF
248# On python2.6 the xml.etree.ElementTree.Element methods don't support
249# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path to '{uri}tag' using ns_map.

    Steps without a prefix are kept unchanged. An unknown prefix raises
    KeyError; a step with more than one ':' raises ValueError.
    """
    expanded = []
    for step in path.split('/'):
        pieces = step.split(':')
        if len(pieces) == 1:
            expanded.append(pieces[0])
            continue
        ns, tag = pieces
        expanded.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(expanded)
260
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities, decimal references (#47) and hexadecimal references
    (#x2F / #X2F) are decoded. Anything else, including numeric references
    outside the valid code point range, is returned in its literal
    '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hex references need the full hex digit set; the previous pattern
    # (x?\d+) stopped at the first a-f digit and decoded the wrong character.
    mobj = re.match(r'(?u)#([xX][0-9a-fA-F]+|\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            codepoint = int(numstr[1:], 16)
        else:
            codepoint = int(numstr, 10)
        try:
            return compat_chr(codepoint)
        except (ValueError, OverflowError):
            # Not a representable code point: fall through to the literal.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 285
a8156c1d 286compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that keeps a reference to the document it was fed.

    Subclasses read self.html to slice results out of the original text.
    """

    def __init__(self):
        # Was misspelled "__init", so it never ran and self.html was left
        # undefined until loads() was called.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Remember html, feed it to the parser and finish parsing."""
        self.html = html
        self.feed(html)
        self.close()
296
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start_position, end_position] once the element has
        # been fully located; positions are getpos() tuples.
        self.result = None
        # True while parsing inside the matched element.
        self.started = False
        # Open-tag counter per tag name, maintained while started.
        self.depth = {}
        # True between the matched start tag and the next parser event.
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Recover from a parse error by skipping a line, up to a limit.

        Raises HTMLParseError after more than 10 errors, or as soon as an
        error occurs inside the matched element.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        """Start tracking when a tag carries the requested attribute value."""
        attrs = dict(attrs)
        if self.started:
            # A nested start tag is also the first event after the match.
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        """Stop tracking once the matched tag's open count returns to zero."""
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event records the start position in the same way.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded positions.

        Returns None when no element matched or when the start/end positions
        were not both recorded.
        """
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Keep only the lines spanned by the element (positions are 1-based).
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Single line: the end offset is relative to the trimmed start.
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
3b024e17
PH
356# Hack for https://github.com/rg3/youtube-dl/issues/662
357if sys.version_info < (2, 7, 3):
358 AttrParser.parse_endtag = (lambda self, i:
359 i + len("</scr'+'ipt>")
360 if self.rawdata[i:].startswith("</scr'+'ipt>")
361 else compat_html_parser.HTMLParser.parse_endtag(self, i))
9e6dd238
FV
362
def get_element_by_id(id, html):
    """Return the content of the tag whose "id" attribute equals id.

    Delegates to get_element_by_attribute() for the "id" attribute.
    """
    content = get_element_by_attribute("id", id, html)
    return content
366
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag in html whose attribute equals value.

    Parse errors are swallowed: whatever AttrParser isolated before the
    failure is returned, which may be None.
    """
    attr_parser = AttrParser(attribute, value)
    try:
        attr_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: fall through with whatever was found so far.
        pass
    return attr_parser.get_result()
9e6dd238 375
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    HTMLParser subclass that records the content attribute of the meta tag
    whose name attribute equals the requested name.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        """Record the content of a matching meta tag; ignore all other tags."""
        if tag == 'meta':
            attributes = dict(attrs)
            if attributes.get('name') == self.name:
                self.result = attributes.get('content')

    def get_result(self):
        """Return the recorded content value, or None if nothing matched."""
        return self.result
396
def get_meta_content(name, html):
    """
    Return the content attribute of the meta tag in html whose name
    attribute equals name. Parse errors are swallowed and whatever was
    found before the failure is returned.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: keep the result gathered so far.
        pass
    return meta_parser.get_result()
407
9e6dd238
FV
408
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Source newlines carry no meaning in HTML; real line breaks come from
    # <br> tags and paragraph boundaries.
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),          # <br> and <br />
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),    # </p><p ...>
        ('<.*?>', ''),                             # any remaining tag
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
420
421
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The special filename u'-' selects standard output (its binary buffer
    when available).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem is not fixed by renaming; report it as is.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        # (os.path.join takes the parts as separate arguments, so the
        # sanitized parts must be unpacked rather than passed as a generator.)
        # NOTE(review): the pattern also replaces '/' inside the directory
        # part returned by os.path.split - confirm that is intended.
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)
        ])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            # Retry with the sanitized name (previously reopened filename).
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
455
456
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 464
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped outright.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            # Shell-unfriendly punctuation, whitespace and non-ASCII.
            if char in '!&\'()[]{}$;`^,#' or char.isspace():
                return '_'
            if code > 127:
                return '_'
        return char

    result = u''.join(replace_insane(ch) for ch in s)
    if is_id:
        # IDs are kept as close to the original as possible.
        return result

    # Collapse runs of underscores and trim them from both ends.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
d77c3dfd
FV
496
def orderedSet(iterable):
    """ Return a list of the elements of iterable without duplicates,
    keeping the position of each element's first occurrence. """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 504
912b38b4 505
def unescapeHTML(s):
    """Replace the HTML entities in the unicode string s with characters.

    None is passed through unchanged.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    # Each '&...;' run is handed to htmlentity_transform for decoding.
    return re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd 513
8bf48f23
PH
514
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Whether the result is passed to a subprocess call

    Returns s unchanged where unicode file names are usable directly,
    otherwise s encoded as bytes (unencodable characters are dropped).
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up accept u'' directly through the Unicode APIs.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return s.encode(encoding if encoding is not None else 'utf-8', 'ignore')
541
d77c3dfd 542
8271226a
PH
def decodeOption(optval):
    """Return the option value optval as a unicode string.

    None is passed through; bytes are decoded with the preferred encoding.
    """
    if optval is None:
        return None

    decoded = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval
    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 551
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The hour form is used only above 3600 seconds and the minute form only
    above 60 seconds (both bounds exclusive).
    """
    if secs > 3600:
        hours, remainder = divmod(secs, 3600)
        return '%d:%02d:%02d' % (hours, remainder // 60, secs % 60)
    if secs > 60:
        return '%d:%02d' % divmod(secs, 60)
    return '%d' % secs
559
a0ddb8a2
PH
560
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build the urllib HTTPS handler used for all https requests.

    opts_no_check_certificate -- if true, server certificates are not
                                 verified (only honoured on Python >= 3.2)
    kwargs                    -- passed through to the handler constructor

    On Python < 3.2 a handler with a custom connection class is returned,
    which tries SSLv3 first and falls back to SSLv23. On newer versions an
    SSLContext with version negotiation is used.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:
        # Forcing SSLv3 is insecure and the constant is missing from modern
        # ssl builds. Negotiate the best shared protocol version instead:
        # PROTOCOL_TLS_CLIENT where available, otherwise PROTOCOL_SSLv23.
        protocol = getattr(ssl, 'PROTOCOL_TLS_CLIENT', None)
        if protocol is None:
            protocol = ssl.PROTOCOL_SSLv23
        context = ssl.SSLContext(protocol)

        verify = not opts_no_check_certificate
        has_hostname_flag = hasattr(context, 'check_hostname')  # Python >= 3.4
        if has_hostname_flag:
            # Must be off before verify_mode may be set to CERT_NONE.
            context.check_hostname = False
        context.verify_mode = ssl.CERT_REQUIRED if verify else ssl.CERT_NONE
        if verify and has_hostname_flag:
            # A verified certificate must also match the requested host.
            context.check_hostname = True

        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
ea6d901e 594
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Exception being handled at construction time, if any.
        active = sys.exc_info()

        # Network problems and unavailable videos are never program bugs.
        if active[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = active  # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if unset."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
616
1c256f70 617
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Extraction error for a regular expression that did not match."""
621
622
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super(DownloadError, self).__init__(msg)
d77c3dfd
FV
634
635
class SameFileError(Exception):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
643
644
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        """msg is the error description; it is also kept as self.msg."""
        # Initialise the base class explicitly, as DownloadError and
        # ExtractorError do, instead of relying on implicit argument capture.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd
FV
653
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
d77c3dfd
FV
657
658
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
666
667
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded file
    is smaller than what the server announced first, indicating the
    connection was probably interrupted.
    """
    # Byte counts; the class-level defaults are None.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        """Store the received and the announced size, both in bytes."""
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 682
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    When installed with an OpenerDirector, this class adds the standard
    headers to every HTTP request and transparently decodes gzipped and
    deflated responses. To avoid compression for a particular request, the
    calling code only has to include the HTTP header
    "Youtubedl-No-Compression", which is removed before the real request is
    made. A "Youtubedl-User-Agent" header likewise replaces the User-Agent.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress data as a raw deflate stream, else as a zlib stream."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code.

        addinfourl classes without getcode() do not take the code as a
        constructor argument, so it is attached afterwards.
        """
        addinfourl = compat_urllib_request.addinfourl
        if not hasattr(addinfourl, 'getcode'):
            response = addinfourl(stream, headers, url)
            response.code = code
            return response
        return addinfourl(stream, headers, url, code)

    def http_request(self, req):
        """Apply std_headers and the Youtubedl-* control headers to req."""
        for name, value in std_headers.items():
            req.headers.pop(name, None)
            req.add_header(name, value)

        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']

        if 'Youtubedl-user-agent' in req.headers:
            req.headers.pop('User-agent', None)
            req.headers['User-agent'] = req.headers.pop('Youtubedl-user-agent')
        return req

    def http_response(self, req, resp):
        """Return resp, decoded when its Content-encoding is gzip or deflate."""
        original = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            raw = resp.read()
            try:
                plain = io.BytesIO(gzip.GzipFile(fileobj=io.BytesIO(raw), mode='rb').read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes removed.
                for trim in range(1, 1024):
                    try:
                        plain = io.BytesIO(
                            gzip.GzipFile(fileobj=io.BytesIO(raw[:-trim]), mode='rb').read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(plain, original.headers, original.url, original.code)
            resp.msg = original.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, original.headers, original.url, original.code)
            resp.msg = original.msg
        return resp

    # HTTPS traffic gets exactly the same treatment.
    https_request = http_request
    https_response = http_response
bf50b038 763
5de90176 764
912b38b4
PH
def parse_iso8601(date_str):
    """ Return a UNIX timestamp from the given date

    Accepts 'YYYY-MM-DDTHH:MM:SS', optionally followed by 'Z' or a
    '+HH:MM' / '-HHMM' style UTC offset. None is passed through.
    """
    if date_str is None:
        return None

    tz_match = re.search(
        r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
        date_str)

    # No suffix, or a bare 'Z', means the time is already UTC.
    offset = datetime.timedelta()
    if tz_match:
        date_str = date_str[:-len(tz_match.group(0))]
        if tz_match.group('sign'):
            direction = 1 if tz_match.group('sign') == '+' else -1
            offset = datetime.timedelta(
                hours=direction * int(tz_match.group('hours')),
                minutes=direction * int(tz_match.group('minutes')))

    local_time = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S')
    return calendar.timegm((local_time - offset).timetuple())
788
789
bf50b038
JMF
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    Returns None for None input or when no known format matches.
    """
    if date_str is None:
        return None

    # Commas become spaces and a trailing UTC offset is dropped
    # (%z is only supported in python>=3.2).
    cleaned = date_str.replace(',', ' ')
    cleaned = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', cleaned)

    known_formats = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]

    upload_date = None
    # Every format is tried; the last one that parses supplies the result.
    for fmt in known_formats:
        try:
            upload_date = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Last resort: RFC 2822 style dates.
        parsed = email.utils.parsedate_tz(cleaned)
        if parsed:
            upload_date = datetime.datetime(*parsed[:6]).strftime('%Y%m%d')
    return upload_date
830
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url.

    Takes the text after the last '.' of the part before the first '?'.
    If that is not purely alphanumeric, default_ext is returned.
    """
    before_query = url.partition(u'?')[0]
    candidate = before_query.rpartition(u'.')[2]
    if not re.match(r'^[A-Za-z0-9]+$', candidate):
        return default_ext
    return candidate
73e79f2a 837
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '.<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
840
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days. A string in
    neither form raises ValueError (from strptime).
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword names (days, weeks).
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
866
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.
    Strings that are not exactly eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
875
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a missing bound is open (earliest/latest date)."""
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day,day)

    def __contains__(self, date):
        """Check if the date is in the range (bounds inclusive)"""
        # Anything that is not a date object is parsed as a date string.
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
c496ca96
PH
901
902
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # A byte string result is decoded with the preferred encoding.
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
911
912
7459e3a2
PH
def write_string(s, out=None):
    """Write the unicode string s to out (default: sys.stderr) and flush.

    The text is encoded with the preferred encoding when out is a binary
    stream or when running on Python 2.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # Python 2 lies about mode of sys.stderr
    needs_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    payload = s.encode(preferredencoding(), 'ignore') if needs_bytes else s
    try:
        out.write(payload)
    except UnicodeEncodeError:
        # In Windows shells, this can fail even when the codec is just charmap!?
        # See https://wiki.python.org/moin/PrintFails#Issue
        if not (sys.platform == 'win32' and hasattr(out, 'encoding')):
            raise
        # Drop whatever the stream's own encoding cannot represent.
        out.write(payload.encode(out.encoding, 'ignore').decode(out.encoding))

    out.flush()
933
934
48ea9cea
PH
def bytes_to_intlist(bs):
    """ Return the elements of the byte string bs as a list of integers """
    if not bs:
        return []
    first = bs[0]
    if isinstance(first, int):  # Python 3
        return list(bs)
    # Indexing yielded one-character strings: convert each of them
    return [ord(ch) for ch in bs]
942
c257baff 943
def intlist_to_bytes(xs):
    """ Return the byte string made of the integers in xs """
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() already yields one-byte strings
    return ''.join(chr(x) for x in xs)
c38b1e77
PH
951
952
def get_cachedir(params=None):
    """ Return the path of the cache directory.

    params is an optional mapping of options; if it has a 'cachedir'
    entry, that value is returned as is.  Otherwise the result is the
    'youtube-dl' directory below $XDG_CACHE_HOME, with ~/.cache standing in
    for that variable when it is not set.
    """
    # None rather than a shared mutable {} as the default value
    if params is None:
        params = {}
    cache_root = os.environ.get(
        'XDG_CACHE_HOME', os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
c1c9a79c
PH
957
958
# Cross-platform file locking
# Both branches define _lock_file(f, exclusive) and _unlock_file(f) for an
# open file object f.
if sys.platform != 'win32':
    import fcntl

    def _lock_file(f, exclusive):
        operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.lockf(f, operation)

    def _unlock_file(f):
        fcntl.lockf(f, fcntl.LOCK_UN)

else:
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32

    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL

    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL

    # Low and high parts of the byte count passed to both calls
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        region = OVERLAPPED()
        region.Offset = 0
        region.OffsetHigh = 0
        region.hEvent = 0
        # Stored on the file object so that _unlock_file passes the very
        # same structure
        f._lock_file_overlapped_p = ctypes.pointer(region)
        handle = msvcrt.get_osfhandle(f.fileno())
        flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(
            handle, flags, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(
            handle, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1022
1023
class locked_file(object):
    """ An open file that is locked for as long as it is used in a with block.

    The file is opened when the object is created.  Entering the context
    locks it via _lock_file (exclusively unless the mode is 'r'); leaving
    the context unlocks and closes it.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The Windows _lock_file raises OSError, which on Python 2 is
            # not an IOError: catch both so that the file never stays open
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Close the file even if unlocking fails
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1053
1054
def shell_quote(args):
    """ Return the arguments in args as one shell-escaped command line.

    Every argument is quoted on its own and the results are joined with
    single spaces.  Byte strings are decoded with the filesystem encoding
    (UTF-8 if that is unknown) first.
    """
    # pipes.quote was never documented and the pipes module is gone as of
    # Python 3.13; shlex.quote is the supported spelling wherever it exists
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'

    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
9d4660ca
PH
1066
1067
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
        (the first element so that Not pred(e)) """
    for item in seq:
        # The element is handed out before it is tested, which is what
        # makes the first failing one part of the output
        yield item
        if pred(item):
            continue
        break
1075
1076
9d4660ca
PH
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The JSON-encoded data travels as a query-style pair after a '#'
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    return u'#'.join((url, compat_urllib_parse.urlencode(payload)))
1083
1084
def unsmuggle_url(smug_url, default=None):
    """ Split a URL into the plain URL and the data smuggled into it.

    Returns the tuple (url, data); for a URL without the smuggle marker
    this is (smug_url, default).
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # Everything after the last '#' is the encoded payload
    url, sdata = smug_url.rsplit(u'#', 1)
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
02dbf93f
PH
1092
1093
02dbf93f
PH
def format_bytes(bytes):
    """ Format a number of bytes with a binary unit suffix, e.g. u'1.50KiB'.

    bytes may be a number, a string holding a number, or None, for which
    u'N/A' is returned.  (The parameter name shadows the builtin inside
    this function; it is kept for callers passing it by keyword.)
    Negative values raise ValueError since their logarithm is undefined.
    """
    if bytes is None:
        return u'N/A'
    if isinstance(bytes, str):
        bytes = float(bytes)

    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Stay within the suffix table: tiny fractions would otherwise
        # index it from the end, and values of 1024 ** 9 or more would run
        # past it
        exponent = min(max(exponent, 0), len(suffixes) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffixes[exponent])
f53c966a 1106
1c088fa8 1107
f53c966a
JMF
def str_to_int(int_str):
    """ Parse an integer that may use ',' or '.' to group its digits.

    Returns None for None, in line with int_or_none; any other value
    that is not a number after removing the separators raises ValueError.
    """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.]', u'', int_str)
    return int(int_str)
1c088fa8
PH
1111
1112
def get_term_width():
    """ Return the width of the terminal in columns, or None if unknown.

    The COLUMNS environment variable takes precedence; otherwise the
    output of `stty size` is consulted.
    """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Malformed COLUMNS: fall back to asking stty

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, _ = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort only: stty may be missing or may print nothing usable.
        # Unlike a bare except, this lets KeyboardInterrupt and SystemExit
        # through.
        return None
caefb1de
PH
1127
1128
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    english_names = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    # Months are numbered from 1; an unknown name yields None
    for number, month in enumerate(english_names, 1):
        if month == name:
            return number
    return None
18258362
JMF
1139
1140
def fix_xml_ampersands(xml_str):
    """Replace the bare '&' characters in XML by '&amp;'

    An '&' that already starts one of the five predefined entities or a
    numeric character reference is left alone.  References may have up to
    6 hexadecimal or 7 decimal digits, which covers every code point up
    to U+10FFFF, so that references beyond the BMP are not escaped twice.
    """
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,6};|#[0-9]{,7};)',
        u'&amp;',
        xml_str)
e3946f98
PH
1147
1148
def setproctitle(title):
    """ Try to set the name of the current process to title.

    title must be a compat_str.  This is a best-effort operation: nothing
    happens if libc.so.6 cannot be loaded or does not offer prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one byte longer than
    # the title, i.e. NUL-terminated.  A buffer of exactly len(title_bytes)
    # leaves no terminator for the C side to stop at.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # NOTE(review): 15 should be PR_SET_NAME - see prctl(2)
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1162
1163
def remove_start(s, start):
    """ Return s without the prefix start; s is returned as is if it does
    not begin with start """
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1168
1169
def url_basename(url):
    """ Return the last segment of the path of url (u'' for an empty path) """
    trimmed = compat_urlparse.urlparse(url).path.strip(u'/')
    # Whatever follows the last remaining slash, or all of it without one
    return trimmed.rpartition(u'/')[2]
aa94a6d3
PH
1173
1174
class HEADRequest(compat_urllib_request.Request):
    """ A request whose HTTP method is always HEAD """

    def get_method(self):
        return 'HEAD'
7217e148
PH
1178
1179
dd27fd17
PH
def int_or_none(v, scale=1):
    """ Return v converted to an int and floor-divided by scale, or None if
    v is None """
    if v is None:
        return v
    return int(v) // scale
608d11f5
PH
1182
1183
def parse_duration(s):
    """ Parse a duration such as u'1:02:03', u'2m30s' or u'90' into seconds.

    Returns None if s is None or does not look like a duration.
    """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
    if not m:
        return None

    # Only the seconds are mandatory in the pattern
    hours, mins, secs = m.group('hours', 'mins', 'secs')
    seconds = int(secs)
    if mins:
        seconds += 60 * int(mins)
    if hours:
        seconds += 60 * 60 * int(hours)
    return seconds
91d7d0b3
JMF
1198
1199
def prepend_extension(filename, ext):
    """ Insert ext in front of the extension of filename, e.g.
    (u'abc.ext', u'temp') gives u'abc.temp.ext' """
    stem, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (stem, ext, real_ext)
d70ad093
PH
1203
1204
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version).
    Returns False if the binary could not be started. """
    # No mutable default value; list() also admits tuples and the like
    cmd = [exe] + list(args or [])
    try:
        subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1213
1214
class PagedList(object):
    """ A sequence of entries that is retrieved page by page on demand.

    pagefunc(pagenum) has to return an iterable with the entries of the
    page with the zero-based number pagenum.  A page with fewer than
    pagesize entries is taken to be the last one.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """ Return the entries from index start up to, but excluding, index
        end as a list.  With end=None everything from start on is returned.
        Only pages overlapping the requested range are fetched. """
        # Nothing to fetch for an empty or inverted range.  The end handling
        # below cannot express it: an end at or before start that falls on
        # the first fetched page is read as a cut-off within that page, so
        # entries would be returned and further pages fetched.
        if end is not None and end <= start:
            return []

        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted entry, if the range ends here
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1260
1261
def uppercase_escape(s):
    """ Replace each backslash-U escape with 8 hexadecimal digits in s by
    the character with that code point """
    def decode(match):
        return compat_chr(int(match.group(1), base=16))

    return re.sub(r'\\U([0-9a-fA-F]{8})', decode, s)
b53466e1
PH
1266
# struct_pack and struct_unpack behave like struct.pack and struct.unpack,
# but accept a text format specification on every interpreter
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def _bytes_spec(spec):
        return spec.encode('ascii') if isinstance(spec, compat_str) else spec

    def struct_pack(spec, *args):
        return struct.pack(_bytes_spec(spec), *args)

    def struct_unpack(spec, *args):
        return struct.unpack(_bytes_spec(spec), *args)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
62e609ab
PH
1283
1284
def read_batch_urls(batch_fd):
    """ Read the URLs listed one per line in the file object batch_fd.

    Lines may be text or UTF-8 encoded bytes.  Surrounding whitespace and a
    leading byte order mark are removed; empty lines and lines starting
    with '#', ';' or ']' are skipped.  batch_fd is closed afterwards.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 byte order mark is U+FEFF once decoded properly; it shows
        # up as the three characters EF BB BF when the file was read with a
        # single-byte encoding instead
        for bom in (u'\ufeff', u'\xef\xbb\xbf'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1299
1300
def urlencode_postdata(*args, **kargs):
    """ URL-encode the arguments like urlencode does and return the result
    as ASCII-encoded bytes """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1303
1304
def parse_xml(s):
    """ Parse the XML document in the text s and return its root element.
    Doctype declarations are ignored. """
    etree = xml.etree.ElementTree

    class DoctypeIgnoringBuilder(etree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = etree.XMLParser(target=DoctypeIgnoringBuilder())
    # The parser is only passed along from Python 2.7 on
    extra = {}
    if sys.version_info >= (2, 7):
        extra['parser'] = parser
    return etree.XML(s.encode('utf-8'), **extra)
e68301af
PH
1313
1314
# compat_getpass is getpass.getpass, except on Python 2 under Windows where
# a text prompt is encoded before it is handed over
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
a1a530b0
PH
1322
1323
# Number assigned to each US rating label
# NOTE(review): presumably the minimum viewer age - confirm against the users
# of this table
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
1331
1332
def strip_jsonp(code):
    """ Remove a JSONP wrapper from code and return what is inside of it.

    A wrapper is a callback name followed by the payload in parentheses
    and an optional semicolon.  The name may consist of letters, digits,
    '_', '.' and '$', which covers generated names like jQuery17_9 and
    namespaced ones like ns.handler.  Input that does not have this shape
    is returned unchanged.
    """
    return re.sub(
        r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*$', r'\1', code)