]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
release 2014.04.01.2
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
912b38b4 4import calendar
62e609ab 5import contextlib
e3946f98 6import ctypes
c496ca96
PH
7import datetime
8import email.utils
f45c185f 9import errno
e68301af 10import getpass
d77c3dfd 11import gzip
b7ab0590 12import itertools
03f9daab 13import io
f4bfd65f 14import json
d77c3dfd 15import locale
02dbf93f 16import math
d77c3dfd 17import os
4eb7f1d1 18import pipes
c496ca96 19import platform
d77c3dfd 20import re
13ebea79 21import ssl
c496ca96 22import socket
b53466e1 23import struct
1c088fa8 24import subprocess
d77c3dfd 25import sys
01951dda 26import traceback
bcf89ce6 27import xml.etree.ElementTree
d77c3dfd 28import zlib
d77c3dfd 29
01ba00ca 30try:
59ae15a5 31 import urllib.request as compat_urllib_request
01ba00ca 32except ImportError: # Python 2
59ae15a5 33 import urllib2 as compat_urllib_request
01ba00ca
PH
34
35try:
59ae15a5 36 import urllib.error as compat_urllib_error
01ba00ca 37except ImportError: # Python 2
59ae15a5 38 import urllib2 as compat_urllib_error
01ba00ca
PH
39
40try:
59ae15a5 41 import urllib.parse as compat_urllib_parse
01ba00ca 42except ImportError: # Python 2
59ae15a5 43 import urllib as compat_urllib_parse
01ba00ca 44
799c0763
PH
45try:
46 from urllib.parse import urlparse as compat_urllib_parse_urlparse
47except ImportError: # Python 2
48 from urlparse import urlparse as compat_urllib_parse_urlparse
49
6543f0dc
JMF
50try:
51 import urllib.parse as compat_urlparse
52except ImportError: # Python 2
53 import urlparse as compat_urlparse
54
01ba00ca 55try:
59ae15a5 56 import http.cookiejar as compat_cookiejar
01ba00ca 57except ImportError: # Python 2
59ae15a5 58 import cookielib as compat_cookiejar
01ba00ca 59
3e669f36 60try:
59ae15a5 61 import html.entities as compat_html_entities
9f37a959 62except ImportError: # Python 2
59ae15a5 63 import htmlentitydefs as compat_html_entities
3e669f36 64
a8156c1d 65try:
59ae15a5 66 import html.parser as compat_html_parser
9f37a959 67except ImportError: # Python 2
59ae15a5 68 import HTMLParser as compat_html_parser
a8156c1d 69
348d0a7a 70try:
59ae15a5 71 import http.client as compat_http_client
9f37a959 72except ImportError: # Python 2
59ae15a5 73 import httplib as compat_http_client
348d0a7a 74
2eabb802 75try:
0e283428 76 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
77except ImportError: # Python 2
78 from urllib2 import HTTPError as compat_HTTPError
79
e0df6211
PH
80try:
81 from urllib.request import urlretrieve as compat_urlretrieve
82except ImportError: # Python 2
83 from urllib import urlretrieve as compat_urlretrieve
84
85
5910e210
PH
86try:
87 from subprocess import DEVNULL
88 compat_subprocess_get_DEVNULL = lambda: DEVNULL
89except ImportError:
90 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
91
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError: # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string by the characters they encode.

        Runs of consecutive escapes are collected as bytes and decoded
        together with the given encoding and error handler.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            # No '%' at all: nothing to decode
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # Python 2 only: str.decode('hex') turns two hex digits into one byte
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except ValueError:
                # Keep the segment literally, restoring the '%' removed by split()
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) unicode pairs.

        Fields are separated by '&' or ';'. Fields with an empty value are
        dropped unless keep_blank_values is set; with strict_parsing, a
        field without '=' raises ValueError.
        """
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                # '+' encodes a space in query strings; replace before unquoting
                name = nv[0].replace('+', ' ')
                name = _unquote(name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = _unquote(value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to the list of
        its values, in order of appearance."""
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
348d0a7a 170
3e669f36 171try:
59ae15a5 172 compat_str = unicode # Python 2
3e669f36 173except NameError:
59ae15a5 174 compat_str = str
3e669f36
PH
175
176try:
59ae15a5 177 compat_chr = unichr # Python 2
3e669f36 178except NameError:
59ae15a5 179 compat_chr = chr
3e669f36 180
f7300c5c
JMF
181try:
182 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
183except ImportError: # Python 2.6
184 from xml.parsers.expat import ExpatError as compat_xml_parse_error
185
b31756c1
FV
def compat_ord(c):
    """Return the integer code of c.

    Values that are already plain ints are returned unchanged; anything
    else is passed to ord().
    """
    return c if type(c) is int else ord(c)
189
468e2e92
FV
190# This is not clearly defined otherwise
191compiled_regex_type = type(re.compile(''))
192
3e669f36 193std_headers = {
ae8f7871 194 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
59ae15a5
PH
195 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
196 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
197 'Accept-Encoding': 'gzip, deflate',
198 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 199}
f427df17 200
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails or names a codec that cannot encode text, 'UTF-8' is
    returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported codec actually exists and works
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (any lookup/codec failure), but unlike a bare
        # except it lets KeyboardInterrupt and SystemExit propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 214
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string s to standard output."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print s encoded with the preferred encoding; characters the
        encoding cannot represent become XML character references."""
        encoded = s.encode(preferredencoding(), 'xmlcharrefreplace')
        print(encoded)
d77c3dfd 222
f4bfd65f
PH
# json.dump writes bytes on Python 2.x but text on Python 3.x, so the
# target file has to be opened in the matching mode.
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (UTF-8 text)."""
        with open(fn, 'w', encoding='utf-8') as json_file:
            json.dump(obj, json_file)
else:
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file named fn (binary mode)."""
        with open(fn, 'wb') as json_file:
            json.dump(obj, json_file)
233
59ae56fa
PH
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """Return the first element matching xpath whose attribute key
        equals val, or None. Filters manually instead of using an
        attribute predicate."""
        for candidate in node.findall(xpath):
            if candidate.attrib.get(key) == val:
                return candidate
        return None
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are spliced into the expression, so only accept
        # characters that cannot break out of the predicate.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
247
d7e66d39
JMF
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand every 'prefix:tag' step of path to '{uri}tag' using ns_map.

    Steps without a prefix are kept as they are. An unknown prefix raises
    KeyError.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, local_name = pieces
        return '{%s}%s' % (ns_map[prefix], local_name)

    return '/'.join([expand(step) for step in path.split('/')])
260
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities, decimal (#47) and hexadecimal (#x2F / #X2F) references
    are decoded; anything else, including numeric references outside the
    valid code point range, is returned in its literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hex references may contain a-f digits, which \d alone does not match
    mobj = re.match(r'(?u)#([xX][0-9a-fA-F]+|\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Not a representable code point: fall through to the literal form
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 285
a8156c1d 286compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that keeps a reference to the document it parses.

    The document is stored in self.html (None until loads() is called).
    """
    def __init__(self):
        # Was misspelled "__init", so it never ran as the constructor and
        # self.html stayed undefined until loads() was called.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Remember html and run it through the parser to completion."""
        self.html = html
        self.feed(html)
        self.close()
296
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Grows to [tag, start position, end position] as parsing proceeds;
        # positions are the tuples returned by getpos()
        self.result = None
        # True while the parser is inside the matched element
        self.started = False
        # Per-tag-name count of start tags seen minus end tags seen since the match
        self.depth = {}
        # True between the matching start tag and the next parser event
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Try to recover from a parse error by skipping ahead in the
        document; give up after more than 10 errors or once the wanted
        element has started."""
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested start tag is also the first event after the opening tag
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The matched element is closed once its own tag name is balanced
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every kind of content event can be the first one after the opening tag
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded start and end
        positions, or None unless exactly tag, start and end were recorded."""
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        # Slice out the lines from the start position's line through the
        # end position's line (the -1 implies 1-based line numbers)
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            # Start and end on the same line: the end column must be
            # shifted by the prefix that was just cut off
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
3b024e17
PH
356# Hack for https://github.com/rg3/youtube-dl/issues/662
357if sys.version_info < (2, 7, 3):
358 AttrParser.parse_endtag = (lambda self, i:
359 i + len("</scr'+'ipt>")
360 if self.rawdata[i:].startswith("</scr'+'ipt>")
361 else compat_html_parser.HTMLParser.parse_endtag(self, i))
9e6dd238
FV
362
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the
    passed HTML document (delegates to get_element_by_attribute)."""
    return get_element_by_attribute(attribute="id", value=id, html=html)
366
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: report whatever was collected before the error
        pass
    return finder.get_result()
9e6dd238 375
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that picks up the content attribute of the meta
    tag whose name attribute matches the requested name.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attr_map = dict(attrs)
            if attr_map.get('name') == self.name:
                # A later matching meta tag overrides an earlier one
                self.result = attr_map.get('content')

    def get_result(self):
        """Return the content found so far, or None."""
        return self.result
396
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    meta_finder = MetaParser(name)
    try:
        meta_finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: use whatever was found before the parse error
        pass
    return meta_finder.get_result()
407
9e6dd238
FV
408
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Source newlines carry no meaning in HTML; real line breaks come from
    # <br> tags and paragraph boundaries.
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
420
421
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it retries once
    with the characters forbidden on win32 replaced by '#' in the final
    path component. If that also fails, or if there was nothing to
    replace, the exception is raised like the standard open() function
    would. Permission errors (EACCES) are re-raised immediately.

    The filename u'-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # Only the final component is rewritten: the directory part holds
        # path separators (and possibly a drive colon) that must survive.
        dir_part, base_part = os.path.split(filename)
        alt_filename = os.path.join(
            dir_part,
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', base_part))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
455
456
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 464
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            # Punctuation, whitespace and non-ASCII all collapse to '_'
            if char in '!&\'()[]{}$;`^,#' or char.isspace():
                return '_'
            if code > 127:
                return '_'
        return char

    result = u''.join([replace_insane(char) for char in s])
    if is_id:
        # IDs are kept verbatim apart from the per-character replacement
        return result

    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if not result:
        result = '_'
    return result
d77c3dfd
FV
496
def orderedSet(iterable):
    """ Return a list of the elements of iterable with duplicates removed,
    keeping the first occurrence of each (elements need not be hashable) """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 504
912b38b4 505
d77c3dfd 506def unescapeHTML(s):
912b38b4
PH
507 if s is None:
508 return None
509 assert type(s) == compat_str
d77c3dfd 510
912b38b4 511 result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
59ae15a5 512 return result
d77c3dfd 513
8bf48f23
PH
514
def encodeFilename(s, for_subprocess=False):
    """
    Return the filename s in the form the platform's file APIs expect.

    @param s                The name of the file
    @param for_subprocess   Set if the result is passed to a subprocess call
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up can take u'' directly through the Unicode APIs.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows:
        if not for_subprocess:
            return s
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()

    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
541
8271226a
PH
def decodeOption(optval):
    """Return the option value as text.

    None is returned unchanged; bytes are decoded with the preferred
    encoding; anything that is not text afterwards fails the assertion.
    """
    if optval is None:
        return optval

    text = optval
    if isinstance(text, bytes):
        text = text.decode(preferredencoding())

    assert isinstance(text, compat_str)
    return text
1c256f70 550
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The boundaries are inclusive: exactly one hour renders as '1:00:00'
    and exactly one minute as '1:00' (they used to come out as '60:00'
    and '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
558
a0ddb8a2
PH
559
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build the urllib HTTPS handler used for all HTTPS requests.

    opts_no_check_certificate -- if true, server certificates are not
                                 verified (Python >= 3.2 branch only)
    kwargs                    -- passed through to the handler constructor
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Try SSLv3 first, then fall back to protocol negotiation
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:
        # PROTOCOL_SSLv23 negotiates the highest protocol version both
        # sides support. Pinning PROTOCOL_SSLv3 is insecure and raises
        # AttributeError on ssl builds that no longer ship SSLv3.
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
ea6d901e 593
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        An error created while a URLError, socket.timeout or
        UnavailableVideoError is being handled is always treated as expected.
        """
        expected_causes = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_causes:
            expected = True
        if not expected:
            # Unexpected errors are probably bugs: ask the user to report them
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
615
1c256f70 616
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression did not match."""
620
621
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may raise this when they are not configured
    to continue on errors. It carries the appropriate error message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
633
634
class SameFileError(Exception):
    """Same File exception.

    Raised by FileDownloader objects when they detect that multiple
    files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
642
643
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this to signal an error
    in the postprocessing task. The message is available as .msg.
    """
    def __init__(self, msg):
        self.msg = msg
d77c3dfd
FV
652
class MaxDownloadsReached(Exception):
    """ Raised when the --max-downloads limit has been reached. """
d77c3dfd
FV
656
657
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Raised when a video is requested in a format that is not
    available for that video.
    """
d77c3dfd
FV
665
666
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this when a downloaded file is
    smaller than what the server announced first, indicating the
    connection was probably interrupted.
    """
    # Class-level defaults; both values are byte counts
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 681
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    When installed with an OpenerDirector, this class automatically adds
    the standard headers to every HTTP request and transparently decodes
    gzipped and deflated responses. To avoid compression for a particular
    request, the calling code only has to add the HTTP header
    "Youtubedl-No-Compression", which is removed before the real request
    is made. Likewise, a "Youtubedl-user-agent" header replaces the
    User-agent header and is then removed.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress data as a raw deflate stream, falling back to a
        zlib-wrapped stream."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code, also on
        versions whose addinfourl does not accept it."""
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        wrapped = addinfourl(stream, headers, url)
        wrapped.code = code
        return wrapped

    def http_request(self, req):
        for name, value in std_headers.items():
            if name in req.headers:
                del req.headers[name]
            req.add_header(name, value)

        headers = req.headers
        if 'Youtubedl-no-compression' in headers:
            if 'Accept-encoding' in headers:
                del headers['Accept-encoding']
            del headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in headers:
            if 'User-agent' in headers:
                del headers['User-agent']
            headers['User-agent'] = headers['Youtubedl-user-agent']
            del headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        original = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            payload = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(payload), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off
                for trim in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(payload[:-trim]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, original.headers, original.url, original.code)
            resp.msg = original.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, original.headers, original.url, original.code)
            resp.msg = original.msg
        return resp

    # HTTPS traffic goes through exactly the same processing
    https_request = http_request
    https_response = http_response
bf50b038 762
5de90176 763
912b38b4
PH
def parse_iso8601(date_str):
    """ Return a UNIX timestamp from the given date

    date_str has the form YYYY-MM-DDTHH:MM:SS, optionally followed by 'Z'
    or a +HH:MM / +HHMM style UTC offset. None is passed through.
    """
    if date_str is None:
        return None

    tz_match = re.search(
        r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
        date_str)
    # No suffix and a bare 'Z' both mean a zero offset
    offset = datetime.timedelta()
    if tz_match:
        date_str = date_str[:-len(tz_match.group(0))]
        if tz_match.group('sign'):
            direction = 1 if tz_match.group('sign') == '+' else -1
            offset = datetime.timedelta(
                hours=direction * int(tz_match.group('hours')),
                minutes=direction * int(tz_match.group('minutes')))

    local_dt = datetime.datetime.strptime(date_str, '%Y-%m-%dT%H:%M:%S')
    utc_dt = local_dt - offset
    return calendar.timegm(utc_dt.timetuple())
787
788
bf50b038
JMF
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    Returns None for None input or when no known format matches.
    """
    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]

    upload_date = None
    # Every format is tried; the last one that parses wins
    for fmt in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Last resort: the RFC 2822 date parser
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
829
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url.

    The guess is whatever follows the last '.' before any '?'; it is only
    accepted when purely alphanumeric, otherwise default_ext is returned.
    """
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    if not re.match(r'^[A-Za-z0-9]+$', candidate):
        return default_ext
    return candidate
73e79f2a 836
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with everything after its last '.' replaced by
    '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return stem + u'.' + sub_lang + u'.' + sub_format
839
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days. Strings in
    neither form raise ValueError (from strptime).
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # timedelta has no month/year units; approximate them in days
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
865
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.
    Strings that are not eight digits are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
874
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a missing bound defaults to the earliest or
        latest representable date"""
        if start is None:
            self.start = datetime.datetime.min.date()
        else:
            self.start = date_from_str(start)
        if end is None:
            self.end = datetime.datetime.max.date()
        else:
            self.end = date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range (both ends inclusive)"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date and date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
900
901
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Decode byte strings with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
910
911
7459e3a2
PH
def write_string(s, out=None):
    """Write the text string s to out (sys.stderr by default) and flush.

    The text is encoded with the preferred encoding first when out is
    opened in binary mode or when running on Python 2.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    binary_stream = 'b' in getattr(out, 'mode', '')
    # Python 2 lies about mode of sys.stderr
    if binary_stream or sys.version_info[0] < 3:
        s = s.encode(preferredencoding(), 'ignore')
    try:
        out.write(s)
    except UnicodeEncodeError:
        # In Windows shells, this can fail even when the codec is just charmap!?
        # See https://wiki.python.org/moin/PrintFails#Issue
        if not (sys.platform == 'win32' and hasattr(out, 'encoding')):
            raise
        # Drop whatever the stream's own encoding cannot represent
        s = s.encode(out.encoding, 'ignore').decode(out.encoding)
        out.write(s)

    out.flush()
932
933
48ea9cea
PH
def bytes_to_intlist(bs):
    """ Convert a byte string into the list of its integer byte values """
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Indexing yields 1-character strings (Python 2)
        return [ord(c) for c in bs]
    # Indexing already yields integers (Python 3)
    return list(bs)
941
c257baff 942
def intlist_to_bytes(xs):
    """ Convert a list of integer byte values into a byte string """
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        # chr() produces text here (Python 3): build bytes directly
        return bytes(xs)
    # chr() produces byte strings here (Python 2)
    return ''.join([chr(x) for x in xs])
c38b1e77
PH
950
951
def get_cachedir(params={}):
    """ Return the cache directory: params['cachedir'] if given, otherwise
    the 'youtube-dl' directory below $XDG_CACHE_HOME (or ~/.cache). """
    home_cache = os.path.expanduser('~/.cache')
    cache_root = os.environ.get('XDG_CACHE_HOME', home_cache)
    fallback = os.path.join(cache_root, 'youtube-dl')
    return params.get('cachedir', fallback)
c1c9a79c
PH
956
957
958# Cross-platform file locking
if sys.platform != 'win32':
    import fcntl

    def _lock_file(f, exclusive):
        """ Lock f with fcntl.lockf: exclusively if requested, shared otherwise """
        operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.lockf(f, operation)

    def _unlock_file(f):
        """ Release the lock taken by _lock_file """
        fcntl.lockf(f, fcntl.LOCK_UN)

else:
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32

    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL

    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL

    # Length (low/high DWORD) of the byte range that gets locked
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """ Lock f through LockFileEx; raises OSError on failure """
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can pass it back
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(
            handle, flags, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """ Unlock f through UnlockFileEx; raises OSError on failure """
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(
            handle, 0,
            whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1021
1022
class locked_file(object):
    """ Context manager that opens filename and keeps it locked while
    inside the with block: a shared lock for mode 'r', an exclusive one
    for 'a' and 'w'. The file is closed when the block is left. """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except (IOError, OSError):
            # The win32 _lock_file raises OSError, which is not an IOError
            # on Python 2; catch both so the file never stays open when
            # locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1052
1053
def shell_quote(args):
    """ Return the list args as one shell-escaped command line string.

    Byte string items are decoded with the filesystem encoding (utf-8 if
    that is unknown) before being quoted. """
    # pipes.quote is an undocumented alias and the pipes module is
    # deprecated, so prefer the documented shlex.quote where it exists.
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'

    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return u' '.join(quoted_args)
9d4660ca
PH
1065
1066
f4d96df0
PH
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but also yield the element that stopped
    the iteration (the first element e for which pred(e) is false) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
1074
1075
9d4660ca
PH
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is serialized as JSON and appended to url as a URL-encoded
    '__youtubedl_smuggle' fragment. """
    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    return url + u'#' + compat_urllib_parse.urlencode(payload)
1082
1083
def unsmuggle_url(smug_url, default=None):
    """ Split a URL built by smuggle_url into (url, data).

    URLs without smuggled data are returned as (smug_url, default). """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    parts = smug_url.rpartition(u'#')
    query = compat_parse_qs(parts[2])
    data = json.loads(query[u'__youtubedl_smuggle'][0])
    return parts[0], data
02dbf93f
PH
1091
1092
02dbf93f
PH
def format_bytes(bytes):
    """ Format a byte count with a binary unit suffix, e.g. u'1.50KiB'.
    None yields u'N/A'; a str is converted with float() first. """
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)

    # math.log is undefined for 0, which is plain bytes anyway
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffixes = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
f53c966a 1105
1c088fa8 1106
f53c966a
JMF
def str_to_int(int_str):
    """ Parse an integer written with ',' or '.' as digit group separators """
    return int(re.sub(r'[,\.]', u'', int_str))
1c088fa8
PH
1110
1111
def get_term_width():
    """ Return the terminal width in columns, or None if it is unknown.

    The COLUMNS environment variable is used when it holds a number;
    otherwise the output of `stty size` is consulted. """
    columns = os.environ.get('COLUMNS', None)
    if columns:
        try:
            return int(columns)
        except ValueError:
            pass  # Malformed COLUMNS, ask the terminal instead

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out = sp.communicate()[0]
        return int(out.split()[1])
    except Exception:
        # Best effort only (no stty, no tty, unexpected output, ...), but
        # do not swallow KeyboardInterrupt/SystemExit
        pass
    return None
caefb1de
PH
1126
1127
def month_by_name(name):
    """ Return the number (1-12) of a month given its English name,
    independently of the locale; None if name is not a month name """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name not in ENGLISH_NAMES:
        return None
    return ENGLISH_NAMES.index(name) + 1
18258362
JMF
1138
1139
def fix_xml_ampersands(xml_str):
    """Replace by '&amp;' every '&' in XML that does not already start one
    of the predefined entities or a numeric character reference"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, u'&amp;', xml_str)
e3946f98
PH
1146
1147
def setproctitle(title):
    """ Set the process name to title (a compat_str) through prctl of
    libc.so.6. Does nothing if that library or prctl is unavailable. """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initializing the buffer from the bytes (instead of sizing it to
    # len(title_bytes)) makes ctypes append the NUL terminator that
    # prctl needs to find the end of the name.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1161
1162
def remove_start(s, start):
    """ Return s without the prefix start (s unchanged if it has no such prefix) """
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1167
1168
def url_basename(url):
    """ Return the last component of the path of url """
    parsed = compat_urlparse.urlparse(url)
    return parsed.path.strip(u'/').rpartition(u'/')[2]
aa94a6d3
PH
1172
1173
class HEADRequest(compat_urllib_request.Request):
    """ Request whose HTTP method is always HEAD """

    def get_method(self):
        return 'HEAD'
7217e148
PH
1177
1178
dd27fd17
PH
def int_or_none(v, scale=1):
    """ Return int(v) floor-divided by scale, or None if v is None """
    if v is None:
        return v
    return int(v) // scale
608d11f5
PH
1181
1182
43f775e4
PH
def float_or_none(v, scale=1):
    """ Return float(v) divided by scale, or None if v is None """
    if v is None:
        return v
    return float(v) / scale
1185
1186
608d11f5
PH
def parse_duration(s):
    """ Convert a duration such as '1:02:03', '2m30s' or '45' into a number
    of seconds. Returns None for None or for an unrecognized string. """
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?$', s)
    if not m:
        return None

    total = int(m.group('secs'))
    # The minutes and hours parts are optional
    for unit, seconds_per_unit in (('mins', 60), ('hours', 60 * 60)):
        value = m.group(unit)
        if value:
            total += int(value) * seconds_per_unit
    return total
91d7d0b3
JMF
1201
1202
def prepend_extension(filename, ext):
    """ Insert ext in front of the real extension of filename,
    e.g. ('video.mp4', 'temp') gives 'video.temp.mp4' """
    parts = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(parts[0], ext, parts[1])
d70ad093
PH
1206
1207
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name
    (False if it could not be started).
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1216
1217
1218class PagedList(object):
1219 def __init__(self, pagefunc, pagesize):
1220 self._pagefunc = pagefunc
1221 self._pagesize = pagesize
1222
dd26ced1
PH
1223 def __len__(self):
1224 # This is only useful for tests
1225 return len(self.getslice())
1226
b7ab0590
PH
1227 def getslice(self, start=0, end=None):
1228 res = []
1229 for pagenum in itertools.count(start // self._pagesize):
1230 firstid = pagenum * self._pagesize
1231 nextfirstid = pagenum * self._pagesize + self._pagesize
1232 if start >= nextfirstid:
1233 continue
1234
1235 page_results = list(self._pagefunc(pagenum))
1236
1237 startv = (
1238 start % self._pagesize
1239 if firstid <= start < nextfirstid
1240 else 0)
1241
1242 endv = (
1243 ((end - 1) % self._pagesize) + 1
1244 if (end is not None and firstid <= end <= nextfirstid)
1245 else None)
1246
1247 if startv != 0 or endv is not None:
1248 page_results = page_results[startv:endv]
1249 res.extend(page_results)
1250
1251 # A little optimization - if current page is not "full", ie. does
1252 # not contain page_size videos then we can assume that this page
1253 # is the last one - there are no more ids on further pages -
1254 # i.e. no need to query again.
1255 if len(page_results) + startv < self._pagesize:
1256 break
1257
1258 # If we got the whole page, but the next page is not interesting,
1259 # break out early as well
1260 if end == nextfirstid:
1261 break
1262 return res
81c2f20b
PH
1263
1264
def uppercase_escape(s):
    """ Replace every \\UXXXXXXXX escape (8 hex digits) in s by the
    character with that code point """
    def decode_escape(m):
        return compat_chr(int(m.group(1), 16))

    return re.sub(r'\\U([0-9a-fA-F]{8})', decode_escape, s)
b53466e1
PH
1269
# struct_pack/struct_unpack behave like struct.pack/struct.unpack, but
# accept a text format specification on every supported Python version
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        byte_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(byte_spec, *args)

    def struct_unpack(spec, *args):
        byte_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(byte_spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1286
1287
def read_batch_urls(batch_fd):
    """ Read the URLs of a batch file and close it.

    batch_fd is an iterable of lines (text, or bytes which are decoded as
    UTF-8). Returns the list of stripped, non-empty lines that do not
    start with '#', ';' or ']'. """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM shows up as U+FEFF once decoded properly, or as the
        # three characters below when the bytes were read as Latin-1
        for bom in (u'\ufeff', u'\xef\xbb\xbf'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1302
1303
def urlencode_postdata(*args, **kargs):
    """ URL-encode the given data like urlencode, returning ASCII bytes """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1306
1307
def parse_xml(s):
    """ Parse the XML document s (text) and return its root element;
    doctype declarations are ignored """
    class DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    etree = xml.etree.ElementTree
    parser = etree.XMLParser(target=DoctypeIgnoringBuilder())
    data = s.encode('utf-8')
    # The custom parser is only passed on Python 2.7 and newer
    if sys.version_info >= (2, 7):
        return etree.XML(data, parser=parser)
    return etree.XML(data)
e68301af
PH
1316
1317
# compat_getpass: getpass.getpass, except on Python 2 under Windows where
# a text prompt is first encoded with the preferred encoding
if sys.version_info >= (3, 0) or sys.platform != 'win32':
    compat_getpass = getpass.getpass
else:
    def compat_getpass(prompt, *args, **kwargs):
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
a1a530b0
PH
1325
1326
# Number associated with each US film rating label
# (presumably a minimum viewer age - TODO confirm against the users)
US_RATINGS = {
    'G': 0, 'PG': 10, 'PG-13': 13,
    'R': 16, 'NC': 18,
}
fac55558
PH
1334
1335
def strip_jsonp(code):
    """ Strip a JSONP wrapper 'callback(...);' and return what is between
    the parentheses; code that is not wrapped is returned unchanged """
    jsonp_wrapper = r'(?s)^[a-zA-Z_]+\s*\(\s*(.*)\);\s*?\s*$'
    return re.sub(jsonp_wrapper, r'\1', code)