1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import calendar
5 import codecs
6 import contextlib
7 import ctypes
8 import datetime
9 import email.utils
10 import errno
11 import getpass
12 import gzip
13 import itertools
14 import io
15 import json
16 import locale
17 import math
18 import os
19 import pipes
20 import platform
21 import re
22 import ssl
23 import socket
24 import struct
25 import subprocess
26 import sys
27 import tempfile
28 import traceback
29 import xml.etree.ElementTree
30 import zlib
31
32 try:
33 import urllib.request as compat_urllib_request
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_request
36
37 try:
38 import urllib.error as compat_urllib_error
39 except ImportError: # Python 2
40 import urllib2 as compat_urllib_error
41
42 try:
43 import urllib.parse as compat_urllib_parse
44 except ImportError: # Python 2
45 import urllib as compat_urllib_parse
46
47 try:
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49 except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
51
52 try:
53 import urllib.parse as compat_urlparse
54 except ImportError: # Python 2
55 import urlparse as compat_urlparse
56
57 try:
58 import http.cookiejar as compat_cookiejar
59 except ImportError: # Python 2
60 import cookielib as compat_cookiejar
61
62 try:
63 import html.entities as compat_html_entities
64 except ImportError: # Python 2
65 import htmlentitydefs as compat_html_entities
66
67 try:
68 import html.parser as compat_html_parser
69 except ImportError: # Python 2
70 import HTMLParser as compat_html_parser
71
72 try:
73 import http.client as compat_http_client
74 except ImportError: # Python 2
75 import httplib as compat_http_client
76
77 try:
78 from urllib.error import HTTPError as compat_HTTPError
79 except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
81
82 try:
83 from urllib.request import urlretrieve as compat_urlretrieve
84 except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
86
87
88 try:
89 from subprocess import DEVNULL
90 compat_subprocess_get_DEVNULL = lambda: DEVNULL
91 except ImportError:
92 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
93
94 try:
95 from urllib.parse import unquote as compat_urllib_parse_unquote
96 except ImportError:
97 def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
98 if string == '':
99 return string
100 res = string.split('%')
101 if len(res) == 1:
102 return string
103 if encoding is None:
104 encoding = 'utf-8'
105 if errors is None:
106 errors = 'replace'
107 # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
108 pct_sequence = b''
109 string = res[0]
110 for item in res[1:]:
111 try:
112 if not item:
113 raise ValueError
114 pct_sequence += item[:2].decode('hex')
115 rest = item[2:]
116 if not rest:
117 # This segment was just a single percent-encoded character.
118 # May be part of a sequence of code units, so delay decoding.
119 # (Stored in pct_sequence).
120 continue
121 except ValueError:
122 rest = '%' + item
123 # Encountered non-percent-encoded characters. Flush the current
124 # pct_sequence.
125 string += pct_sequence.decode(encoding, errors) + rest
126 pct_sequence = b''
127 if pct_sequence:
128 # Flush the final pct_sequence
129 string += pct_sequence.decode(encoding, errors)
130 return string
131
132
133 try:
134 from urllib.parse import parse_qs as compat_parse_qs
135 except ImportError: # Python 2
136 # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
137 # Python 2's version is apparently totally broken
138
139 def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
140 encoding='utf-8', errors='replace'):
141 qs, _coerce_result = qs, unicode
142 pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
143 r = []
144 for name_value in pairs:
145 if not name_value and not strict_parsing:
146 continue
147 nv = name_value.split('=', 1)
148 if len(nv) != 2:
149 if strict_parsing:
150 raise ValueError("bad query field: %r" % (name_value,))
151 # Handle case of a control-name with no equal sign
152 if keep_blank_values:
153 nv.append('')
154 else:
155 continue
156 if len(nv[1]) or keep_blank_values:
157 name = nv[0].replace('+', ' ')
158 name = compat_urllib_parse_unquote(
159 name, encoding=encoding, errors=errors)
160 name = _coerce_result(name)
161 value = nv[1].replace('+', ' ')
162 value = compat_urllib_parse_unquote(
163 value, encoding=encoding, errors=errors)
164 value = _coerce_result(value)
165 r.append((name, value))
166 return r
167
168 def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
169 encoding='utf-8', errors='replace'):
170 parsed_result = {}
171 pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
172 encoding=encoding, errors=errors)
173 for name, value in pairs:
174 if name in parsed_result:
175 parsed_result[name].append(value)
176 else:
177 parsed_result[name] = [value]
178 return parsed_result
179
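# Usage sketch (illustrative, made-up query string): repeated keys are grouped into
# lists, and blank values are dropped unless keep_blank_values is set:
#
#     compat_parse_qs(u'v=abc&t=1m30s&t=2m')
#     # -> {u'v': [u'abc'], u't': [u'1m30s', u'2m']}
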
180 try:
181 compat_str = unicode # Python 2
182 except NameError:
183 compat_str = str
184
185 try:
186 compat_chr = unichr # Python 2
187 except NameError:
188 compat_chr = chr
189
190 try:
191 from xml.etree.ElementTree import ParseError as compat_xml_parse_error
192 except ImportError: # Python 2.6
193 from xml.parsers.expat import ExpatError as compat_xml_parse_error
194
195 try:
196 from shlex import quote as shlex_quote
197 except ImportError: # Python < 3.3
198 def shlex_quote(s):
199 return "'" + s.replace("'", "'\"'\"'") + "'"
200
201
202 def compat_ord(c):
203 # bytes iteration yields ints on Python 3 but 1-char strings on Python 2
204 return c if type(c) is int else ord(c)
205
206 # This is not clearly defined otherwise
207 compiled_regex_type = type(re.compile(''))
208
209 std_headers = {
210 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
211 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
212 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
213 'Accept-Encoding': 'gzip, deflate',
214 'Accept-Language': 'en-us,en;q=0.5',
215 }
216
217 def preferredencoding():
218 """Get preferred encoding.
219
220 Returns the best encoding scheme for the system, based on
221 locale.getpreferredencoding() and some further tweaks.
222 """
223 try:
224 pref = locale.getpreferredencoding()
225 u'TEST'.encode(pref)
226 except:
227 pref = 'UTF-8'
228
229 return pref
230
231 if sys.version_info < (3,0):
232 def compat_print(s):
233 print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
234 else:
235 def compat_print(s):
236 assert type(s) == type(u'')
237 print(s)
238
239
240 def write_json_file(obj, fn):
241 """ Encode obj as JSON and write it to fn, atomically """
242
243 args = {
244 'suffix': '.tmp',
245 'prefix': os.path.basename(fn) + '.',
246 'dir': os.path.dirname(fn),
247 'delete': False,
248 }
249
250 # In Python 2.x, json.dump expects a bytestream.
251 # In Python 3.x, it writes to a character stream
252 if sys.version_info < (3, 0):
253 args['mode'] = 'wb'
254 else:
255 args.update({
256 'mode': 'w',
257 'encoding': 'utf-8',
258 })
259
260 tf = tempfile.NamedTemporaryFile(**args)
261
262 try:
263 with tf:
264 json.dump(obj, tf)
265 os.rename(tf.name, fn)
266 except:
267 try:
268 os.remove(tf.name)
269 except OSError:
270 pass
271 raise
272
273
274 if sys.version_info >= (2, 7):
275 def find_xpath_attr(node, xpath, key, val):
276 """ Find the xpath xpath[@key=val] """
277 assert re.match(r'^[a-zA-Z-]+$', key)
278 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
279 expr = xpath + u"[@%s='%s']" % (key, val)
280 return node.find(expr)
281 else:
282 def find_xpath_attr(node, xpath, key, val):
283 for f in node.findall(xpath):
284 if f.attrib.get(key) == val:
285 return f
286 return None
287
288 # On python2.6 the xml.etree.ElementTree.Element methods don't support
289 # the namespace parameter
290 def xpath_with_ns(path, ns_map):
291 components = [c.split(':') for c in path.split('/')]
292 replaced = []
293 for c in components:
294 if len(c) == 1:
295 replaced.append(c[0])
296 else:
297 ns, tag = c
298 replaced.append('{%s}%s' % (ns_map[ns], tag))
299 return '/'.join(replaced)
300
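# Usage sketch (illustrative; the namespace mapping below is just an example):
#
#     xpath_with_ns('media:group/media:title', {'media': 'http://search.yahoo.com/mrss/'})
#     # -> '{http://search.yahoo.com/mrss/}group/{http://search.yahoo.com/mrss/}title'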
301
302 compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
303 class BaseHTMLParser(compat_html_parser.HTMLParser):
304 def __init__(self):
305 compat_html_parser.HTMLParser.__init__(self)
306 self.html = None
307
308 def loads(self, html):
309 self.html = html
310 self.feed(html)
311 self.close()
312
313 class AttrParser(BaseHTMLParser):
314 """Modified HTMLParser that isolates a tag with the specified attribute"""
315 def __init__(self, attribute, value):
316 self.attribute = attribute
317 self.value = value
318 self.result = None
319 self.started = False
320 self.depth = {}
321 self.watch_startpos = False
322 self.error_count = 0
323 BaseHTMLParser.__init__(self)
324
325 def error(self, message):
326 if self.error_count > 10 or self.started:
327 raise compat_html_parser.HTMLParseError(message, self.getpos())
328 self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
329 self.error_count += 1
330 self.goahead(1)
331
332 def handle_starttag(self, tag, attrs):
333 attrs = dict(attrs)
334 if self.started:
335 self.find_startpos(None)
336 if self.attribute in attrs and attrs[self.attribute] == self.value:
337 self.result = [tag]
338 self.started = True
339 self.watch_startpos = True
340 if self.started:
341 if tag not in self.depth: self.depth[tag] = 0
342 self.depth[tag] += 1
343
344 def handle_endtag(self, tag):
345 if self.started:
346 if tag in self.depth: self.depth[tag] -= 1
347 if self.depth[self.result[0]] == 0:
348 self.started = False
349 self.result.append(self.getpos())
350
351 def find_startpos(self, x):
352 """Needed to put the start position of the result (self.result[1])
353 after the opening tag with the requested id"""
354 if self.watch_startpos:
355 self.watch_startpos = False
356 self.result.append(self.getpos())
357 handle_entityref = handle_charref = handle_data = handle_comment = \
358 handle_decl = handle_pi = unknown_decl = find_startpos
359
360 def get_result(self):
361 if self.result is None:
362 return None
363 if len(self.result) != 3:
364 return None
365 lines = self.html.split('\n')
366 lines = lines[self.result[1][0]-1:self.result[2][0]]
367 lines[0] = lines[0][self.result[1][1]:]
368 if len(lines) == 1:
369 lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
370 lines[-1] = lines[-1][:self.result[2][1]]
371 return '\n'.join(lines).strip()
372 # Hack for https://github.com/rg3/youtube-dl/issues/662
373 if sys.version_info < (2, 7, 3):
374 AttrParser.parse_endtag = (lambda self, i:
375 i + len("</scr'+'ipt>")
376 if self.rawdata[i:].startswith("</scr'+'ipt>")
377 else compat_html_parser.HTMLParser.parse_endtag(self, i))
378
379 def get_element_by_id(id, html):
380 """Return the content of the tag with the specified ID in the passed HTML document"""
381 return get_element_by_attribute("id", id, html)
382
383 def get_element_by_attribute(attribute, value, html):
384 """Return the content of the tag with the specified attribute in the passed HTML document"""
385 parser = AttrParser(attribute, value)
386 try:
387 parser.loads(html)
388 except compat_html_parser.HTMLParseError:
389 pass
390 return parser.get_result()
391
392 class MetaParser(BaseHTMLParser):
393 """
394 Modified HTMLParser that isolates a meta tag with the specified name
395 attribute.
396 """
397 def __init__(self, name):
398 BaseHTMLParser.__init__(self)
399 self.name = name
400 self.content = None
401 self.result = None
402
403 def handle_starttag(self, tag, attrs):
404 if tag != 'meta':
405 return
406 attrs = dict(attrs)
407 if attrs.get('name') == self.name:
408 self.result = attrs.get('content')
409
410 def get_result(self):
411 return self.result
412
413 def get_meta_content(name, html):
414 """
415 Return the content attribute from the meta tag with the given name attribute.
416 """
417 parser = MetaParser(name)
418 try:
419 parser.loads(html)
420 except compat_html_parser.HTMLParseError:
421 pass
422 return parser.get_result()
423
424
425 def clean_html(html):
426 """Clean an HTML snippet into a readable string"""
427 # Newline vs <br />
428 html = html.replace('\n', ' ')
429 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
430 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
431 # Strip html tags
432 html = re.sub('<.*?>', '', html)
433 # Replace html entities
434 html = unescapeHTML(html)
435 return html.strip()
436
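# Usage sketch (illustrative input): <br/> tags become newlines, remaining markup is
# stripped and entities are unescaped:
#
#     clean_html(u'<p>First line<br/>Second &amp; last</p>')
#     # -> u'First line\nSecond & last'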
437
438 def sanitize_open(filename, open_mode):
439 """Try to open the given filename, and slightly tweak it if this fails.
440
441 Attempts to open the given filename. If this fails, it tries to change
442 the filename slightly, step by step, until it's either able to open it
443 or it fails and raises a final exception, like the standard open()
444 function.
445
446 It returns the tuple (stream, definitive_file_name).
447 """
448 try:
449 if filename == u'-':
450 if sys.platform == 'win32':
451 import msvcrt
452 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
453 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
454 stream = open(encodeFilename(filename), open_mode)
455 return (stream, filename)
456 except (IOError, OSError) as err:
457 if err.errno in (errno.EACCES,):
458 raise
459
460 # In case of error, try to remove win32 forbidden chars
461 alt_filename = os.path.join(*(
462 re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
463 for path_part in os.path.split(filename)
464 ))
465 if alt_filename == filename:
466 raise
467 else:
468 # An exception here should be caught in the caller
469 stream = open(encodeFilename(alt_filename), open_mode)
470 return (stream, alt_filename)
471
472
473 def timeconvert(timestr):
474 """Convert RFC 2822 defined time string into system timestamp"""
475 timestamp = None
476 timetuple = email.utils.parsedate_tz(timestr)
477 if timetuple is not None:
478 timestamp = email.utils.mktime_tz(timetuple)
479 return timestamp
480
481 def sanitize_filename(s, restricted=False, is_id=False):
482 """Sanitizes a string so it could be used as part of a filename.
483 If restricted is set, use a stricter subset of allowed characters.
484 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
485 """
486 def replace_insane(char):
487 if char == '?' or ord(char) < 32 or ord(char) == 127:
488 return ''
489 elif char == '"':
490 return '' if restricted else '\''
491 elif char == ':':
492 return '_-' if restricted else ' -'
493 elif char in '\\/|*<>':
494 return '_'
495 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
496 return '_'
497 if restricted and ord(char) > 127:
498 return '_'
499 return char
500
501 result = u''.join(map(replace_insane, s))
502 if not is_id:
503 while '__' in result:
504 result = result.replace('__', '_')
505 result = result.strip('_')
506 # Common case of "Foreign band name - English song title"
507 if restricted and result.startswith('-_'):
508 result = result[2:]
509 if not result:
510 result = '_'
511 return result
512
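# Usage sketch (illustrative title): in restricted mode, spaces become underscores,
# quotes are dropped and ':' turns into '_-':
#
#     sanitize_filename(u'New: the "best" video?', restricted=True)
#     # -> u'New_-_the_best_video'
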
513 def orderedSet(iterable):
514 """ Remove all duplicates from the input iterable """
515 res = []
516 for el in iterable:
517 if el not in res:
518 res.append(el)
519 return res
520
521
522 def _htmlentity_transform(entity):
523 """Transforms an HTML entity to a character."""
524 # Known non-numeric HTML entity
525 if entity in compat_html_entities.name2codepoint:
526 return compat_chr(compat_html_entities.name2codepoint[entity])
527
528 mobj = re.match(r'#(x?[0-9]+)', entity)
529 if mobj is not None:
530 numstr = mobj.group(1)
531 if numstr.startswith(u'x'):
532 base = 16
533 numstr = u'0%s' % numstr
534 else:
535 base = 10
536 return compat_chr(int(numstr, base))
537
538 # Unknown entity in name, return its literal representation
539 return (u'&%s;' % entity)
540
541
542 def unescapeHTML(s):
543 if s is None:
544 return None
545 assert type(s) == compat_str
546
547 return re.sub(
548 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
549
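# Usage sketch (illustrative string): both named and numeric entities are resolved:
#
#     unescapeHTML(u'Ben &amp; Jerry&#39;s')
#     # -> u"Ben & Jerry's"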
550
551 def encodeFilename(s, for_subprocess=False):
552 """
553 @param s The name of the file
554 """
555
556 assert type(s) == compat_str
557
558 # Python 3 has a Unicode API
559 if sys.version_info >= (3, 0):
560 return s
561
562 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
563 # Pass u'' directly to use Unicode APIs on Windows 2000 and up
564 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
565 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
566 if not for_subprocess:
567 return s
568 else:
569 # For subprocess calls, encode with locale encoding
570 # Refer to http://stackoverflow.com/a/9951851/35070
571 encoding = preferredencoding()
572 else:
573 encoding = sys.getfilesystemencoding()
574 if encoding is None:
575 encoding = 'utf-8'
576 return s.encode(encoding, 'ignore')
577
578
579 def encodeArgument(s):
580 if not isinstance(s, compat_str):
581 # Legacy code that uses byte strings
582 # Uncomment the following line after fixing all post processors
583 #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
584 s = s.decode('ascii')
585 return encodeFilename(s, True)
586
587
588 def decodeOption(optval):
589 if optval is None:
590 return optval
591 if isinstance(optval, bytes):
592 optval = optval.decode(preferredencoding())
593
594 assert isinstance(optval, compat_str)
595 return optval
596
597 def formatSeconds(secs):
598 if secs > 3600:
599 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
600 elif secs > 60:
601 return '%d:%02d' % (secs // 60, secs % 60)
602 else:
603 return '%d' % secs
604
605
606 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
607 if sys.version_info < (3, 2):
608 import httplib
609
610 class HTTPSConnectionV3(httplib.HTTPSConnection):
611 def __init__(self, *args, **kwargs):
612 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
613
614 def connect(self):
615 sock = socket.create_connection((self.host, self.port), self.timeout)
616 if getattr(self, '_tunnel_host', False):
617 self.sock = sock
618 self._tunnel()
619 try:
620 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
621 except ssl.SSLError:
622 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
623
624 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
625 def https_open(self, req):
626 return self.do_open(HTTPSConnectionV3, req)
627 return HTTPSHandlerV3(**kwargs)
628 elif hasattr(ssl, 'create_default_context'): # Python >= 3.4
629 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
630 context.options &= ~ssl.OP_NO_SSLv3 # Allow older, not-as-secure SSLv3
631 if opts_no_check_certificate:
632 context.verify_mode = ssl.CERT_NONE
633 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
634 else: # Python < 3.4
635 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
636 context.verify_mode = (ssl.CERT_NONE
637 if opts_no_check_certificate
638 else ssl.CERT_REQUIRED)
639 context.set_default_verify_paths()
640 try:
641 context.load_default_certs()
642 except AttributeError:
643 pass # Python < 3.4
644 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
645
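# Usage sketch (illustrative; the example URL is made up): the returned handler is
# meant to be installed into a urllib opener, roughly like this:
#
#     https_handler = make_HTTPS_handler(opts_no_check_certificate=False)
#     opener = compat_urllib_request.build_opener(https_handler)
#     opener.open('https://example.com/')
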
646 class ExtractorError(Exception):
647 """Error during info extraction."""
648 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
649 """ tb, if given, is the original traceback (so that it can be printed out).
650 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
651 """
652
653 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
654 expected = True
655 if video_id is not None:
656 msg = video_id + ': ' + msg
657 if not expected:
658 msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
659 super(ExtractorError, self).__init__(msg)
660
661 self.traceback = tb
662 self.exc_info = sys.exc_info() # preserve original exception
663 self.cause = cause
664 self.video_id = video_id
665
666 def format_traceback(self):
667 if self.traceback is None:
668 return None
669 return u''.join(traceback.format_tb(self.traceback))
670
671
672 class RegexNotFoundError(ExtractorError):
673 """Error when a regex didn't match"""
674 pass
675
676
677 class DownloadError(Exception):
678 """Download Error exception.
679
680 This exception may be thrown by FileDownloader objects if they are not
681 configured to continue on errors. They will contain the appropriate
682 error message.
683 """
684 def __init__(self, msg, exc_info=None):
685 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
686 super(DownloadError, self).__init__(msg)
687 self.exc_info = exc_info
688
689
690 class SameFileError(Exception):
691 """Same File exception.
692
693 This exception will be thrown by FileDownloader objects if they detect
694 multiple files would have to be downloaded to the same file on disk.
695 """
696 pass
697
698
699 class PostProcessingError(Exception):
700 """Post Processing exception.
701
702 This exception may be raised by PostProcessor's .run() method to
703 indicate an error in the postprocessing task.
704 """
705 def __init__(self, msg):
706 self.msg = msg
707
708 class MaxDownloadsReached(Exception):
709 """ --max-downloads limit has been reached. """
710 pass
711
712
713 class UnavailableVideoError(Exception):
714 """Unavailable Format exception.
715
716 This exception will be thrown when a video is requested
717 in a format that is not available for that video.
718 """
719 pass
720
721
722 class ContentTooShortError(Exception):
723 """Content Too Short exception.
724
725 This exception may be raised by FileDownloader objects when a file they
726 download is too small for what the server announced first, indicating
727 the connection was probably interrupted.
728 """
729 # Both in bytes
730 downloaded = None
731 expected = None
732
733 def __init__(self, downloaded, expected):
734 self.downloaded = downloaded
735 self.expected = expected
736
737 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
738 """Handler for HTTP requests and responses.
739
740 This class, when installed with an OpenerDirector, automatically adds
741 the standard headers to every HTTP request and handles gzipped and
742 deflated responses from web servers. If compression is to be avoided in
743 a particular request, the original request in the program code only has
744 to include the HTTP header "Youtubedl-No-Compression", which will be
745 removed before making the real request.
746
747 Part of this code was copied from:
748
749 http://techknack.net/python-urllib2-handlers/
750
751 Andrew Rowls, the author of that code, agreed to release it to the
752 public domain.
753 """
754
755 @staticmethod
756 def deflate(data):
757 try:
758 return zlib.decompress(data, -zlib.MAX_WBITS)
759 except zlib.error:
760 return zlib.decompress(data)
761
762 @staticmethod
763 def addinfourl_wrapper(stream, headers, url, code):
764 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
765 return compat_urllib_request.addinfourl(stream, headers, url, code)
766 ret = compat_urllib_request.addinfourl(stream, headers, url)
767 ret.code = code
768 return ret
769
770 def http_request(self, req):
771 for h, v in std_headers.items():
772 if h not in req.headers:
773 req.add_header(h, v)
774 if 'Youtubedl-no-compression' in req.headers:
775 if 'Accept-encoding' in req.headers:
776 del req.headers['Accept-encoding']
777 del req.headers['Youtubedl-no-compression']
778 if 'Youtubedl-user-agent' in req.headers:
779 if 'User-agent' in req.headers:
780 del req.headers['User-agent']
781 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
782 del req.headers['Youtubedl-user-agent']
783 return req
784
785 def http_response(self, req, resp):
786 old_resp = resp
787 # gzip
788 if resp.headers.get('Content-encoding', '') == 'gzip':
789 content = resp.read()
790 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
791 try:
792 uncompressed = io.BytesIO(gz.read())
793 except IOError as original_ioerror:
794 # There may be junk at the end of the file
795 # See http://stackoverflow.com/q/4928560/35070 for details
796 for i in range(1, 1024):
797 try:
798 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
799 uncompressed = io.BytesIO(gz.read())
800 except IOError:
801 continue
802 break
803 else:
804 raise original_ioerror
805 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
806 resp.msg = old_resp.msg
807 # deflate
808 if resp.headers.get('Content-encoding', '') == 'deflate':
809 gz = io.BytesIO(self.deflate(resp.read()))
810 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
811 resp.msg = old_resp.msg
812 return resp
813
814 https_request = http_request
815 https_response = http_response
816
817
818 def parse_iso8601(date_str, delimiter='T'):
819 """ Return a UNIX timestamp from the given date """
820
821 if date_str is None:
822 return None
823
824 m = re.search(
825 r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
826 date_str)
827 if not m:
828 timezone = datetime.timedelta()
829 else:
830 date_str = date_str[:-len(m.group(0))]
831 if not m.group('sign'):
832 timezone = datetime.timedelta()
833 else:
834 sign = 1 if m.group('sign') == '+' else -1
835 timezone = datetime.timedelta(
836 hours=sign * int(m.group('hours')),
837 minutes=sign * int(m.group('minutes')))
838 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
839 dt = datetime.datetime.strptime(date_str, date_format) - timezone
840 return calendar.timegm(dt.timetuple())
841
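# Usage sketch (illustrative timestamp): the UTC offset is folded into the result,
# which is a POSIX timestamp:
#
#     parse_iso8601(u'2014-09-27T12:00:00+02:00')
#     # -> 1411812000 (i.e. 2014-09-27 10:00:00 UTC)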
842
843 def unified_strdate(date_str):
844 """Return a string with the date in the format YYYYMMDD"""
845
846 if date_str is None:
847 return None
848
849 upload_date = None
850 #Replace commas
851 date_str = date_str.replace(',', ' ')
852 # %z (UTC offset) is only supported in python>=3.2
853 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
854 format_expressions = [
855 '%d %B %Y',
856 '%d %b %Y',
857 '%B %d %Y',
858 '%b %d %Y',
859 '%b %dst %Y %I:%M%p',
860 '%b %dnd %Y %I:%M%p',
861 '%b %dth %Y %I:%M%p',
862 '%Y-%m-%d',
863 '%Y/%m/%d',
864 '%d.%m.%Y',
865 '%d/%m/%Y',
866 '%d/%m/%y',
867 '%Y/%m/%d %H:%M:%S',
868 '%Y-%m-%d %H:%M:%S',
869 '%d.%m.%Y %H:%M',
870 '%d.%m.%Y %H.%M',
871 '%Y-%m-%dT%H:%M:%SZ',
872 '%Y-%m-%dT%H:%M:%S.%fZ',
873 '%Y-%m-%dT%H:%M:%S.%f0Z',
874 '%Y-%m-%dT%H:%M:%S',
875 '%Y-%m-%dT%H:%M:%S.%f',
876 '%Y-%m-%dT%H:%M',
877 ]
878 for expression in format_expressions:
879 try:
880 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
881 except ValueError:
882 pass
883 if upload_date is None:
884 timetuple = email.utils.parsedate_tz(date_str)
885 if timetuple:
886 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
887 return upload_date
888
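# Usage sketch (illustrative date strings): whichever input format matches, the
# result is always YYYYMMDD:
#
#     unified_strdate('2012/10/11')   # -> '20121011'
#     unified_strdate('11.10.2012')   # -> '20121011'
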
889 def determine_ext(url, default_ext=u'unknown_video'):
890 if url is None:
891 return default_ext
892 guess = url.partition(u'?')[0].rpartition(u'.')[2]
893 if re.match(r'^[A-Za-z0-9]+$', guess):
894 return guess
895 else:
896 return default_ext
897
898 def subtitles_filename(filename, sub_lang, sub_format):
899 return filename.rsplit('.', 1)[0] + u'.' + sub_lang + u'.' + sub_format
900
901 def date_from_str(date_str):
902 """
903 Return a datetime object from a string in the format YYYYMMDD or
904 (now|today)[+-][0-9](day|week|month|year)(s)?"""
905 today = datetime.date.today()
906 if date_str == 'now' or date_str == 'today':
907 return today
908 match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
909 if match is not None:
910 sign = match.group('sign')
911 time = int(match.group('time'))
912 if sign == '-':
913 time = -time
914 unit = match.group('unit')
915 # A bad approximation?
916 if unit == 'month':
917 unit = 'day'
918 time *= 30
919 elif unit == 'year':
920 unit = 'day'
921 time *= 365
922 unit += 's'
923 delta = datetime.timedelta(**{unit: time})
924 return today + delta
925 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
926
927 def hyphenate_date(date_str):
928 """
929 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
930 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
931 if match is not None:
932 return '-'.join(match.groups())
933 else:
934 return date_str
935
936 class DateRange(object):
937 """Represents a time interval between two dates"""
938 def __init__(self, start=None, end=None):
939 """start and end must be strings in the format accepted by date"""
940 if start is not None:
941 self.start = date_from_str(start)
942 else:
943 self.start = datetime.datetime.min.date()
944 if end is not None:
945 self.end = date_from_str(end)
946 else:
947 self.end = datetime.datetime.max.date()
948 if self.start > self.end:
949 raise ValueError('Date range: "%s"; the start date must be before the end date' % self)
950 @classmethod
951 def day(cls, day):
952 """Returns a range that only contains the given day"""
953 return cls(day,day)
954 def __contains__(self, date):
955 """Check if the date is in the range"""
956 if not isinstance(date, datetime.date):
957 date = date_from_str(date)
958 return self.start <= date <= self.end
959 def __str__(self):
960 return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
961
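# Usage sketch (illustrative dates): ranges are built from the same strings that
# date_from_str() accepts and support containment tests:
#
#     '20140704' in DateRange('20140101', '20141231')        # -> True
#     datetime.date.today() in DateRange(start='now-1week')  # -> True (end defaults to max)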
962
963 def platform_name():
964 """ Returns the platform name as a compat_str """
965 res = platform.platform()
966 if isinstance(res, bytes):
967 res = res.decode(preferredencoding())
968
969 assert isinstance(res, compat_str)
970 return res
971
972
973 def _windows_write_string(s, out):
974 """ Returns True if the string was written using special methods,
975 False if it has yet to be written out."""
976 # Adapted from http://stackoverflow.com/a/3259271/35070
977
978 import ctypes
979 import ctypes.wintypes
980
981 WIN_OUTPUT_IDS = {
982 1: -11,
983 2: -12,
984 }
985
986 try:
987 fileno = out.fileno()
988 except AttributeError:
989 # If the output stream doesn't have a fileno, it's virtual
990 return False
991 if fileno not in WIN_OUTPUT_IDS:
992 return False
993
994 GetStdHandle = ctypes.WINFUNCTYPE(
995 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
996 ("GetStdHandle", ctypes.windll.kernel32))
997 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
998
999 WriteConsoleW = ctypes.WINFUNCTYPE(
1000 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1001 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
1002 ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
1003 written = ctypes.wintypes.DWORD(0)
1004
1005 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
1006 FILE_TYPE_CHAR = 0x0002
1007 FILE_TYPE_REMOTE = 0x8000
1008 GetConsoleMode = ctypes.WINFUNCTYPE(
1009 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1010 ctypes.POINTER(ctypes.wintypes.DWORD))(
1011 ("GetConsoleMode", ctypes.windll.kernel32))
1012 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1013
1014 def not_a_console(handle):
1015 if handle == INVALID_HANDLE_VALUE or handle is None:
1016 return True
1017 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
1018 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
1019
1020 if not_a_console(h):
1021 return False
1022
1023 def next_nonbmp_pos(s):
1024 try:
1025 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1026 except StopIteration:
1027 return len(s)
1028
1029 while s:
1030 count = min(next_nonbmp_pos(s), 1024)
1031
1032 ret = WriteConsoleW(
1033 h, s, count if count else 2, ctypes.byref(written), None)
1034 if ret == 0:
1035 raise OSError('Failed to write string')
1036 if not count: # We just wrote a non-BMP character
1037 assert written.value == 2
1038 s = s[1:]
1039 else:
1040 assert written.value > 0
1041 s = s[written.value:]
1042 return True
1043
1044
1045 def write_string(s, out=None, encoding=None):
1046 if out is None:
1047 out = sys.stderr
1048 assert type(s) == compat_str
1049
1050 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1051 if _windows_write_string(s, out):
1052 return
1053
1054 if ('b' in getattr(out, 'mode', '') or
1055 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
1056 byt = s.encode(encoding or preferredencoding(), 'ignore')
1057 out.write(byt)
1058 elif hasattr(out, 'buffer'):
1059 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1060 byt = s.encode(enc, 'ignore')
1061 out.buffer.write(byt)
1062 else:
1063 out.write(s)
1064 out.flush()
1065
1066
1067 def bytes_to_intlist(bs):
1068 if not bs:
1069 return []
1070 if isinstance(bs[0], int): # Python 3
1071 return list(bs)
1072 else:
1073 return [ord(c) for c in bs]
1074
1075
1076 def intlist_to_bytes(xs):
1077 if not xs:
1078 return b''
1079 if isinstance(chr(0), bytes): # Python 2
1080 return ''.join([chr(x) for x in xs])
1081 else:
1082 return bytes(xs)
1083
1084
1085 # Cross-platform file locking
1086 if sys.platform == 'win32':
1087 import ctypes.wintypes
1088 import msvcrt
1089
1090 class OVERLAPPED(ctypes.Structure):
1091 _fields_ = [
1092 ('Internal', ctypes.wintypes.LPVOID),
1093 ('InternalHigh', ctypes.wintypes.LPVOID),
1094 ('Offset', ctypes.wintypes.DWORD),
1095 ('OffsetHigh', ctypes.wintypes.DWORD),
1096 ('hEvent', ctypes.wintypes.HANDLE),
1097 ]
1098
1099 kernel32 = ctypes.windll.kernel32
1100 LockFileEx = kernel32.LockFileEx
1101 LockFileEx.argtypes = [
1102 ctypes.wintypes.HANDLE, # hFile
1103 ctypes.wintypes.DWORD, # dwFlags
1104 ctypes.wintypes.DWORD, # dwReserved
1105 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1106 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1107 ctypes.POINTER(OVERLAPPED) # Overlapped
1108 ]
1109 LockFileEx.restype = ctypes.wintypes.BOOL
1110 UnlockFileEx = kernel32.UnlockFileEx
1111 UnlockFileEx.argtypes = [
1112 ctypes.wintypes.HANDLE, # hFile
1113 ctypes.wintypes.DWORD, # dwReserved
1114 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1115 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1116 ctypes.POINTER(OVERLAPPED) # Overlapped
1117 ]
1118 UnlockFileEx.restype = ctypes.wintypes.BOOL
1119 whole_low = 0xffffffff
1120 whole_high = 0x7fffffff
1121
1122 def _lock_file(f, exclusive):
1123 overlapped = OVERLAPPED()
1124 overlapped.Offset = 0
1125 overlapped.OffsetHigh = 0
1126 overlapped.hEvent = 0
1127 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1128 handle = msvcrt.get_osfhandle(f.fileno())
1129 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1130 whole_low, whole_high, f._lock_file_overlapped_p):
1131 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1132
1133 def _unlock_file(f):
1134 assert f._lock_file_overlapped_p
1135 handle = msvcrt.get_osfhandle(f.fileno())
1136 if not UnlockFileEx(handle, 0,
1137 whole_low, whole_high, f._lock_file_overlapped_p):
1138 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1139
1140 else:
1141 import fcntl
1142
1143 def _lock_file(f, exclusive):
1144 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1145
1146 def _unlock_file(f):
1147 fcntl.flock(f, fcntl.LOCK_UN)
1148
1149
1150 class locked_file(object):
1151 def __init__(self, filename, mode, encoding=None):
1152 assert mode in ['r', 'a', 'w']
1153 self.f = io.open(filename, mode, encoding=encoding)
1154 self.mode = mode
1155
1156 def __enter__(self):
1157 exclusive = self.mode != 'r'
1158 try:
1159 _lock_file(self.f, exclusive)
1160 except IOError:
1161 self.f.close()
1162 raise
1163 return self
1164
1165 def __exit__(self, etype, value, traceback):
1166 try:
1167 _unlock_file(self.f)
1168 finally:
1169 self.f.close()
1170
1171 def __iter__(self):
1172 return iter(self.f)
1173
1174 def write(self, *args):
1175 return self.f.write(*args)
1176
1177 def read(self, *args):
1178 return self.f.read(*args)
1179
1180
1181 def shell_quote(args):
1182 quoted_args = []
1183 encoding = sys.getfilesystemencoding()
1184 if encoding is None:
1185 encoding = 'utf-8'
1186 for a in args:
1187 if isinstance(a, bytes):
1188 # We may get a filename encoded with 'encodeFilename'
1189 a = a.decode(encoding)
1190 quoted_args.append(pipes.quote(a))
1191 return u' '.join(quoted_args)
1192
1193
1194 def takewhile_inclusive(pred, seq):
1195 """ Like itertools.takewhile, but include the latest evaluated element
1196 (the first element for which pred(e) is false) """
1197 for e in seq:
1198 yield e
1199 if not pred(e):
1200 return
1201
1202
1203 def smuggle_url(url, data):
1204 """ Pass additional data in a URL for internal use. """
1205
1206 sdata = compat_urllib_parse.urlencode(
1207 {u'__youtubedl_smuggle': json.dumps(data)})
1208 return url + u'#' + sdata
1209
1210
1211 def unsmuggle_url(smug_url, default=None):
1212 if '#__youtubedl_smuggle' not in smug_url:
1213 return smug_url, default
1214 url, _, sdata = smug_url.rpartition(u'#')
1215 jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
1216 data = json.loads(jsond)
1217 return url, data
1218
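# Usage sketch (illustrative URL and payload): the two helpers round-trip extra data
# through the URL fragment:
#
#     url = smuggle_url(u'http://example.com/video', {u'source': u'embed_page'})
#     unsmuggle_url(url)
#     # -> (u'http://example.com/video', {u'source': u'embed_page'})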
1219
1220 def format_bytes(bytes):
1221 if bytes is None:
1222 return u'N/A'
1223 if type(bytes) is str:
1224 bytes = float(bytes)
1225 if bytes == 0.0:
1226 exponent = 0
1227 else:
1228 exponent = int(math.log(bytes, 1024.0))
1229 suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB'][exponent]
1230 converted = float(bytes) / float(1024 ** exponent)
1231 return u'%.2f%s' % (converted, suffix)
1232
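# Usage sketch (illustrative sizes): output uses binary (1024-based) units:
#
#     format_bytes(1536)   # -> u'1.50KiB'
#     format_bytes(None)   # -> u'N/A'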
1233
1234 def get_term_width():
1235 columns = os.environ.get('COLUMNS', None)
1236 if columns:
1237 return int(columns)
1238
1239 try:
1240 sp = subprocess.Popen(
1241 ['stty', 'size'],
1242 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1243 out, err = sp.communicate()
1244 return int(out.split()[1])
1245 except:
1246 pass
1247 return None
1248
1249
1250 def month_by_name(name):
1251 """ Return the number of a month by (locale-independently) English name """
1252
1253 ENGLISH_NAMES = [
1254 u'January', u'February', u'March', u'April', u'May', u'June',
1255 u'July', u'August', u'September', u'October', u'November', u'December']
1256 try:
1257 return ENGLISH_NAMES.index(name) + 1
1258 except ValueError:
1259 return None
1260
1261
1262 def fix_xml_ampersands(xml_str):
1263 """Replace all the '&' by '&amp;' in XML"""
1264 return re.sub(
1265 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1266 u'&amp;',
1267 xml_str)
1268
1269
1270 def setproctitle(title):
1271 assert isinstance(title, compat_str)
1272 try:
1273 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1274 except OSError:
1275 return
1276 title_bytes = title.encode('utf-8')
1277 buf = ctypes.create_string_buffer(len(title_bytes) + 1)  # leave room for a trailing NUL
1278 buf.value = title_bytes
1279 try:
1280 libc.prctl(15, buf, 0, 0, 0)
1281 except AttributeError:
1282 return # Strange libc, just skip this
1283
1284
1285 def remove_start(s, start):
1286 if s.startswith(start):
1287 return s[len(start):]
1288 return s
1289
1290
1291 def remove_end(s, end):
1292 if s.endswith(end):
1293 return s[:-len(end)]
1294 return s
1295
1296
1297 def url_basename(url):
1298 path = compat_urlparse.urlparse(url).path
1299 return path.strip(u'/').split(u'/')[-1]
1300
1301
1302 class HEADRequest(compat_urllib_request.Request):
1303 def get_method(self):
1304 return "HEAD"
1305
1306
1307 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1308 if get_attr:
1309 if v is not None:
1310 v = getattr(v, get_attr, None)
1311 if v == '':
1312 v = None
1313 return default if v is None else (int(v) * invscale // scale)
1314
1315
1316 def str_or_none(v, default=None):
1317 return default if v is None else compat_str(v)
1318
1319
1320 def str_to_int(int_str):
1321 """ A more relaxed version of int_or_none """
1322 if int_str is None:
1323 return None
1324 int_str = re.sub(r'[,\.\+]', u'', int_str)
1325 return int(int_str)
1326
1327
1328 def float_or_none(v, scale=1, invscale=1, default=None):
1329 return default if v is None else (float(v) * invscale / scale)
1330
1331
1332 def parse_duration(s):
1333 if s is None:
1334 return None
1335
1336 s = s.strip()
1337
1338 m = re.match(
1339 r'(?i)(?:(?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?(?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$', s)
1340 if not m:
1341 return None
1342 res = int(m.group('secs'))
1343 if m.group('mins'):
1344 res += int(m.group('mins')) * 60
1345 if m.group('hours'):
1346 res += int(m.group('hours')) * 60 * 60
1347 if m.group('ms'):
1348 res += float(m.group('ms'))
1349 return res
1350
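# Usage sketch (illustrative duration strings): the result is always in seconds:
#
#     parse_duration('1:02:03')     # -> 3723
#     parse_duration('3 min 15 s')  # -> 195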
1351
1352 def prepend_extension(filename, ext):
1353 name, real_ext = os.path.splitext(filename)
1354 return u'{0}.{1}{2}'.format(name, ext, real_ext)
1355
1356
1357 def check_executable(exe, args=[]):
1358 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1359 args can be a list of arguments for a short output (like -version) """
1360 try:
1361 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1362 except OSError:
1363 return False
1364 return exe
1365
1366
1367 class PagedList(object):
1368 def __init__(self, pagefunc, pagesize):
1369 self._pagefunc = pagefunc
1370 self._pagesize = pagesize
1371
1372 def __len__(self):
1373 # This is only useful for tests
1374 return len(self.getslice())
1375
1376 def getslice(self, start=0, end=None):
1377 res = []
1378 for pagenum in itertools.count(start // self._pagesize):
1379 firstid = pagenum * self._pagesize
1380 nextfirstid = pagenum * self._pagesize + self._pagesize
1381 if start >= nextfirstid:
1382 continue
1383
1384 page_results = list(self._pagefunc(pagenum))
1385
1386 startv = (
1387 start % self._pagesize
1388 if firstid <= start < nextfirstid
1389 else 0)
1390
1391 endv = (
1392 ((end - 1) % self._pagesize) + 1
1393 if (end is not None and firstid <= end <= nextfirstid)
1394 else None)
1395
1396 if startv != 0 or endv is not None:
1397 page_results = page_results[startv:endv]
1398 res.extend(page_results)
1399
1400 # A little optimization: if the current page is not "full", i.e. does
1401 # not contain page_size videos, then we can assume that this page
1402 # is the last one - there are no more ids on further pages -
1403 # so there is no need to query again.
1404 if len(page_results) + startv < self._pagesize:
1405 break
1406
1407 # If we got the whole page, but the next page is not interesting,
1408 # break out early as well
1409 if end == nextfirstid:
1410 break
1411 return res
1412
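# Usage sketch (illustrative pagefunc): pagefunc(n) must return the items of page n;
# getslice() then stitches the requested [start, end) range together:
#
#     pl = PagedList(lambda n: [n * 3, n * 3 + 1, n * 3 + 2], pagesize=3)
#     pl.getslice(2, 7)
#     # -> [2, 3, 4, 5, 6]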
1413
1414 def uppercase_escape(s):
1415 unicode_escape = codecs.getdecoder('unicode_escape')
1416 return re.sub(
1417 r'\\U[0-9a-fA-F]{8}',
1418 lambda m: unicode_escape(m.group(0))[0],
1419 s)
1420
1421 try:
1422 struct.pack(u'!I', 0)
1423 except TypeError:
1424 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1425 def struct_pack(spec, *args):
1426 if isinstance(spec, compat_str):
1427 spec = spec.encode('ascii')
1428 return struct.pack(spec, *args)
1429
1430 def struct_unpack(spec, *args):
1431 if isinstance(spec, compat_str):
1432 spec = spec.encode('ascii')
1433 return struct.unpack(spec, *args)
1434 else:
1435 struct_pack = struct.pack
1436 struct_unpack = struct.unpack
1437
1438
1439 def read_batch_urls(batch_fd):
1440 def fixup(url):
1441 if not isinstance(url, compat_str):
1442 url = url.decode('utf-8', 'replace')
1443 for bom in (u'\xef\xbb\xbf', u'\ufeff'):  # UTF-8 BOM, byte-wise or decoded form
1444 if url.startswith(bom):
1445 url = url[len(bom):]
1446 url = url.strip()
1447 if url.startswith(('#', ';', ']')):
1448 return False
1449 return url
1450
1451 with contextlib.closing(batch_fd) as fd:
1452 return [url for url in map(fixup, fd) if url]
1453
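# Usage sketch (illustrative batch file): comment lines and blank lines are skipped;
# any file-like object works (io is imported above):
#
#     batch = io.StringIO(u'# my queue\nhttp://example.com/a\n\nhttp://example.com/b\n')
#     read_batch_urls(batch)
#     # -> [u'http://example.com/a', u'http://example.com/b']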
1454
1455 def urlencode_postdata(*args, **kargs):
1456 return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
1457
1458
1459 try:
1460 etree_iter = xml.etree.ElementTree.Element.iter
1461 except AttributeError: # Python <=2.6
1462 etree_iter = lambda n: n.findall('.//*')
1463
1464
1465 def parse_xml(s):
1466 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1467 def doctype(self, name, pubid, system):
1468 pass # Ignore doctypes
1469
1470 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1471 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1472 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1473 # Fix up XML parser in Python 2.x
1474 if sys.version_info < (3, 0):
1475 for n in etree_iter(tree):
1476 if n.text is not None:
1477 if not isinstance(n.text, compat_str):
1478 n.text = n.text.decode('utf-8')
1479 return tree
1480
1481
1482 if sys.version_info < (3, 0) and sys.platform == 'win32':
1483 def compat_getpass(prompt, *args, **kwargs):
1484 if isinstance(prompt, compat_str):
1485 prompt = prompt.encode(preferredencoding())
1486 return getpass.getpass(prompt, *args, **kwargs)
1487 else:
1488 compat_getpass = getpass.getpass
1489
1490
1491 US_RATINGS = {
1492 'G': 0,
1493 'PG': 10,
1494 'PG-13': 13,
1495 'R': 16,
1496 'NC': 18,
1497 }
1498
1499
1500 def strip_jsonp(code):
1501 return re.sub(r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$', r'\1', code)
1502
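# Usage sketch (illustrative JSONP response): the padding callback is stripped so the
# payload can be fed to json.loads():
#
#     strip_jsonp('callback({"status": "ok"});')
#     # -> '{"status": "ok"}'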
1503
1504 def js_to_json(code):
1505 def fix_kv(m):
1506 key = m.group(2)
1507 if key.startswith("'"):
1508 assert key.endswith("'")
1509 assert '"' not in key
1510 key = '"%s"' % key[1:-1]
1511 elif not key.startswith('"'):
1512 key = '"%s"' % key
1513
1514 value = m.group(4)
1515 if value.startswith("'"):
1516 assert value.endswith("'")
1517 assert '"' not in value
1518 value = '"%s"' % value[1:-1]
1519
1520 return m.group(1) + key + m.group(3) + value
1521
1522 res = re.sub(r'''(?x)
1523 ([{,]\s*)
1524 ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
1525 (:\s*)
1526 ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
1527 ''', fix_kv, code)
1528 res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)  # drop trailing commas before ] or }
1529 return res
1530
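# Usage sketch (illustrative JS object literal): bare keys and single-quoted strings
# are rewritten so the result parses as JSON:
#
#     js_to_json("{abc: 'def', 'x': 1}")
#     # -> '{"abc": "def", "x": 1}'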
1531
1532 def qualities(quality_ids):
1533 """ Get a numeric quality value out of a list of possible values """
1534 def q(qid):
1535 try:
1536 return quality_ids.index(qid)
1537 except ValueError:
1538 return -1
1539 return q
1540
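# Usage sketch (illustrative format list, worst quality first): unknown ids sort
# below everything else:
#
#     q = qualities([u'240p', u'360p', u'720p'])
#     q(u'720p')   # -> 2
#     q(u'1080p')  # -> -1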
1541
1542 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1543
1544 try:
1545 subprocess_check_output = subprocess.check_output
1546 except AttributeError:
1547 def subprocess_check_output(*args, **kwargs):
1548 assert 'input' not in kwargs
1549 p = subprocess.Popen(*args, stdout=subprocess.PIPE, **kwargs)
1550 output, _ = p.communicate()
1551 ret = p.poll()
1552 if ret:
1553 raise subprocess.CalledProcessError(ret, args[0], output=output)  # Popen has no .args before Python 3.3
1554 return output