# youtube_dl/utils.py -- utility functions and Python 2/3 compatibility shims
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 import calendar
5 import codecs
6 import contextlib
7 import ctypes
8 import datetime
9 import email.utils
10 import errno
11 import getpass
12 import gzip
13 import itertools
14 import io
15 import json
16 import locale
17 import math
18 import os
19 import pipes
20 import platform
21 import re
22 import ssl
23 import socket
24 import struct
25 import subprocess
26 import sys
27 import tempfile
28 import traceback
29 import xml.etree.ElementTree
30 import zlib
31
32 try:
33 import urllib.request as compat_urllib_request
34 except ImportError: # Python 2
35 import urllib2 as compat_urllib_request
36
37 try:
38 import urllib.error as compat_urllib_error
39 except ImportError: # Python 2
40 import urllib2 as compat_urllib_error
41
42 try:
43 import urllib.parse as compat_urllib_parse
44 except ImportError: # Python 2
45 import urllib as compat_urllib_parse
46
47 try:
48 from urllib.parse import urlparse as compat_urllib_parse_urlparse
49 except ImportError: # Python 2
50 from urlparse import urlparse as compat_urllib_parse_urlparse
51
52 try:
53 import urllib.parse as compat_urlparse
54 except ImportError: # Python 2
55 import urlparse as compat_urlparse
56
57 try:
58 import http.cookiejar as compat_cookiejar
59 except ImportError: # Python 2
60 import cookielib as compat_cookiejar
61
62 try:
63 import html.entities as compat_html_entities
64 except ImportError: # Python 2
65 import htmlentitydefs as compat_html_entities
66
67 try:
68 import html.parser as compat_html_parser
69 except ImportError: # Python 2
70 import HTMLParser as compat_html_parser
71
72 try:
73 import http.client as compat_http_client
74 except ImportError: # Python 2
75 import httplib as compat_http_client
76
77 try:
78 from urllib.error import HTTPError as compat_HTTPError
79 except ImportError: # Python 2
80 from urllib2 import HTTPError as compat_HTTPError
81
82 try:
83 from urllib.request import urlretrieve as compat_urlretrieve
84 except ImportError: # Python 2
85 from urllib import urlretrieve as compat_urlretrieve
86
87
# subprocess.DEVNULL was added in Python 3.3; on older versions fall back to
# opening os.devnull for writing (the caller is responsible for closing it).
try:
    from subprocess import DEVNULL
    compat_subprocess_get_DEVNULL = lambda: DEVNULL
except ImportError:
    compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
93
try:
    from urllib.parse import unquote as compat_urllib_parse_unquote
except ImportError:
    def compat_urllib_parse_unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes by their single-character equivalents.

        Python 2 fallback of urllib.parse.unquote: percent sequences are
        collected into a byte string and decoded with *encoding*/*errors*;
        malformed escapes are passed through literally.
        """
        if string == '':
            return string
        res = string.split('%')
        if len(res) == 1:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pct_sequence: contiguous sequence of percent-encoded bytes, decoded
        pct_sequence = b''
        string = res[0]
        for item in res[1:]:
            try:
                if not item:
                    raise ValueError
                # On Python 2, str.decode('hex') raises TypeError (not
                # ValueError) for non-hexadecimal input, so catch both;
                # previously '%zz' crashed instead of being passed through.
                pct_sequence += item[:2].decode('hex')
                rest = item[2:]
                if not rest:
                    # This segment was just a single percent-encoded character.
                    # May be part of a sequence of code units, so delay decoding.
                    # (Stored in pct_sequence).
                    continue
            except (ValueError, TypeError):
                rest = '%' + item
            # Encountered non-percent-encoded characters. Flush the current
            # pct_sequence.
            string += pct_sequence.decode(encoding, errors) + rest
            pct_sequence = b''
        if pct_sequence:
            # Flush the final pct_sequence
            string += pct_sequence.decode(encoding, errors)
        return string
131
132
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        # Pairs may be separated by either '&' or ';', matching cpython
        qs, _coerce_result = qs, unicode
        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
        r = []
        for name_value in pairs:
            if not name_value and not strict_parsing:
                continue
            nv = name_value.split('=', 1)
            if len(nv) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (name_value,))
                # Handle case of a control-name with no equal sign
                if keep_blank_values:
                    nv.append('')
                else:
                    continue
            if len(nv[1]) or keep_blank_values:
                # '+' means space in query strings; unquote the rest
                name = nv[0].replace('+', ' ')
                name = compat_urllib_parse_unquote(
                    name, encoding=encoding, errors=errors)
                name = _coerce_result(name)
                value = nv[1].replace('+', ' ')
                value = compat_urllib_parse_unquote(
                    value, encoding=encoding, errors=errors)
                value = _coerce_result(value)
                r.append((name, value))
        return r

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                encoding='utf-8', errors='replace'):
        # Group repeated keys into lists, e.g. 'a=1&a=2' -> {'a': ['1', '2']}
        parsed_result = {}
        pairs = _parse_qsl(qs, keep_blank_values, strict_parsing,
                        encoding=encoding, errors=errors)
        for name, value in pairs:
            if name in parsed_result:
                parsed_result[name].append(value)
            else:
                parsed_result[name] = [value]
        return parsed_result
179
# Text/character type shims: unicode/unichr exist only on Python 2
try:
    compat_str = unicode # Python 2
except NameError:
    compat_str = str

try:
    compat_chr = unichr # Python 2
except NameError:
    compat_chr = chr

# ElementTree only grew its own ParseError in Python 2.7; older versions
# surface the underlying expat error directly
try:
    from xml.etree.ElementTree import ParseError as compat_xml_parse_error
except ImportError:  # Python 2.6
    from xml.parsers.expat import ExpatError as compat_xml_parse_error
194
try:
    from shlex import quote as shlex_quote
except ImportError:  # Python < 3.3
    def shlex_quote(s):
        # POSIX single-quoting: close the quote, emit an escaped single
        # quote, then reopen, i.e. ' becomes '"'"'
        return "'" + s.replace("'", "'\"'\"'") + "'"
200
201
def compat_ord(c):
    """Return the integer value of *c*, which is already an int when
    iterating bytes on Python 3 and a length-1 string on Python 2."""
    return c if type(c) is int else ord(c)
205
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default headers attached to every HTTP request; the User-Agent mimics a
# desktop Firefox to avoid trivial bot blocking
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
216
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported codec actually exists and can encode text;
        # some platforms report bogus or unusable values
        u'TEST'.encode(pref)
    except Exception:
        # Narrowed from a bare ``except:`` which also swallowed
        # KeyboardInterrupt/SystemExit
        pref = 'UTF-8'

    return pref
230
if sys.version_info < (3,0):
    def compat_print(s):
        # Python 2: encode for the terminal, turning unencodable characters
        # into XML character references instead of crashing
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
else:
    def compat_print(s):
        # Python 3 prints unicode natively; insist callers pass text
        assert type(s) == type(u'')
        print(s)
238
239
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically """

    # Write to a uniquely-named temporary file in the same directory, then
    # rename over the target, so readers never observe a partial file
    args = {
        'suffix': '.tmp',
        'prefix': os.path.basename(fn) + '.',
        'dir': os.path.dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**args)

    try:
        with tf:
            json.dump(obj, tf)
        os.rename(tf.name, fn)
    except BaseException:
        # Was a bare ``except:``; BaseException is explicit and keeps the
        # cleanup-then-reraise behavior even for KeyboardInterrupt
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
272
273
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # The asserts restrict key/val to harmless characters so the
        # interpolated XPath expression cannot be malformed
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + u"[@%s='%s']" % (key, val)
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val):
        # Python 2.6 ElementTree has no attribute predicates; filter manually
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
                return f
        return None
287
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' components of *path* into ElementTree's
    '{uri}tag' form, resolving prefixes through *ns_map*."""
    def expand(component):
        parts = component.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(c) for c in path.split('/'))
300
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric character reference, decimal (&#65;) or hexadecimal (&#x41;).
    # The hex digits a-f must be allowed explicitly: the previous pattern
    # ``#(x?\d+)`` rejected e.g. &#xa3; because \d excludes letters.
    mobj = re.match(u'(?u)#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith(u'x'):
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
325
# Backport of a stdlib HTMLParser bugfix: make the start-tag regex tolerate
# malformed/unquoted attributes instead of raising a parse error
compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser subclass that remembers the raw document in self.html so
    subclasses can slice the original text using parser positions."""

    def __init__(self):
        # Fixed: this was misspelled ``__init`` before, so the constructor
        # was never actually defined and self.html was only ever set by
        # loads()
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Parse *html* to completion, keeping the original string."""
        self.html = html
        self.feed(html)
        self.close()
336
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""
    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        self.result = None          # becomes [tag, startpos, endpos] when found
        self.started = False        # True while inside the wanted tag
        self.depth = {}             # open-tag depth per tag name while started
        self.watch_startpos = False # next event records the content start
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        # Tolerate broken HTML before the target tag: drop the offending
        # line and resume, up to 10 times (Python 2 HTMLParser raises here)
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        self.rawdata = '\n'.join(self.html.split('\n')[self.getpos()[0]:]) # skip one line
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attrs and attrs[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if not tag in self.depth: self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if self.started:
            if tag in self.depth: self.depth[tag] -= 1
            # The matching close of the original tag ends the capture
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Any event following the start tag marks where its content begins
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        # Need the tag name plus both (line, column) positions to slice
        if self.result is None:
            return None
        if len(self.result) != 3:
            return None
        lines = self.html.split('\n')
        lines = lines[self.result[1][0]-1:self.result[2][0]]
        lines[0] = lines[0][self.result[1][1]:]
        if len(lines) == 1:
            lines[-1] = lines[-1][:self.result[2][1]-self.result[1][1]]
        lines[-1] = lines[-1][:self.result[2][1]]
        return '\n'.join(lines).strip()
# Hack for https://github.com/rg3/youtube-dl/issues/662
# Old HTMLParser versions choke on the obfuscated closing tag "</scr'+'ipt>"
# found in some pages; skip over it verbatim instead of parsing it
if sys.version_info < (2, 7, 3):
    AttrParser.parse_endtag = (lambda self, i:
        i + len("</scr'+'ipt>")
        if self.rawdata[i:].startswith("</scr'+'ipt>")
        else compat_html_parser.HTMLParser.parse_endtag(self, i))
402
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin convenience wrapper over the generic attribute search
    return get_element_by_attribute("id", id, html)
406
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    parser = AttrParser(attribute, value)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: whatever was captured before the error still counts
        pass
    return parser.get_result()
415
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None   # NOTE(review): never written here -- looks vestigial
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag != 'meta':
            return
        attrs = dict(attrs)
        # Last matching <meta> wins if the name occurs more than once
        if attrs.get('name') == self.name:
            self.result = attrs.get('content')

    def get_result(self):
        return self.result
436
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    parser = MetaParser(name)
    try:
        parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: keep whatever was found before the parse error
        pass
    return parser.get_result()
447
448
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Accept None for convenience, mirroring unescapeHTML
    if html is None:
        return html
    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
460
461
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission problems will not be cured by renaming
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        # (fixed: os.path.join was previously handed a single generator
        # object, which raised TypeError instead of building a path)
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)
        ])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            # (fixed: the retry used to reopen the original ``filename``
            # instead of the sanitized ``alt_filename``)
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
495
496
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Unparseable input yields None rather than raising
        return None
    return email.utils.mktime_tz(parsed)
504
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return char

    result = u''.join(replace_insane(char) for char in s)
    if not is_id:
        # Collapse runs of underscores left over from the replacements
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if not result:
            result = '_'
    return result
536
def orderedSet(iterable):
    """Remove all duplicates from the input iterable, keeping the order of
    first occurrence.  Uses list membership (not a set) so that unhashable
    elements are supported too."""
    result = []
    for item in iterable:
        if item in result:
            continue
        result.append(item)
    return result
544
545
def unescapeHTML(s):
    """Replace HTML entities in *s* with their character equivalents.

    None passes through; anything else must already be a text string.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    result = re.sub(r'(?u)&(.+?);', htmlentity_transform, s)
    return result
553
554
def encodeFilename(s, for_subprocess=False):
    """
    Encode a filename for the current platform / Python version.

    @param s The name of the file
    @param for_subprocess Encode for passing to a child process rather than
           to filesystem APIs (matters only on Windows + Python 2)
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if not for_subprocess:
            return s
        else:
            # For subprocess calls, encode with locale encoding
            # Refer to http://stackoverflow.com/a/9951851/35070
            encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
581
582
def encodeArgument(s):
    """Encode a subprocess argument using the subprocess-appropriate rules."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
590
591
def decodeOption(optval):
    """Decode a command-line option value to text; None passes through."""
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        # Python 2 passes option values as bytes; decode with the locale
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
600
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS, M:SS, or plain seconds.

    Boundary fix: exactly 3600 seconds now renders as '1:00:00' (previously
    '60:00') and exactly 60 seconds as '1:00' (previously '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
608
609
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Build an HTTPS handler for urllib, preferring SSLv3.

    NOTE(review): SSLv3 is insecure and ssl.PROTOCOL_SSLv3 has been removed
    from modern Python builds -- confirm before reusing this code.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    # Proxy tunnelling: establish CONNECT before wrapping
                    self.sock = sock
                    self._tunnel()
                try:
                    # Try SSLv3 first, fall back to auto-negotiation
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    else:
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv3)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
643
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network trouble during extraction is always "expected": no bug report needed
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info() # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback rendered as a string, or None."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
668
669
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    # Inherits ExtractorError's message decoration and metadata fields
    pass
673
674
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept so callers can inspect/re-render the original failure
        self.exc_info = exc_info
686
687
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
695
696
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # Pass msg to Exception as well, so str(exc) is not empty
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
705
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    # Used purely as a control-flow signal; carries no extra data
    pass
709
710
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
718
719
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Give Exception a message so str(exc) is informative in logs
        # (it was previously empty); the attributes are unchanged
        super(ContentTooShortError, self).__init__(
            'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
734
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        # Try raw deflate (no zlib header) first, then zlib-wrapped data
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # addinfourl only accepts a 'code' argument on newer Pythons;
        # otherwise attach it after construction
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Re-apply std_headers so they override urllib's own defaults
        for h,v in std_headers.items():
            if h in req.headers:
                del req.headers[h]
            req.add_header(h, v)
        # Internal pseudo-headers are stripped before the request goes out
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with progressively fewer trailing bytes
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response
815
816
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # Recognize a trailing 'Z' or a +HH:MM / -HHMM style offset
    tz_m = re.search(
        r'Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$',
        date_str)
    offset = datetime.timedelta()
    if tz_m:
        date_str = date_str[:-len(tz_m.group(0))]
        if tz_m.group('sign'):
            direction = 1 if tz_m.group('sign') == '+' else -1
            offset = datetime.timedelta(
                hours=direction * int(tz_m.group('hours')),
                minutes=direction * int(tz_m.group('minutes')))
    fmt = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    # Subtracting the offset converts local time to UTC
    dt = datetime.datetime.strptime(date_str, fmt) - offset
    return calendar.timegm(dt.timetuple())
840
841
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None

    upload_date = None
    # Commas are never significant for parsing
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2, so strip any offset
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    # Try every expression; a later successful parse overwrites an earlier
    # one, preserving the historical "last match wins" behavior
    for expression in format_expressions:
        try:
            parsed = datetime.datetime.strptime(date_str, expression)
        except ValueError:
            continue
        upload_date = parsed.strftime('%Y%m%d')
    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
887
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from the path part of *url*; fall back to
    *default_ext* when no plausible (alphanumeric) extension is found."""
    if url is None:
        return default_ext
    # Drop the query string, then take whatever follows the last dot
    guess = url.partition(u'?')[0].rpartition(u'.')[2]
    return guess if re.match(r'^[A-Za-z0-9]+$', guess) else default_ext
896
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive a subtitle file name by swapping the media extension for
    '<language>.<format>'."""
    base = filename.rsplit('.', 1)[0]
    return u'.'.join((base, sub_lang, sub_format))
899
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # Rough approximation: months are 30 days, years 365 days
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        return today + datetime.timedelta(**{unit + 's': amount})
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
925
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        # Anything that is not exactly 8 digits passes through unchanged
        return date_str
    return '-'.join(match.groups())
934
class DateRange(object):
    """Represents a time interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the widest representable date range
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day,day)
    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        # Bounds are inclusive on both ends
        return self.start <= date <= self.end
    def __str__(self):
        return '%s - %s' % ( self.start.isoformat(), self.end.isoformat())
960
961
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Python 2 may hand back a bytestring; decode using the locale
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
970
971
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070
    # Uses WriteConsoleW so Unicode reaches the Windows console intact.

    import ctypes
    import ctypes.wintypes

    # Map fileno -> GetStdHandle id (stdout -> -11, stderr -> -12)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A redirected stream (file/pipe) must fall back to normal writes
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write BMP characters in chunks; non-BMP characters are written as
        # a UTF-16 surrogate pair (2 code units) one at a time
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count: # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1042
1043
def write_string(s, out=None, encoding=None):
    """Write the unicode string s to out (default: sys.stderr), coping with
    byte streams and Python 2's mis-reported stream modes."""
    out = sys.stderr if out is None else out
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3:
        # Python 2 lies about mode of sys.stderr
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1064
1065
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    # Python 3 yields ints when indexing bytes; Python 2 yields 1-char
    # strings that need ord().
    return list(bs) if isinstance(bs[0], int) else [ord(c) for c in bs]
1073
1074
def intlist_to_bytes(xs):
    """Convert a list of integer byte values back into a byte string."""
    if not xs:
        return b''
    if isinstance(chr(0), bytes):
        # Python 2: bytes is str, so assemble it from individual chars
        return ''.join(chr(x) for x in xs)
    return bytes(xs)
1082
1083
def get_cachedir(params=None):
    """Return the cache directory: params['cachedir'] when set, otherwise
    $XDG_CACHE_HOME/youtube-dl (defaulting XDG_CACHE_HOME to ~/.cache)."""
    # None sentinel instead of a mutable default argument.
    if params is None:
        params = {}
    cache_root = os.environ.get('XDG_CACHE_HOME',
                                os.path.expanduser('~/.cache'))
    return params.get('cachedir', os.path.join(cache_root, 'youtube-dl'))
1088
1089
# Cross-platform file locking: _lock_file/_unlock_file take an open file
# object and acquire/release an advisory whole-file lock (shared or
# exclusive), via LockFileEx on Windows and fcntl.lockf elsewhere.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # OVERLAPPED structure required by LockFileEx/UnlockFileEx; only the
    # offset fields are used (locking starts at offset 0).
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range covering (practically) the whole file, starting at offset 0
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED pointer alive on the file object; _unlock_file
        # must pass the same one back to UnlockFileEx.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 == LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        # POSIX advisory lock on the whole file
        fcntl.lockf(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.lockf(f, fcntl.LOCK_UN)
1153
1154
class locked_file(object):
    """File wrapper that holds an advisory lock (shared for 'r', exclusive
    for 'a'/'w') for the duration of a ``with`` block."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ('r', 'a', 'w')
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            # Any writing mode takes the lock exclusively
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1184
1185
def shell_quote(args):
    """Return a shell-safe command-line string built from a list of
    arguments (bytes arguments are decoded first)."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'

    def to_text(a):
        # We may get a filename encoded with 'encodeFilename'
        return a.decode(fs_encoding) if isinstance(a, bytes) else a

    return u' '.join(pipes.quote(to_text(a)) for a in args)
1197
1198
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
    for element in seq:
        yield element
        if not pred(element):
            break
1206
1207
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    smuggled = compat_urllib_parse.urlencode(
        {u'__youtubedl_smuggle': json.dumps(data)})
    return u'%s#%s' % (url, smuggled)
1214
1215
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url: return (url, data), or (url, default) when
    nothing was smuggled."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition(u'#')
    jsond = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
1223
1224
def format_bytes(bytes):
    """Render a byte count as a human-readable string (e.g. 1.50KiB);
    returns 'N/A' for None. Numeric strings are accepted too."""
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffix = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB',
              u'EiB', u'ZiB', u'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return u'%.2f%s' % (converted, suffix)
1237
1238
def get_term_width():
    """Return the terminal width in columns, or None if it cannot be
    determined (no COLUMNS variable and no usable `stty`)."""
    columns = os.environ.get('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit; stty may be missing (e.g. Windows)
        # or print nothing useful — treat all of that as "unknown".
        pass
    return None
1253
1254
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name in ENGLISH_NAMES:
        return ENGLISH_NAMES.index(name) + 1
    return None
1265
1266
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # A '&' is left alone when it already starts a named entity or a
    # numeric character reference.
    bare_amp = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_amp, u'&amp;', xml_str)
1273
1274
def setproctitle(title):
    """Set the process name shown by ps/top via prctl(PR_SET_NAME).

    Silently does nothing when libc is not loadable or has no prctl
    (i.e. on non-Linux platforms).
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Build the buffer from the bytes themselves: ctypes then sizes it to
    # len + 1 and NUL-terminates it. The previous exact-size buffer
    # (create_string_buffer(len(title_bytes))) was left without a trailing
    # NUL by the .value assignment, and prctl expects a NUL-terminated
    # string.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 == PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1288
1289
def remove_start(s, start):
    """Return s without the prefix `start`; s unchanged if absent."""
    return s[len(start):] if s.startswith(start) else s
1294
1295
def remove_end(s, end):
    """Return s without the suffix `end`; s unchanged if the suffix is
    absent.

    Guards against an empty `end`: ''.endswith matches everything and
    s[:-len('')] evaluates to s[:0], which silently erased the whole
    string in the previous version.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1300
1301
def url_basename(url):
    """Return the final path segment of a URL ('' when there is no path)."""
    parsed_path = compat_urlparse.urlparse(url).path
    segments = parsed_path.strip(u'/').split(u'/')
    return segments[-1]
1305
1306
class HEADRequest(compat_urllib_request.Request):
    """A Request that issues an HTTP HEAD instead of the default GET."""

    def get_method(self):
        return "HEAD"
1310
1311
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce v (optionally an attribute of v) to a scaled int, returning
    `default` for None or '' input."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    return int(v) * invscale // scale
1319
1320
def str_or_none(v, default=None):
    """Stringify v via compat_str, or return `default` when v is None."""
    if v is None:
        return default
    return compat_str(v)
1323
1324
def str_to_int(int_str):
    """Parse an int from a string, ignoring ',' and '.' group separators."""
    if int_str is None:
        return None
    return int(re.sub(r'[,\.]', u'', int_str))
1330
1331
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce v to a scaled float, returning `default` when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1334
1335
def parse_duration(s):
    """Parse durations like '1:23:45', '90', '2m30s' or '3.5' into seconds
    (float when fractional); return None for None or unparsable input."""
    if s is None:
        return None

    m = re.match(
        r'(?:(?:(?P<hours>[0-9]+)[:h])?(?P<mins>[0-9]+)[:m])?(?P<secs>[0-9]+)s?(?::[0-9]+)?(?P<ms>\.[0-9]+)?$', s)
    if not m:
        return None

    duration = int(m.group('secs'))
    mins = m.group('mins')
    if mins:
        duration += int(mins) * 60
    hours = m.group('hours')
    if hours:
        duration += int(hours) * 60 * 60
    if m.group('ms'):
        duration += float(m.group('ms'))
    return duration
1352
1353
def prepend_extension(filename, ext):
    """Insert `ext` before the real extension: ('a.mp4', 'temp') -> 'a.temp.mp4'."""
    name, real_ext = os.path.splitext(filename)
    return u'%s.%s%s' % (name, ext, real_ext)
1357
1358
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    cmd = [exe] + args
    try:
        proc = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        # Binary not found (or not startable)
        return False
    return exe
1367
1368
class PagedList(object):
    """Lazily-evaluated sequence backed by a page-fetching function.

    pagefunc(pagenum) must return an iterable with the items of 0-based
    page `pagenum`; every page holds `pagesize` items except possibly a
    shorter final page.
    """
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())

    def getslice(self, start=0, end=None):
        """Return the items in [start, end) as a plain list, fetching only
        the pages overlapping that range (end=None means "until the last
        page")."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of `start` inside this page; 0 for fully-covered pages
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past `end` inside this page, or None to keep all
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1414
1415
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escape sequences found in s into the
    characters they denote; everything else is left untouched."""
    decode = codecs.getdecoder('unicode_escape')

    def repl(m):
        return decode(m.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', repl, s)
1422
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def _encode_struct_spec(spec):
        return spec.encode('ascii') if isinstance(spec, compat_str) else spec

    def struct_pack(spec, *args):
        return struct.pack(_encode_struct_spec(spec), *args)

    def struct_unpack(spec, *args):
        return struct.unpack(_encode_struct_spec(spec), *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1439
1440
def read_batch_urls(batch_fd):
    """Return the list of URLs found in a batch file object, skipping
    comment lines ('#', ';', ']') and blanks; the file is closed."""

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # UTF-8 BOM bytes that survived a latin-1-style decode
        BOM_UTF8 = u'\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    urls = []
    with contextlib.closing(batch_fd) as fd:
        for line in fd:
            url = fixup(line)
            if url:
                urls.append(url)
    return urls
1455
1456
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1459
1460
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1465
1466
def parse_xml(s):
    """Parse an XML unicode string into an ElementTree element, ignoring
    any doctype declaration."""

    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    # The parser keyword only exists on Python >= 2.7
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    if sys.version_info < (3, 0):
        # Fix up XML parser in Python 2.x: decode byte-string text nodes
        for n in etree_iter(tree):
            if n.text is not None and not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')
    return tree
1482
1483
if sys.platform == 'win32' and sys.version_info < (3, 0):
    def compat_getpass(prompt, *args, **kwargs):
        # getpass on Python 2/Windows cannot handle unicode prompts
        if isinstance(prompt, compat_str):
            prompt = prompt.encode(preferredencoding())
        return getpass.getpass(prompt, *args, **kwargs)
else:
    compat_getpass = getpass.getpass
1491
1492
# US (MPAA-style) content ratings mapped to a minimum viewer age;
# presumably consumed as an 'age_limit' value by extractors — TODO confirm
# against callers.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1500
1501
def strip_jsonp(code):
    """Strip a JSONP callback wrapper, returning the bare payload."""
    callback_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?\s*$'
    return re.sub(callback_re, r'\1', code)
1504
1505
def js_to_json(code):
    """Turn JavaScript-flavoured object literals into valid JSON: quote
    bare/single-quoted keys and values, drop trailing commas in arrays."""
    def fix_kv(m):
        prefix, key, sep, value = m.group(1, 2, 3, 4)

        if key.startswith("'"):
            assert key.endswith("'")
            assert '"' not in key
            key = '"%s"' % key[1:-1]
        elif not key.startswith('"'):
            # Bare identifier key
            key = '"%s"' % key

        if value.startswith("'"):
            assert value.endswith("'")
            assert '"' not in value
            value = '"%s"' % value[1:-1]

        return prefix + key + sep + value

    res = re.sub(r'''(?x)
            ([{,]\s*)
            ("[^"]*"|\'[^\']*\'|[a-z0-9A-Z]+)
            (:\s*)
            ([0-9.]+|true|false|"[^"]*"|\'[^\']*\'|\[|\{)
        ''', fix_kv, code)
    res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
    return res
1532
1533
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the preference list, or -1 for unknown qualities
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1542
1543
# Default output filename template (%-style fields filled in elsewhere)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1545
try:
    subprocess_check_output = subprocess.check_output
except AttributeError:
    # Fallback for Python 2.6, which lacks subprocess.check_output
    def subprocess_check_output(*popenargs, **kwargs):
        """Run a command, return its stdout as bytes; raise
        CalledProcessError on a non-zero exit status."""
        assert 'input' not in kwargs
        p = subprocess.Popen(*popenargs, stdout=subprocess.PIPE, **kwargs)
        output, _ = p.communicate()
        ret = p.poll()
        if ret:
            # Popen only gained an .args attribute in Python 3.3, which can
            # never be present on interpreters that take this fallback, so
            # reconstruct the command from our own arguments (previously
            # `p.args` made this error path raise AttributeError instead of
            # CalledProcessError).
            cmd = kwargs.get('args', popenargs[0] if popenargs else None)
            raise subprocess.CalledProcessError(ret, cmd, output=output)
        return output