1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import calendar
7 import codecs
8 import contextlib
9 import ctypes
10 import datetime
11 import email.utils
12 import errno
13 import gzip
14 import itertools
15 import io
16 import json
17 import locale
18 import math
19 import os
20 import pipes
21 import platform
22 import re
23 import ssl
24 import socket
25 import struct
26 import subprocess
27 import sys
28 import tempfile
29 import traceback
30 import xml.etree.ElementTree
31 import zlib
32
33 from .compat import (
34 compat_chr,
35 compat_getenv,
36 compat_html_entities,
37 compat_parse_qs,
38 compat_str,
39 compat_urllib_error,
40 compat_urllib_parse,
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
43 compat_urlparse,
44 shlex_quote,
45 )
46
47
48 # This is not clearly defined otherwise
49 compiled_regex_type = type(re.compile(''))
50
51 std_headers = {
52 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
53 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
54 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
55 'Accept-Encoding': 'gzip, deflate',
56 'Accept-Language': 'en-us,en;q=0.5',
57 }
58
59
60 def preferredencoding():
61 """Get preferred encoding.
62
63 Returns the best encoding scheme for the system, based on
64 locale.getpreferredencoding() and some further tweaks.
65 """
66 try:
67 pref = locale.getpreferredencoding()
68 'TEST'.encode(pref)
    except Exception:
70 pref = 'UTF-8'
71
72 return pref
73
74
75 def write_json_file(obj, fn):
76 """ Encode obj as JSON and write it to fn, atomically if possible """
77
78 fn = encodeFilename(fn)
79 if sys.version_info < (3, 0) and sys.platform != 'win32':
80 encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non-ASCII characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(f).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
87 else:
88 path_basename = os.path.basename
89 path_dirname = os.path.dirname
90
91 args = {
92 'suffix': '.tmp',
93 'prefix': path_basename(fn) + '.',
94 'dir': path_dirname(fn),
95 'delete': False,
96 }
97
98 # In Python 2.x, json.dump expects a bytestream.
99 # In Python 3.x, it writes to a character stream
100 if sys.version_info < (3, 0):
101 args['mode'] = 'wb'
102 else:
103 args.update({
104 'mode': 'w',
105 'encoding': 'utf-8',
106 })
107
108 tf = tempfile.NamedTemporaryFile(**args)
109
110 try:
111 with tf:
112 json.dump(obj, tf)
113 if sys.platform == 'win32':
114 # Need to remove existing file on Windows, else os.rename raises
115 # WindowsError or FileExistsError.
116 try:
117 os.unlink(fn)
118 except OSError:
119 pass
120 os.rename(tf.name, fn)
121 except:
122 try:
123 os.remove(tf.name)
124 except OSError:
125 pass
126 raise
127
128
129 if sys.version_info >= (2, 7):
130 def find_xpath_attr(node, xpath, key, val):
131 """ Find the xpath xpath[@key=val] """
132 assert re.match(r'^[a-zA-Z-]+$', key)
133 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
134 expr = xpath + "[@%s='%s']" % (key, val)
135 return node.find(expr)
136 else:
137 def find_xpath_attr(node, xpath, key, val):
138 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
139 # .//node does not match if a node is a direct child of . !
140 if isinstance(xpath, unicode):
141 xpath = xpath.encode('ascii')
142
143 for f in node.findall(xpath):
144 if f.attrib.get(key) == val:
145 return f
146 return None
147
148 # On python2.6 the xml.etree.ElementTree.Element methods don't support
149 # the namespace parameter
150
151
152 def xpath_with_ns(path, ns_map):
153 components = [c.split(':') for c in path.split('/')]
154 replaced = []
155 for c in components:
156 if len(c) == 1:
157 replaced.append(c[0])
158 else:
159 ns, tag = c
160 replaced.append('{%s}%s' % (ns_map[ns], tag))
161 return '/'.join(replaced)
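# Illustrative usage (comment only, with a made-up namespace mapping):
#   xpath_with_ns('media:song/url', {'media': 'http://example.com/ns'})
#   returns '{http://example.com/ns}song/url'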
162
163
164 def xpath_text(node, xpath, name=None, fatal=False):
165 if sys.version_info < (2, 7): # Crazy 2.6
166 xpath = xpath.encode('ascii')
167
168 n = node.find(xpath)
169 if n is None or n.text is None:
170 if fatal:
171 name = xpath if name is None else name
172 raise ExtractorError('Could not find XML element %s' % name)
173 else:
174 return None
175 return n.text
176
177
178 def get_element_by_id(id, html):
179 """Return the content of the tag with the specified ID in the passed HTML document"""
180 return get_element_by_attribute("id", id, html)
181
182
183 def get_element_by_attribute(attribute, value, html):
184 """Return the content of the tag with the specified attribute in the passed HTML document"""
185
186 m = re.search(r'''(?xs)
187 <([a-zA-Z0-9:._-]+)
188 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
189 \s+%s=['"]?%s['"]?
190 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
191 \s*>
192 (?P<content>.*?)
193 </\1>
194 ''' % (re.escape(attribute), re.escape(value)), html)
195
196 if not m:
197 return None
198 res = m.group('content')
199
200 if res.startswith('"') or res.startswith("'"):
201 res = res[1:-1]
202
203 return unescapeHTML(res)
204
205
206 def clean_html(html):
207 """Clean an HTML snippet into a readable string"""
208
209 if html is None: # Convenience for sanitizing descriptions etc.
210 return html
211
212 # Newline vs <br />
213 html = html.replace('\n', ' ')
214 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
215 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
216 # Strip html tags
217 html = re.sub('<.*?>', '', html)
218 # Replace html entities
219 html = unescapeHTML(html)
220 return html.strip()
221
222
223 def sanitize_open(filename, open_mode):
224 """Try to open the given filename, and slightly tweak it if this fails.
225
226 Attempts to open the given filename. If this fails, it tries to change
227 the filename slightly, step by step, until it's either able to open it
228 or it fails and raises a final exception, like the standard open()
229 function.
230
231 It returns the tuple (stream, definitive_file_name).
232 """
233 try:
234 if filename == '-':
235 if sys.platform == 'win32':
236 import msvcrt
237 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
238 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
239 stream = open(encodeFilename(filename), open_mode)
240 return (stream, filename)
241 except (IOError, OSError) as err:
242 if err.errno in (errno.EACCES,):
243 raise
244
        # In case of error, try to remove win32 forbidden chars
        alt_filename = os.path.join(*(
            re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
            for path_part in os.path.split(filename)))
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
256
257
258 def timeconvert(timestr):
259 """Convert RFC 2822 defined time string into system timestamp"""
260 timestamp = None
261 timetuple = email.utils.parsedate_tz(timestr)
262 if timetuple is not None:
263 timestamp = email.utils.mktime_tz(timetuple)
264 return timestamp
265
266
267 def sanitize_filename(s, restricted=False, is_id=False):
268 """Sanitizes a string so it could be used as part of a filename.
269 If restricted is set, use a stricter subset of allowed characters.
270 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
271 """
272 def replace_insane(char):
273 if char == '?' or ord(char) < 32 or ord(char) == 127:
274 return ''
275 elif char == '"':
276 return '' if restricted else '\''
277 elif char == ':':
278 return '_-' if restricted else ' -'
279 elif char in '\\/|*<>':
280 return '_'
281 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
282 return '_'
283 if restricted and ord(char) > 127:
284 return '_'
285 return char
286
287 result = ''.join(map(replace_insane, s))
288 if not is_id:
289 while '__' in result:
290 result = result.replace('__', '_')
291 result = result.strip('_')
292 # Common case of "Foreign band name - English song title"
293 if restricted and result.startswith('-_'):
294 result = result[2:]
295 if not result:
296 result = '_'
297 return result
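# Illustrative behaviour (comment only; results follow from replace_insane above):
#   sanitize_filename('AC/DC: Back in Black?')                  -> 'AC_DC - Back in Black'
#   sanitize_filename('AC/DC: Back in Black?', restricted=True) -> 'AC_DC_-_Back_in_Black'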
298
299
300 def orderedSet(iterable):
301 """ Remove all duplicates from the input iterable """
302 res = []
303 for el in iterable:
304 if el not in res:
305 res.append(el)
306 return res
307
308
309 def _htmlentity_transform(entity):
310 """Transforms an HTML entity to a character."""
311 # Known non-numeric HTML entity
312 if entity in compat_html_entities.name2codepoint:
313 return compat_chr(compat_html_entities.name2codepoint[entity])
314
    # Hexadecimal entities (e.g. &#x2F;) may also contain the digits a-f
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
316 if mobj is not None:
317 numstr = mobj.group(1)
318 if numstr.startswith('x'):
319 base = 16
320 numstr = '0%s' % numstr
321 else:
322 base = 10
323 return compat_chr(int(numstr, base))
324
325 # Unknown entity in name, return its literal representation
326 return ('&%s;' % entity)
327
328
329 def unescapeHTML(s):
330 if s is None:
331 return None
332 assert type(s) == compat_str
333
334 return re.sub(
335 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
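# Illustrative examples (comment only):
#   unescapeHTML('&amp;')  -> '&'
#   unescapeHTML('&#39;')  -> "'"
#   unescapeHTML('&#x27;') -> "'"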
336
337
338 def encodeFilename(s, for_subprocess=False):
339 """
340 @param s The name of the file
341 """
342
343 assert type(s) == compat_str
344
345 # Python 3 has a Unicode API
346 if sys.version_info >= (3, 0):
347 return s
348
349 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
350 # Pass '' directly to use Unicode APIs on Windows 2000 and up
351 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
352 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
353 if not for_subprocess:
354 return s
355 else:
356 # For subprocess calls, encode with locale encoding
357 # Refer to http://stackoverflow.com/a/9951851/35070
358 encoding = preferredencoding()
359 else:
360 encoding = sys.getfilesystemencoding()
361 if encoding is None:
362 encoding = 'utf-8'
363 return s.encode(encoding, 'ignore')
364
365
366 def encodeArgument(s):
367 if not isinstance(s, compat_str):
368 # Legacy code that uses byte strings
369 # Uncomment the following line after fixing all post processors
370 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
371 s = s.decode('ascii')
372 return encodeFilename(s, True)
373
374
375 def decodeOption(optval):
376 if optval is None:
377 return optval
378 if isinstance(optval, bytes):
379 optval = optval.decode(preferredencoding())
380
381 assert isinstance(optval, compat_str)
382 return optval
383
384
385 def formatSeconds(secs):
386 if secs > 3600:
387 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
388 elif secs > 60:
389 return '%d:%02d' % (secs // 60, secs % 60)
390 else:
391 return '%d' % secs
392
393
394 def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
395 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
396 context = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
397 if opts_no_check_certificate:
398 context.verify_mode = ssl.CERT_NONE
399 try:
400 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
401 except TypeError:
402 # Python 2.7.8
403 # (create_default_context present but HTTPSHandler has no context=)
404 pass
405
406 if sys.version_info < (3, 2):
407 import httplib
408
409 class HTTPSConnectionV3(httplib.HTTPSConnection):
410 def __init__(self, *args, **kwargs):
411 httplib.HTTPSConnection.__init__(self, *args, **kwargs)
412
413 def connect(self):
414 sock = socket.create_connection((self.host, self.port), self.timeout)
415 if getattr(self, '_tunnel_host', False):
416 self.sock = sock
417 self._tunnel()
418 try:
419 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
420 except ssl.SSLError:
421 self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)
422
423 class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
424 def https_open(self, req):
425 return self.do_open(HTTPSConnectionV3, req)
426 return HTTPSHandlerV3(**kwargs)
427 else: # Python < 3.4
428 context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
429 context.verify_mode = (ssl.CERT_NONE
430 if opts_no_check_certificate
431 else ssl.CERT_REQUIRED)
432 context.set_default_verify_paths()
433 return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
434
435
436 class ExtractorError(Exception):
437 """Error during info extraction."""
438
439 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
440 """ tb, if given, is the original traceback (so that it can be printed out).
441 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
442 """
443
444 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
445 expected = True
446 if video_id is not None:
447 msg = video_id + ': ' + msg
448 if cause:
449 msg += ' (caused by %r)' % cause
450 if not expected:
451 if ytdl_is_updateable():
452 update_cmd = 'type youtube-dl -U to update'
453 else:
454 update_cmd = 'see https://yt-dl.org/update on how to update'
455 msg += '; please report this issue on https://yt-dl.org/bug .'
456 msg += ' Make sure you are using the latest version; %s.' % update_cmd
457 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
458 super(ExtractorError, self).__init__(msg)
459
460 self.traceback = tb
461 self.exc_info = sys.exc_info() # preserve original exception
462 self.cause = cause
463 self.video_id = video_id
464
465 def format_traceback(self):
466 if self.traceback is None:
467 return None
468 return ''.join(traceback.format_tb(self.traceback))
469
470
471 class UnsupportedError(ExtractorError):
472 def __init__(self, url):
473 super(UnsupportedError, self).__init__(
474 'Unsupported URL: %s' % url, expected=True)
475 self.url = url
476
477
478 class RegexNotFoundError(ExtractorError):
479 """Error when a regex didn't match"""
480 pass
481
482
483 class DownloadError(Exception):
484 """Download Error exception.
485
486 This exception may be thrown by FileDownloader objects if they are not
487 configured to continue on errors. They will contain the appropriate
488 error message.
489 """
490
491 def __init__(self, msg, exc_info=None):
492 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
493 super(DownloadError, self).__init__(msg)
494 self.exc_info = exc_info
495
496
497 class SameFileError(Exception):
498 """Same File exception.
499
500 This exception will be thrown by FileDownloader objects if they detect
501 multiple files would have to be downloaded to the same file on disk.
502 """
503 pass
504
505
506 class PostProcessingError(Exception):
507 """Post Processing exception.
508
509 This exception may be raised by PostProcessor's .run() method to
510 indicate an error in the postprocessing task.
511 """
512
513 def __init__(self, msg):
514 self.msg = msg
515
516
517 class MaxDownloadsReached(Exception):
518 """ --max-downloads limit has been reached. """
519 pass
520
521
522 class UnavailableVideoError(Exception):
523 """Unavailable Format exception.
524
525 This exception will be thrown when a video is requested
526 in a format that is not available for that video.
527 """
528 pass
529
530
531 class ContentTooShortError(Exception):
532 """Content Too Short exception.
533
534 This exception may be raised by FileDownloader objects when a file they
535 download is too small for what the server announced first, indicating
536 the connection was probably interrupted.
537 """
538 # Both in bytes
539 downloaded = None
540 expected = None
541
542 def __init__(self, downloaded, expected):
543 self.downloaded = downloaded
544 self.expected = expected
545
546
547 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
548 """Handler for HTTP requests and responses.
549
550 This class, when installed with an OpenerDirector, automatically adds
551 the standard headers to every HTTP request and handles gzipped and
552 deflated responses from web servers. If compression is to be avoided in
553 a particular request, the original request in the program code only has
554 to include the HTTP header "Youtubedl-No-Compression", which will be
555 removed before making the real request.
556
557 Part of this code was copied from:
558
559 http://techknack.net/python-urllib2-handlers/
560
561 Andrew Rowls, the author of that code, agreed to release it to the
562 public domain.
563 """
564
565 @staticmethod
566 def deflate(data):
567 try:
568 return zlib.decompress(data, -zlib.MAX_WBITS)
569 except zlib.error:
570 return zlib.decompress(data)
571
572 @staticmethod
573 def addinfourl_wrapper(stream, headers, url, code):
574 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
575 return compat_urllib_request.addinfourl(stream, headers, url, code)
576 ret = compat_urllib_request.addinfourl(stream, headers, url)
577 ret.code = code
578 return ret
579
580 def http_request(self, req):
581 for h, v in std_headers.items():
582 if h not in req.headers:
583 req.add_header(h, v)
584 if 'Youtubedl-no-compression' in req.headers:
585 if 'Accept-encoding' in req.headers:
586 del req.headers['Accept-encoding']
587 del req.headers['Youtubedl-no-compression']
588 if 'Youtubedl-user-agent' in req.headers:
589 if 'User-agent' in req.headers:
590 del req.headers['User-agent']
591 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
592 del req.headers['Youtubedl-user-agent']
593
594 if sys.version_info < (2, 7) and '#' in req.get_full_url():
595 # Python 2.6 is brain-dead when it comes to fragments
596 req._Request__original = req._Request__original.partition('#')[0]
597 req._Request__r_type = req._Request__r_type.partition('#')[0]
598
599 return req
600
601 def http_response(self, req, resp):
602 old_resp = resp
603 # gzip
604 if resp.headers.get('Content-encoding', '') == 'gzip':
605 content = resp.read()
606 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
607 try:
608 uncompressed = io.BytesIO(gz.read())
609 except IOError as original_ioerror:
                # There may be junk at the end of the file
611 # See http://stackoverflow.com/q/4928560/35070 for details
612 for i in range(1, 1024):
613 try:
614 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
615 uncompressed = io.BytesIO(gz.read())
616 except IOError:
617 continue
618 break
619 else:
620 raise original_ioerror
621 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
622 resp.msg = old_resp.msg
623 # deflate
624 if resp.headers.get('Content-encoding', '') == 'deflate':
625 gz = io.BytesIO(self.deflate(resp.read()))
626 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
627 resp.msg = old_resp.msg
628 return resp
629
630 https_request = http_request
631 https_response = http_response
632
633
634 def parse_iso8601(date_str, delimiter='T'):
635 """ Return a UNIX timestamp from the given date """
636
637 if date_str is None:
638 return None
639
640 m = re.search(
641 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
642 date_str)
643 if not m:
644 timezone = datetime.timedelta()
645 else:
646 date_str = date_str[:-len(m.group(0))]
647 if not m.group('sign'):
648 timezone = datetime.timedelta()
649 else:
650 sign = 1 if m.group('sign') == '+' else -1
651 timezone = datetime.timedelta(
652 hours=sign * int(m.group('hours')),
653 minutes=sign * int(m.group('minutes')))
654 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
655 dt = datetime.datetime.strptime(date_str, date_format) - timezone
656 return calendar.timegm(dt.timetuple())
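# Illustrative example (comment only): the UTC offset is applied before conversion,
#   parse_iso8601('2014-12-17T08:30:00+02:00') -> 1418797800  (i.e. 06:30 UTC)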
657
658
659 def unified_strdate(date_str, day_first=True):
660 """Return a string with the date in the format YYYYMMDD"""
661
662 if date_str is None:
663 return None
664 upload_date = None
665 # Replace commas
666 date_str = date_str.replace(',', ' ')
667 # %z (UTC offset) is only supported in python>=3.2
668 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
669 # Remove AM/PM + timezone
670 date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
671
672 format_expressions = [
673 '%d %B %Y',
674 '%d %b %Y',
675 '%B %d %Y',
676 '%b %d %Y',
677 '%b %dst %Y %I:%M%p',
678 '%b %dnd %Y %I:%M%p',
679 '%b %dth %Y %I:%M%p',
680 '%Y-%m-%d',
681 '%Y/%m/%d',
682 '%d.%m.%Y',
683 '%d/%m/%Y',
684 '%d/%m/%y',
685 '%Y/%m/%d %H:%M:%S',
686 '%Y-%m-%d %H:%M:%S',
687 '%Y-%m-%d %H:%M:%S.%f',
688 '%d.%m.%Y %H:%M',
689 '%d.%m.%Y %H.%M',
690 '%Y-%m-%dT%H:%M:%SZ',
691 '%Y-%m-%dT%H:%M:%S.%fZ',
692 '%Y-%m-%dT%H:%M:%S.%f0Z',
693 '%Y-%m-%dT%H:%M:%S',
694 '%Y-%m-%dT%H:%M:%S.%f',
695 '%Y-%m-%dT%H:%M',
696 ]
697 if day_first:
698 format_expressions.extend([
699 '%d/%m/%Y %H:%M:%S',
700 ])
701 else:
702 format_expressions.extend([
703 '%m/%d/%Y %H:%M:%S',
704 ])
705 for expression in format_expressions:
706 try:
707 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
708 except ValueError:
709 pass
710 if upload_date is None:
711 timetuple = email.utils.parsedate_tz(date_str)
712 if timetuple:
713 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
714 return upload_date
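# Illustrative examples (comment only):
#   unified_strdate('December 21, 2010') -> '20101221'
#   unified_strdate('8/7/2009')          -> '20090708'   (day-first by default)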
715
716
717 def determine_ext(url, default_ext='unknown_video'):
718 if url is None:
719 return default_ext
720 guess = url.partition('?')[0].rpartition('.')[2]
721 if re.match(r'^[A-Za-z0-9]+$', guess):
722 return guess
723 else:
724 return default_ext
725
726
727 def subtitles_filename(filename, sub_lang, sub_format):
728 return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
729
730
731 def date_from_str(date_str):
732 """
733 Return a datetime object from a string in the format YYYYMMDD or
734 (now|today)[+-][0-9](day|week|month|year)(s)?"""
735 today = datetime.date.today()
736 if date_str in ('now', 'today'):
737 return today
738 if date_str == 'yesterday':
739 return today - datetime.timedelta(days=1)
740 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
741 if match is not None:
742 sign = match.group('sign')
743 time = int(match.group('time'))
744 if sign == '-':
745 time = -time
746 unit = match.group('unit')
        # A bad approximation?
748 if unit == 'month':
749 unit = 'day'
750 time *= 30
751 elif unit == 'year':
752 unit = 'day'
753 time *= 365
754 unit += 's'
755 delta = datetime.timedelta(**{unit: time})
756 return today + delta
757 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
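# Illustrative examples (comment only):
#   date_from_str('now-1week') -> today's date minus 7 days
#   date_from_str('20141221')  -> datetime.date(2014, 12, 21)
# Note that 'month' and 'year' offsets are approximated as 30 and 365 days.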
758
759
760 def hyphenate_date(date_str):
761 """
762 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
763 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
764 if match is not None:
765 return '-'.join(match.groups())
766 else:
767 return date_str
768
769
770 class DateRange(object):
771 """Represents a time interval between two dates"""
772
773 def __init__(self, start=None, end=None):
774 """start and end must be strings in the format accepted by date"""
775 if start is not None:
776 self.start = date_from_str(start)
777 else:
778 self.start = datetime.datetime.min.date()
779 if end is not None:
780 self.end = date_from_str(end)
781 else:
782 self.end = datetime.datetime.max.date()
783 if self.start > self.end:
            raise ValueError('Date range: "%s", the start date must be before the end date' % self)
785
786 @classmethod
787 def day(cls, day):
788 """Returns a range that only contains the given day"""
789 return cls(day, day)
790
791 def __contains__(self, date):
792 """Check if the date is in the range"""
793 if not isinstance(date, datetime.date):
794 date = date_from_str(date)
795 return self.start <= date <= self.end
796
797 def __str__(self):
798 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
799
800
801 def platform_name():
802 """ Returns the platform name as a compat_str """
803 res = platform.platform()
804 if isinstance(res, bytes):
805 res = res.decode(preferredencoding())
806
807 assert isinstance(res, compat_str)
808 return res
809
810
811 def _windows_write_string(s, out):
812 """ Returns True if the string was written using special methods,
813 False if it has yet to be written out."""
814 # Adapted from http://stackoverflow.com/a/3259271/35070
815
816 import ctypes
817 import ctypes.wintypes
818
819 WIN_OUTPUT_IDS = {
820 1: -11,
821 2: -12,
822 }
823
824 try:
825 fileno = out.fileno()
826 except AttributeError:
827 # If the output stream doesn't have a fileno, it's virtual
828 return False
829 if fileno not in WIN_OUTPUT_IDS:
830 return False
831
832 GetStdHandle = ctypes.WINFUNCTYPE(
833 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
834 (b"GetStdHandle", ctypes.windll.kernel32))
835 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
836
837 WriteConsoleW = ctypes.WINFUNCTYPE(
838 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
839 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
840 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
841 written = ctypes.wintypes.DWORD(0)
842
843 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
844 FILE_TYPE_CHAR = 0x0002
845 FILE_TYPE_REMOTE = 0x8000
846 GetConsoleMode = ctypes.WINFUNCTYPE(
847 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
848 ctypes.POINTER(ctypes.wintypes.DWORD))(
849 (b"GetConsoleMode", ctypes.windll.kernel32))
850 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
851
852 def not_a_console(handle):
853 if handle == INVALID_HANDLE_VALUE or handle is None:
854 return True
855 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
856 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
857
858 if not_a_console(h):
859 return False
860
861 def next_nonbmp_pos(s):
862 try:
863 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
864 except StopIteration:
865 return len(s)
866
867 while s:
868 count = min(next_nonbmp_pos(s), 1024)
869
870 ret = WriteConsoleW(
871 h, s, count if count else 2, ctypes.byref(written), None)
872 if ret == 0:
873 raise OSError('Failed to write string')
874 if not count: # We just wrote a non-BMP character
875 assert written.value == 2
876 s = s[1:]
877 else:
878 assert written.value > 0
879 s = s[written.value:]
880 return True
881
882
883 def write_string(s, out=None, encoding=None):
884 if out is None:
885 out = sys.stderr
886 assert type(s) == compat_str
887
888 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
889 if _windows_write_string(s, out):
890 return
891
892 if ('b' in getattr(out, 'mode', '') or
893 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
894 byt = s.encode(encoding or preferredencoding(), 'ignore')
895 out.write(byt)
896 elif hasattr(out, 'buffer'):
897 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
898 byt = s.encode(enc, 'ignore')
899 out.buffer.write(byt)
900 else:
901 out.write(s)
902 out.flush()
903
904
905 def bytes_to_intlist(bs):
906 if not bs:
907 return []
908 if isinstance(bs[0], int): # Python 3
909 return list(bs)
910 else:
911 return [ord(c) for c in bs]
912
913
914 def intlist_to_bytes(xs):
915 if not xs:
916 return b''
917 return struct_pack('%dB' % len(xs), *xs)
918
919
920 # Cross-platform file locking
921 if sys.platform == 'win32':
922 import ctypes.wintypes
923 import msvcrt
924
925 class OVERLAPPED(ctypes.Structure):
926 _fields_ = [
927 ('Internal', ctypes.wintypes.LPVOID),
928 ('InternalHigh', ctypes.wintypes.LPVOID),
929 ('Offset', ctypes.wintypes.DWORD),
930 ('OffsetHigh', ctypes.wintypes.DWORD),
931 ('hEvent', ctypes.wintypes.HANDLE),
932 ]
933
934 kernel32 = ctypes.windll.kernel32
935 LockFileEx = kernel32.LockFileEx
936 LockFileEx.argtypes = [
937 ctypes.wintypes.HANDLE, # hFile
938 ctypes.wintypes.DWORD, # dwFlags
939 ctypes.wintypes.DWORD, # dwReserved
940 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
941 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
942 ctypes.POINTER(OVERLAPPED) # Overlapped
943 ]
944 LockFileEx.restype = ctypes.wintypes.BOOL
945 UnlockFileEx = kernel32.UnlockFileEx
946 UnlockFileEx.argtypes = [
947 ctypes.wintypes.HANDLE, # hFile
948 ctypes.wintypes.DWORD, # dwReserved
949 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
950 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
951 ctypes.POINTER(OVERLAPPED) # Overlapped
952 ]
953 UnlockFileEx.restype = ctypes.wintypes.BOOL
954 whole_low = 0xffffffff
955 whole_high = 0x7fffffff
956
957 def _lock_file(f, exclusive):
958 overlapped = OVERLAPPED()
959 overlapped.Offset = 0
960 overlapped.OffsetHigh = 0
961 overlapped.hEvent = 0
962 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
963 handle = msvcrt.get_osfhandle(f.fileno())
964 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
965 whole_low, whole_high, f._lock_file_overlapped_p):
966 raise OSError('Locking file failed: %r' % ctypes.FormatError())
967
968 def _unlock_file(f):
969 assert f._lock_file_overlapped_p
970 handle = msvcrt.get_osfhandle(f.fileno())
971 if not UnlockFileEx(handle, 0,
972 whole_low, whole_high, f._lock_file_overlapped_p):
973 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
974
975 else:
976 import fcntl
977
978 def _lock_file(f, exclusive):
979 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
980
981 def _unlock_file(f):
982 fcntl.flock(f, fcntl.LOCK_UN)
983
984
985 class locked_file(object):
986 def __init__(self, filename, mode, encoding=None):
987 assert mode in ['r', 'a', 'w']
988 self.f = io.open(filename, mode, encoding=encoding)
989 self.mode = mode
990
991 def __enter__(self):
992 exclusive = self.mode != 'r'
993 try:
994 _lock_file(self.f, exclusive)
995 except IOError:
996 self.f.close()
997 raise
998 return self
999
1000 def __exit__(self, etype, value, traceback):
1001 try:
1002 _unlock_file(self.f)
1003 finally:
1004 self.f.close()
1005
1006 def __iter__(self):
1007 return iter(self.f)
1008
1009 def write(self, *args):
1010 return self.f.write(*args)
1011
1012 def read(self, *args):
1013 return self.f.read(*args)
1014
1015
1016 def get_filesystem_encoding():
1017 encoding = sys.getfilesystemencoding()
1018 return encoding if encoding is not None else 'utf-8'
1019
1020
1021 def shell_quote(args):
1022 quoted_args = []
1023 encoding = get_filesystem_encoding()
1024 for a in args:
1025 if isinstance(a, bytes):
1026 # We may get a filename encoded with 'encodeFilename'
1027 a = a.decode(encoding)
1028 quoted_args.append(pipes.quote(a))
1029 return ' '.join(quoted_args)
1030
1031
1032 def takewhile_inclusive(pred, seq):
1033 """ Like itertools.takewhile, but include the latest evaluated element
1034 (the first element so that Not pred(e)) """
1035 for e in seq:
1036 yield e
1037 if not pred(e):
1038 return
1039
1040
1041 def smuggle_url(url, data):
1042 """ Pass additional data in a URL for internal use. """
1043
1044 sdata = compat_urllib_parse.urlencode(
1045 {'__youtubedl_smuggle': json.dumps(data)})
1046 return url + '#' + sdata
1047
1048
1049 def unsmuggle_url(smug_url, default=None):
1050 if '#__youtubedl_smuggle' not in smug_url:
1051 return smug_url, default
1052 url, _, sdata = smug_url.rpartition('#')
1053 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1054 data = json.loads(jsond)
1055 return url, data
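# Illustrative round trip (comment only, with a made-up payload):
#   smuggled = smuggle_url('http://example.com/video', {'force_ext': 'mp4'})
#   unsmuggle_url(smuggled) -> ('http://example.com/video', {'force_ext': 'mp4'})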
1056
1057
1058 def format_bytes(bytes):
1059 if bytes is None:
1060 return 'N/A'
1061 if type(bytes) is str:
1062 bytes = float(bytes)
1063 if bytes == 0.0:
1064 exponent = 0
1065 else:
1066 exponent = int(math.log(bytes, 1024.0))
1067 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1068 converted = float(bytes) / float(1024 ** exponent)
1069 return '%.2f%s' % (converted, suffix)
1070
1071
1072 def parse_filesize(s):
1073 if s is None:
1074 return None
1075
    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
1078 _UNIT_TABLE = {
1079 'B': 1,
1080 'b': 1,
1081 'KiB': 1024,
1082 'KB': 1000,
1083 'kB': 1024,
1084 'Kb': 1000,
1085 'MiB': 1024 ** 2,
1086 'MB': 1000 ** 2,
1087 'mB': 1024 ** 2,
1088 'Mb': 1000 ** 2,
1089 'GiB': 1024 ** 3,
1090 'GB': 1000 ** 3,
1091 'gB': 1024 ** 3,
1092 'Gb': 1000 ** 3,
1093 'TiB': 1024 ** 4,
1094 'TB': 1000 ** 4,
1095 'tB': 1024 ** 4,
1096 'Tb': 1000 ** 4,
1097 'PiB': 1024 ** 5,
1098 'PB': 1000 ** 5,
1099 'pB': 1024 ** 5,
1100 'Pb': 1000 ** 5,
1101 'EiB': 1024 ** 6,
1102 'EB': 1000 ** 6,
1103 'eB': 1024 ** 6,
1104 'Eb': 1000 ** 6,
1105 'ZiB': 1024 ** 7,
1106 'ZB': 1000 ** 7,
1107 'zB': 1024 ** 7,
1108 'Zb': 1000 ** 7,
1109 'YiB': 1024 ** 8,
1110 'YB': 1000 ** 8,
1111 'yB': 1024 ** 8,
1112 'Yb': 1000 ** 8,
1113 }
1114
1115 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1116 m = re.match(
1117 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1118 if not m:
1119 return None
1120
1121 num_str = m.group('num').replace(',', '.')
1122 mult = _UNIT_TABLE[m.group('unit')]
1123 return int(float(num_str) * mult)
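# Illustrative examples (comment only):
#   parse_filesize('12 MiB') -> 12582912    (12 * 1024 ** 2)
#   parse_filesize('5 GB')   -> 5000000000  (5 * 1000 ** 3)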
1124
1125
1126 def get_term_width():
1127 columns = compat_getenv('COLUMNS', None)
1128 if columns:
1129 return int(columns)
1130
1131 try:
1132 sp = subprocess.Popen(
1133 ['stty', 'size'],
1134 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1135 out, err = sp.communicate()
1136 return int(out.split()[1])
    except Exception:
1138 pass
1139 return None
1140
1141
1142 def month_by_name(name):
1143 """ Return the number of a month by (locale-independently) English name """
1144
1145 ENGLISH_NAMES = [
1146 'January', 'February', 'March', 'April', 'May', 'June',
1147 'July', 'August', 'September', 'October', 'November', 'December']
1148 try:
1149 return ENGLISH_NAMES.index(name) + 1
1150 except ValueError:
1151 return None
1152
1153
1154 def fix_xml_ampersands(xml_str):
1155 """Replace all the '&' by '&amp;' in XML"""
1156 return re.sub(
1157 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1158 '&amp;',
1159 xml_str)
1160
1161
1162 def setproctitle(title):
1163 assert isinstance(title, compat_str)
1164 try:
1165 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1166 except OSError:
1167 return
1168 title_bytes = title.encode('utf-8')
1169 buf = ctypes.create_string_buffer(len(title_bytes))
1170 buf.value = title_bytes
1171 try:
1172 libc.prctl(15, buf, 0, 0, 0)
1173 except AttributeError:
1174 return # Strange libc, just skip this
1175
1176
1177 def remove_start(s, start):
1178 if s.startswith(start):
1179 return s[len(start):]
1180 return s
1181
1182
1183 def remove_end(s, end):
1184 if s.endswith(end):
1185 return s[:-len(end)]
1186 return s
1187
1188
1189 def url_basename(url):
1190 path = compat_urlparse.urlparse(url).path
1191 return path.strip('/').split('/')[-1]
1192
1193
1194 class HEADRequest(compat_urllib_request.Request):
1195 def get_method(self):
1196 return "HEAD"
1197
1198
1199 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1200 if get_attr:
1201 if v is not None:
1202 v = getattr(v, get_attr, None)
1203 if v == '':
1204 v = None
1205 return default if v is None else (int(v) * invscale // scale)
1206
1207
1208 def str_or_none(v, default=None):
1209 return default if v is None else compat_str(v)
1210
1211
1212 def str_to_int(int_str):
1213 """ A more relaxed version of int_or_none """
1214 if int_str is None:
1215 return None
1216 int_str = re.sub(r'[,\.\+]', '', int_str)
1217 return int(int_str)
1218
1219
1220 def float_or_none(v, scale=1, invscale=1, default=None):
1221 return default if v is None else (float(v) * invscale / scale)
1222
1223
1224 def parse_duration(s):
1225 if s is None:
1226 return None
1227
1228 s = s.strip()
1229
1230 m = re.match(
1231 r'''(?ix)T?
1232 (?:
1233 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1234 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1235
1236 (?:
1237 (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
1238 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1239 )?
1240 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1241 )$''', s)
1242 if not m:
1243 return None
1244 res = 0
1245 if m.group('only_mins'):
1246 return float_or_none(m.group('only_mins'), invscale=60)
1247 if m.group('only_hours'):
1248 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1249 if m.group('secs'):
1250 res += int(m.group('secs'))
1251 if m.group('mins'):
1252 res += int(m.group('mins')) * 60
1253 if m.group('hours'):
1254 res += int(m.group('hours')) * 60 * 60
1255 if m.group('ms'):
1256 res += float(m.group('ms'))
1257 return res
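# Illustrative examples (comment only):
#   parse_duration('1:30:45')   -> 5445
#   parse_duration('9min 3sec') -> 543
#   parse_duration('3.5 hours') -> 12600.0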
1258
1259
1260 def prepend_extension(filename, ext):
1261 name, real_ext = os.path.splitext(filename)
1262 return '{0}.{1}{2}'.format(name, ext, real_ext)
1263
1264
1265 def check_executable(exe, args=[]):
1266 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1267 args can be a list of arguments for a short output (like -version) """
1268 try:
1269 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1270 except OSError:
1271 return False
1272 return exe
1273
1274
1275 def get_exe_version(exe, args=['--version'],
1276 version_re=None, unrecognized='present'):
1277 """ Returns the version of the specified executable,
1278 or False if the executable is not present """
1279 try:
1280 out, _ = subprocess.Popen(
1281 [exe] + args,
1282 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1283 except OSError:
1284 return False
1285 if isinstance(out, bytes): # Python 2.x
1286 out = out.decode('ascii', 'ignore')
1287 return detect_exe_version(out, version_re, unrecognized)
1288
1289
1290 def detect_exe_version(output, version_re=None, unrecognized='present'):
1291 assert isinstance(output, compat_str)
1292 if version_re is None:
1293 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1294 m = re.search(version_re, output)
1295 if m:
1296 return m.group(1)
1297 else:
1298 return unrecognized
1299
1300
1301 class PagedList(object):
1302 def __len__(self):
1303 # This is only useful for tests
1304 return len(self.getslice())
1305
1306
1307 class OnDemandPagedList(PagedList):
1308 def __init__(self, pagefunc, pagesize):
1309 self._pagefunc = pagefunc
1310 self._pagesize = pagesize
1311
1312 def getslice(self, start=0, end=None):
1313 res = []
1314 for pagenum in itertools.count(start // self._pagesize):
1315 firstid = pagenum * self._pagesize
1316 nextfirstid = pagenum * self._pagesize + self._pagesize
1317 if start >= nextfirstid:
1318 continue
1319
1320 page_results = list(self._pagefunc(pagenum))
1321
1322 startv = (
1323 start % self._pagesize
1324 if firstid <= start < nextfirstid
1325 else 0)
1326
1327 endv = (
1328 ((end - 1) % self._pagesize) + 1
1329 if (end is not None and firstid <= end <= nextfirstid)
1330 else None)
1331
1332 if startv != 0 or endv is not None:
1333 page_results = page_results[startv:endv]
1334 res.extend(page_results)
1335
            # A little optimization - if the current page is not "full", i.e. does
            # not contain page_size videos, then we can assume that this page
            # is the last one - there are no more ids on further pages,
            # so there is no need to query again.
1340 if len(page_results) + startv < self._pagesize:
1341 break
1342
1343 # If we got the whole page, but the next page is not interesting,
1344 # break out early as well
1345 if end == nextfirstid:
1346 break
1347 return res
1348
1349
1350 class InAdvancePagedList(PagedList):
1351 def __init__(self, pagefunc, pagecount, pagesize):
1352 self._pagefunc = pagefunc
1353 self._pagecount = pagecount
1354 self._pagesize = pagesize
1355
1356 def getslice(self, start=0, end=None):
1357 res = []
1358 start_page = start // self._pagesize
1359 end_page = (
1360 self._pagecount if end is None else (end // self._pagesize + 1))
1361 skip_elems = start - start_page * self._pagesize
1362 only_more = None if end is None else end - start
1363 for pagenum in range(start_page, end_page):
1364 page = list(self._pagefunc(pagenum))
1365 if skip_elems:
1366 page = page[skip_elems:]
1367 skip_elems = None
1368 if only_more is not None:
1369 if len(page) < only_more:
1370 only_more -= len(page)
1371 else:
1372 page = page[:only_more]
1373 res.extend(page)
1374 break
1375 res.extend(page)
1376 return res
1377
1378
1379 def uppercase_escape(s):
1380 unicode_escape = codecs.getdecoder('unicode_escape')
1381 return re.sub(
1382 r'\\U[0-9a-fA-F]{8}',
1383 lambda m: unicode_escape(m.group(0))[0],
1384 s)
1385
1386
1387 def escape_rfc3986(s):
1388 """Escape non-ASCII characters as suggested by RFC 3986"""
1389 if sys.version_info < (3, 0) and isinstance(s, unicode):
1390 s = s.encode('utf-8')
1391 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1392
1393
1394 def escape_url(url):
1395 """Escape URL as suggested by RFC 3986"""
1396 url_parsed = compat_urllib_parse_urlparse(url)
1397 return url_parsed._replace(
1398 path=escape_rfc3986(url_parsed.path),
1399 params=escape_rfc3986(url_parsed.params),
1400 query=escape_rfc3986(url_parsed.query),
1401 fragment=escape_rfc3986(url_parsed.fragment)
1402 ).geturl()
1403
1404 try:
1405 struct.pack('!I', 0)
1406 except TypeError:
1407 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1408 def struct_pack(spec, *args):
1409 if isinstance(spec, compat_str):
1410 spec = spec.encode('ascii')
1411 return struct.pack(spec, *args)
1412
1413 def struct_unpack(spec, *args):
1414 if isinstance(spec, compat_str):
1415 spec = spec.encode('ascii')
1416 return struct.unpack(spec, *args)
1417 else:
1418 struct_pack = struct.pack
1419 struct_unpack = struct.unpack
1420
1421
1422 def read_batch_urls(batch_fd):
1423 def fixup(url):
1424 if not isinstance(url, compat_str):
1425 url = url.decode('utf-8', 'replace')
1426 BOM_UTF8 = '\xef\xbb\xbf'
1427 if url.startswith(BOM_UTF8):
1428 url = url[len(BOM_UTF8):]
1429 url = url.strip()
1430 if url.startswith(('#', ';', ']')):
1431 return False
1432 return url
1433
1434 with contextlib.closing(batch_fd) as fd:
1435 return [url for url in map(fixup, fd) if url]
1436
1437
1438 def urlencode_postdata(*args, **kargs):
1439 return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
1440
1441
1442 try:
1443 etree_iter = xml.etree.ElementTree.Element.iter
1444 except AttributeError: # Python <=2.6
1445 etree_iter = lambda n: n.findall('.//*')
1446
1447
1448 def parse_xml(s):
1449 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1450 def doctype(self, name, pubid, system):
1451 pass # Ignore doctypes
1452
1453 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1454 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1455 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1456 # Fix up XML parser in Python 2.x
1457 if sys.version_info < (3, 0):
1458 for n in etree_iter(tree):
1459 if n.text is not None:
1460 if not isinstance(n.text, compat_str):
1461 n.text = n.text.decode('utf-8')
1462 return tree
1463
1464
1465 US_RATINGS = {
1466 'G': 0,
1467 'PG': 10,
1468 'PG-13': 13,
1469 'R': 16,
1470 'NC': 18,
1471 }
1472
1473
1474 def parse_age_limit(s):
1475 if s is None:
1476 return None
1477 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1478 return int(m.group('age')) if m else US_RATINGS.get(s, None)
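# Illustrative examples (comment only):
#   parse_age_limit('18+')   -> 18
#   parse_age_limit('PG-13') -> 13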
1479
1480
1481 def strip_jsonp(code):
1482 return re.sub(
1483 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
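# Illustrative example (comment only):
#   strip_jsonp('callback({"status": "ok"});') -> '{"status": "ok"}'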
1484
1485
1486 def js_to_json(code):
1487 def fix_kv(m):
1488 v = m.group(0)
1489 if v in ('true', 'false', 'null'):
1490 return v
1491 if v.startswith('"'):
1492 return v
1493 if v.startswith("'"):
1494 v = v[1:-1]
1495 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1496 '\\\\': '\\\\',
1497 "\\'": "'",
1498 '"': '\\"',
1499 }[m.group(0)], v)
1500 return '"%s"' % v
1501
1502 res = re.sub(r'''(?x)
1503 "(?:[^"\\]*(?:\\\\|\\")?)*"|
1504 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1505 [a-zA-Z_][a-zA-Z_0-9]*
1506 ''', fix_kv, code)
1507 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1508 return res
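# Illustrative example (comment only):
#   js_to_json("{'abc': true, 'def': [1, 2,]}") -> '{"abc": true, "def": [1, 2]}'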
1509
1510
1511 def qualities(quality_ids):
1512 """ Get a numeric quality value out of a list of possible values """
1513 def q(qid):
1514 try:
1515 return quality_ids.index(qid)
1516 except ValueError:
1517 return -1
1518 return q
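# Illustrative usage (comment only, with made-up format identifiers):
#   q = qualities(['flv', 'sd', 'hd'])
#   q('hd') -> 2, q('sd') -> 1, q('unknown') -> -1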
1519
1520
1521 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1522
1523
1524 def limit_length(s, length):
1525 """ Add ellipses to overly long strings """
1526 if s is None:
1527 return None
1528 ELLIPSES = '...'
1529 if len(s) > length:
1530 return s[:length - len(ELLIPSES)] + ELLIPSES
1531 return s
1532
1533
1534 def version_tuple(v):
1535 return tuple(int(e) for e in re.split(r'[-.]', v))
1536
1537
1538 def is_outdated_version(version, limit, assume_new=True):
1539 if not version:
1540 return not assume_new
1541 try:
1542 return version_tuple(version) < version_tuple(limit)
1543 except ValueError:
1544 return not assume_new
1545
1546
1547 def ytdl_is_updateable():
1548 """ Returns if youtube-dl can be updated with -U """
1549 from zipimport import zipimporter
1550
1551 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
1552
1553
1554 def args_to_str(args):
1555 # Get a short string representation for a subprocess command
1556 return ' '.join(shlex_quote(a) for a in args)
1557
1558
1559 def urlhandle_detect_ext(url_handle):
1560 try:
1561 url_handle.headers
1562 getheader = lambda h: url_handle.headers[h]
1563 except AttributeError: # Python < 3
1564 getheader = url_handle.info().getheader
1565
1566 return getheader('Content-Type').split("/")[1]
1567
1568
1569 def age_restricted(content_limit, age_limit):
1570 """ Returns True iff the content should be blocked """
1571
1572 if age_limit is None: # No limit set
1573 return False
1574 if content_limit is None:
1575 return False # Content available for everyone
1576 return age_limit < content_limit