1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
4from __future__ import unicode_literals
5
6import calendar
7import codecs
8import contextlib
9import ctypes
10import datetime
11import email.utils
12import errno
13import functools
14import gzip
15import itertools
16import io
17import json
18import locale
19import math
20import os
21import pipes
22import platform
23import re
24import ssl
25import socket
26import struct
27import subprocess
28import sys
29import tempfile
30import traceback
31import xml.etree.ElementTree
32import zlib
33
34from .compat import (
35 compat_chr,
36 compat_getenv,
37 compat_html_entities,
38 compat_http_client,
39 compat_parse_qs,
40 compat_socket_create_connection,
41 compat_str,
42 compat_urllib_error,
43 compat_urllib_parse,
44 compat_urllib_parse_urlparse,
45 compat_urllib_request,
46 compat_urlparse,
47 shlex_quote,
48)
49
50
51# This is not clearly defined otherwise
52compiled_regex_type = type(re.compile(''))
53
54std_headers = {
55 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
56 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
57 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
58 'Accept-Encoding': 'gzip, deflate',
59 'Accept-Language': 'en-us,en;q=0.5',
60}
61
62
63def preferredencoding():
64 """Get preferred encoding.
65
66 Returns the best encoding scheme for the system, based on
67 locale.getpreferredencoding() and some further tweaks.
68 """
69 try:
70 pref = locale.getpreferredencoding()
71 'TEST'.encode(pref)
72 except:
73 pref = 'UTF-8'
74
75 return pref
76
77
78def write_json_file(obj, fn):
79 """ Encode obj as JSON and write it to fn, atomically if possible """
80
81 fn = encodeFilename(fn)
82 if sys.version_info < (3, 0) and sys.platform != 'win32':
83 encoding = get_filesystem_encoding()
84 # os.path.basename returns a bytes object, but NamedTemporaryFile
85 # will fail if the filename contains non-ASCII characters unless we
86 # use a unicode object
87 path_basename = lambda f: os.path.basename(f).decode(encoding)
88 # the same for os.path.dirname
89 path_dirname = lambda f: os.path.dirname(f).decode(encoding)
90 else:
91 path_basename = os.path.basename
92 path_dirname = os.path.dirname
93
94 args = {
95 'suffix': '.tmp',
96 'prefix': path_basename(fn) + '.',
97 'dir': path_dirname(fn),
98 'delete': False,
99 }
100
101 # In Python 2.x, json.dump expects a bytestream.
102 # In Python 3.x, it writes to a character stream
103 if sys.version_info < (3, 0):
104 args['mode'] = 'wb'
105 else:
106 args.update({
107 'mode': 'w',
108 'encoding': 'utf-8',
109 })
110
111 tf = tempfile.NamedTemporaryFile(**args)
112
113 try:
114 with tf:
115 json.dump(obj, tf)
116 if sys.platform == 'win32':
117 # Need to remove existing file on Windows, else os.rename raises
118 # WindowsError or FileExistsError.
119 try:
120 os.unlink(fn)
121 except OSError:
122 pass
123 os.rename(tf.name, fn)
124 except:
125 try:
126 os.remove(tf.name)
127 except OSError:
128 pass
129 raise
130
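# Illustrative usage (not part of the original module; the file name below is
# made up):
#
#     write_json_file({'id': 'abc', 'title': 'Example'}, 'example.info.json')
#
# The data is first written to a temporary '<name>.<random>.tmp' file in the
# same directory and then renamed into place, so an interrupted write cannot
# leave a truncated JSON file behind.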
131
132if sys.version_info >= (2, 7):
133 def find_xpath_attr(node, xpath, key, val):
134 """ Find the xpath xpath[@key=val] """
135 assert re.match(r'^[a-zA-Z-]+$', key)
136 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
137 expr = xpath + "[@%s='%s']" % (key, val)
138 return node.find(expr)
139else:
140 def find_xpath_attr(node, xpath, key, val):
141 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
142 # .//node does not match if a node is a direct child of . !
143 if isinstance(xpath, unicode):
144 xpath = xpath.encode('ascii')
145
146 for f in node.findall(xpath):
147 if f.attrib.get(key) == val:
148 return f
149 return None
150
151# On python2.6 the xml.etree.ElementTree.Element methods don't support
152# the namespace parameter
153
154
155def xpath_with_ns(path, ns_map):
156 components = [c.split(':') for c in path.split('/')]
157 replaced = []
158 for c in components:
159 if len(c) == 1:
160 replaced.append(c[0])
161 else:
162 ns, tag = c
163 replaced.append('{%s}%s' % (ns_map[ns], tag))
164 return '/'.join(replaced)
165
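# Example sketch (illustrative, with a made-up namespace map): the function
# only rewrites 'prefix:tag' path components, it does not touch the tree.
#
#     _x = lambda p: xpath_with_ns(p, {'media': 'http://search.yahoo.com/mrss/'})
#     _x('media:group/media:title')
#     # -> '{http://search.yahoo.com/mrss/}group/{http://search.yahoo.com/mrss/}title'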
166
167def xpath_text(node, xpath, name=None, fatal=False):
168 if sys.version_info < (2, 7): # Crazy 2.6
169 xpath = xpath.encode('ascii')
170
171 n = node.find(xpath)
172 if n is None or n.text is None:
173 if fatal:
174 name = xpath if name is None else name
175 raise ExtractorError('Could not find XML element %s' % name)
176 else:
177 return None
178 return n.text
179
180
181def get_element_by_id(id, html):
182 """Return the content of the tag with the specified ID in the passed HTML document"""
183 return get_element_by_attribute("id", id, html)
184
185
186def get_element_by_attribute(attribute, value, html):
187 """Return the content of the tag with the specified attribute in the passed HTML document"""
188
189 m = re.search(r'''(?xs)
190 <([a-zA-Z0-9:._-]+)
191 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
192 \s+%s=['"]?%s['"]?
193 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
194 \s*>
195 (?P<content>.*?)
196 </\1>
197 ''' % (re.escape(attribute), re.escape(value)), html)
198
199 if not m:
200 return None
201 res = m.group('content')
202
203 if res.startswith('"') or res.startswith("'"):
204 res = res[1:-1]
205
206 return unescapeHTML(res)
207
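# Example (illustrative HTML snippet): both helpers return the tag's inner
# content, already run through unescapeHTML, or None if nothing matches.
#
#     get_element_by_id('description', '<p id="description">A &amp; B</p>')
#     # -> 'A & B'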
208
209def clean_html(html):
210 """Clean an HTML snippet into a readable string"""
211
212 if html is None: # Convenience for sanitizing descriptions etc.
213 return html
214
215 # Newline vs <br />
216 html = html.replace('\n', ' ')
217 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
218 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
219 # Strip html tags
220 html = re.sub('<.*?>', '', html)
221 # Replace html entities
222 html = unescapeHTML(html)
223 return html.strip()
224
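# Example (illustrative input): <br> tags become newlines, remaining markup is
# stripped and entities are decoded.
#
#     clean_html('Line one<br/>Line two &amp; <b>more</b>')
#     # -> 'Line one\nLine two & more'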
225
226def sanitize_open(filename, open_mode):
227 """Try to open the given filename, and slightly tweak it if this fails.
228
229 Attempts to open the given filename. If this fails, it tries to change
230 the filename slightly, step by step, until it's either able to open it
231 or it fails and raises a final exception, like the standard open()
232 function.
233
234 It returns the tuple (stream, definitive_file_name).
235 """
236 try:
237 if filename == '-':
238 if sys.platform == 'win32':
239 import msvcrt
240 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
241 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
242 stream = open(encodeFilename(filename), open_mode)
243 return (stream, filename)
244 except (IOError, OSError) as err:
245 if err.errno in (errno.EACCES,):
246 raise
247
248 # In case of error, try to remove win32 forbidden chars
249 alt_filename = os.path.join(*(
250 re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
251 for path_part in os.path.split(filename)
252 ))
253 if alt_filename == filename:
254 raise
255 else:
256 # An exception here should be caught in the caller
257 stream = open(encodeFilename(filename), open_mode)
258 return (stream, alt_filename)
259
260
261def timeconvert(timestr):
262 """Convert RFC 2822 defined time string into system timestamp"""
263 timestamp = None
264 timetuple = email.utils.parsedate_tz(timestr)
265 if timetuple is not None:
266 timestamp = email.utils.mktime_tz(timetuple)
267 return timestamp
268
269
270def sanitize_filename(s, restricted=False, is_id=False):
271 """Sanitizes a string so it could be used as part of a filename.
272 If restricted is set, use a stricter subset of allowed characters.
273 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
274 """
275 def replace_insane(char):
276 if char == '?' or ord(char) < 32 or ord(char) == 127:
277 return ''
278 elif char == '"':
279 return '' if restricted else '\''
280 elif char == ':':
281 return '_-' if restricted else ' -'
282 elif char in '\\/|*<>':
283 return '_'
284 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
285 return '_'
286 if restricted and ord(char) > 127:
287 return '_'
288 return char
289
290 # Handle timestamps
291 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
292 result = ''.join(map(replace_insane, s))
293 if not is_id:
294 while '__' in result:
295 result = result.replace('__', '_')
296 result = result.strip('_')
297 # Common case of "Foreign band name - English song title"
298 if restricted and result.startswith('-_'):
299 result = result[2:]
300 if not result:
301 result = '_'
302 return result
303
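# Illustrative examples (made-up titles). In the default mode only characters
# that are unsafe on common filesystems are touched; in restricted mode the
# result is plain ASCII with underscores:
#
#     sanitize_filename('Brüno: 12:30')                   # -> 'Brüno - 12_30'
#     sanitize_filename('Brüno: 12:30', restricted=True)  # -> 'Br_no_-_12_30'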
304
305def orderedSet(iterable):
306 """ Remove all duplicates from the input iterable """
307 res = []
308 for el in iterable:
309 if el not in res:
310 res.append(el)
311 return res
312
313
314def _htmlentity_transform(entity):
315 """Transforms an HTML entity to a character."""
316 # Known non-numeric HTML entity
317 if entity in compat_html_entities.name2codepoint:
318 return compat_chr(compat_html_entities.name2codepoint[entity])
319
320 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
321 if mobj is not None:
322 numstr = mobj.group(1)
323 if numstr.startswith('x'):
324 base = 16
325 numstr = '0%s' % numstr
326 else:
327 base = 10
328 return compat_chr(int(numstr, base))
329
330 # Unknown entity in name, return its literal representation
331 return ('&%s;' % entity)
332
333
334def unescapeHTML(s):
335 if s is None:
336 return None
337 assert type(s) == compat_str
338
339 return re.sub(
340 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
341
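# Example (illustrative strings): named, decimal and hexadecimal entities are
# all resolved; unknown entities are left as-is.
#
#     unescapeHTML('Tom &amp; Jerry &#39;08 &#x2F; trailer')
#     # -> "Tom & Jerry '08 / trailer"
#     unescapeHTML('&bogus;')   # -> '&bogus;'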
342
343def encodeFilename(s, for_subprocess=False):
344 """
345 @param s The name of the file
346 """
347
348 assert type(s) == compat_str
349
350 # Python 3 has a Unicode API
351 if sys.version_info >= (3, 0):
352 return s
353
354 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
355 # Pass '' directly to use Unicode APIs on Windows 2000 and up
356 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
357 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
358 if not for_subprocess:
359 return s
360 else:
361 # For subprocess calls, encode with locale encoding
362 # Refer to http://stackoverflow.com/a/9951851/35070
363 encoding = preferredencoding()
364 else:
365 encoding = sys.getfilesystemencoding()
366 if encoding is None:
367 encoding = 'utf-8'
368 return s.encode(encoding, 'ignore')
369
370
371def encodeArgument(s):
372 if not isinstance(s, compat_str):
373 # Legacy code that uses byte strings
374 # Uncomment the following line after fixing all post processors
375 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
376 s = s.decode('ascii')
377 return encodeFilename(s, True)
378
379
380def decodeOption(optval):
381 if optval is None:
382 return optval
383 if isinstance(optval, bytes):
384 optval = optval.decode(preferredencoding())
385
386 assert isinstance(optval, compat_str)
387 return optval
388
389
390def formatSeconds(secs):
391 if secs > 3600:
392 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
393 elif secs > 60:
394 return '%d:%02d' % (secs // 60, secs % 60)
395 else:
396 return '%d' % secs
397
398
399def make_HTTPS_handler(params, **kwargs):
400 opts_no_check_certificate = params.get('nocheckcertificate', False)
401 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
402 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
403 if opts_no_check_certificate:
404 context.check_hostname = False
405 context.verify_mode = ssl.CERT_NONE
406 try:
407 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
408 except TypeError:
409 # Python 2.7.8
410 # (create_default_context present but HTTPSHandler has no context=)
411 pass
412
413 if sys.version_info < (3, 2):
414 return YoutubeDLHTTPSHandler(params, **kwargs)
415 else: # Python < 3.4
416 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
417 context.verify_mode = (ssl.CERT_NONE
418 if opts_no_check_certificate
419 else ssl.CERT_REQUIRED)
420 context.set_default_verify_paths()
421 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
422
423
424class ExtractorError(Exception):
425 """Error during info extraction."""
426
427 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
428 """ tb, if given, is the original traceback (so that it can be printed out).
429 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
430 """
431
432 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
433 expected = True
434 if video_id is not None:
435 msg = video_id + ': ' + msg
436 if cause:
437 msg += ' (caused by %r)' % cause
438 if not expected:
439 if ytdl_is_updateable():
440 update_cmd = 'type youtube-dl -U to update'
441 else:
442 update_cmd = 'see https://yt-dl.org/update on how to update'
443 msg += '; please report this issue on https://yt-dl.org/bug .'
444 msg += ' Make sure you are using the latest version; %s.' % update_cmd
445 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
446 super(ExtractorError, self).__init__(msg)
447
448 self.traceback = tb
449 self.exc_info = sys.exc_info() # preserve original exception
450 self.cause = cause
451 self.video_id = video_id
452
453 def format_traceback(self):
454 if self.traceback is None:
455 return None
456 return ''.join(traceback.format_tb(self.traceback))
457
458
459class UnsupportedError(ExtractorError):
460 def __init__(self, url):
461 super(UnsupportedError, self).__init__(
462 'Unsupported URL: %s' % url, expected=True)
463 self.url = url
464
465
466class RegexNotFoundError(ExtractorError):
467 """Error when a regex didn't match"""
468 pass
469
470
471class DownloadError(Exception):
472 """Download Error exception.
473
474 This exception may be thrown by FileDownloader objects if they are not
475 configured to continue on errors. They will contain the appropriate
476 error message.
477 """
478
479 def __init__(self, msg, exc_info=None):
480 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
481 super(DownloadError, self).__init__(msg)
482 self.exc_info = exc_info
483
484
485class SameFileError(Exception):
486 """Same File exception.
487
488 This exception will be thrown by FileDownloader objects if they detect
489 multiple files would have to be downloaded to the same file on disk.
490 """
491 pass
492
493
494class PostProcessingError(Exception):
495 """Post Processing exception.
496
497 This exception may be raised by PostProcessor's .run() method to
498 indicate an error in the postprocessing task.
499 """
500
501 def __init__(self, msg):
502 self.msg = msg
503
504
505class MaxDownloadsReached(Exception):
506 """ --max-downloads limit has been reached. """
507 pass
508
509
510class UnavailableVideoError(Exception):
511 """Unavailable Format exception.
512
513 This exception will be thrown when a video is requested
514 in a format that is not available for that video.
515 """
516 pass
517
518
519class ContentTooShortError(Exception):
520 """Content Too Short exception.
521
522 This exception may be raised by FileDownloader objects when a file they
523 download is too small for what the server announced first, indicating
524 the connection was probably interrupted.
525 """
526 # Both in bytes
527 downloaded = None
528 expected = None
529
530 def __init__(self, downloaded, expected):
531 self.downloaded = downloaded
532 self.expected = expected
533
534
535def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
536 hc = http_class(*args, **kwargs)
537 source_address = ydl_handler._params.get('source_address')
538 if source_address is not None:
539 sa = (source_address, 0)
540 if hasattr(hc, 'source_address'): # Python 2.7+
541 hc.source_address = sa
542 else: # Python 2.6
543 def _hc_connect(self, *args, **kwargs):
544 sock = compat_socket_create_connection(
545 (self.host, self.port), self.timeout, sa)
546 if is_https:
547 self.sock = ssl.wrap_socket(
548 sock, self.key_file, self.cert_file,
549 ssl_version=ssl.PROTOCOL_TLSv1)
550 else:
551 self.sock = sock
552 hc.connect = functools.partial(_hc_connect, hc)
553
554 return hc
555
556
557class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
558 """Handler for HTTP requests and responses.
559
560 This class, when installed with an OpenerDirector, automatically adds
561 the standard headers to every HTTP request and handles gzipped and
562 deflated responses from web servers. If compression is to be avoided in
563 a particular request, the original request in the program code only has
564 to include the HTTP header "Youtubedl-No-Compression", which will be
565 removed before making the real request.
566
567 Part of this code was copied from:
568
569 http://techknack.net/python-urllib2-handlers/
570
571 Andrew Rowls, the author of that code, agreed to release it to the
572 public domain.
573 """
574
575 def __init__(self, params, *args, **kwargs):
576 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
577 self._params = params
578
579 def http_open(self, req):
580 return self.do_open(functools.partial(
581 _create_http_connection, self, compat_http_client.HTTPConnection, False),
582 req)
583
584 @staticmethod
585 def deflate(data):
586 try:
587 return zlib.decompress(data, -zlib.MAX_WBITS)
588 except zlib.error:
589 return zlib.decompress(data)
590
591 @staticmethod
592 def addinfourl_wrapper(stream, headers, url, code):
593 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
594 return compat_urllib_request.addinfourl(stream, headers, url, code)
595 ret = compat_urllib_request.addinfourl(stream, headers, url)
596 ret.code = code
597 return ret
598
599 def http_request(self, req):
600 for h, v in std_headers.items():
601 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
602 # The dict keys are capitalized by urllib because of this bug
603 if h.capitalize() not in req.headers:
604 req.add_header(h, v)
605 if 'Youtubedl-no-compression' in req.headers:
606 if 'Accept-encoding' in req.headers:
607 del req.headers['Accept-encoding']
608 del req.headers['Youtubedl-no-compression']
609 if 'Youtubedl-user-agent' in req.headers:
610 if 'User-agent' in req.headers:
611 del req.headers['User-agent']
612 req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
613 del req.headers['Youtubedl-user-agent']
614
615 if sys.version_info < (2, 7) and '#' in req.get_full_url():
616 # Python 2.6 is brain-dead when it comes to fragments
617 req._Request__original = req._Request__original.partition('#')[0]
618 req._Request__r_type = req._Request__r_type.partition('#')[0]
619
620 return req
621
622 def http_response(self, req, resp):
623 old_resp = resp
624 # gzip
625 if resp.headers.get('Content-encoding', '') == 'gzip':
626 content = resp.read()
627 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
628 try:
629 uncompressed = io.BytesIO(gz.read())
630 except IOError as original_ioerror:
631 # There may be junk at the end of the file
632 # See http://stackoverflow.com/q/4928560/35070 for details
633 for i in range(1, 1024):
634 try:
635 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
636 uncompressed = io.BytesIO(gz.read())
637 except IOError:
638 continue
639 break
640 else:
641 raise original_ioerror
642 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
643 resp.msg = old_resp.msg
644 # deflate
645 if resp.headers.get('Content-encoding', '') == 'deflate':
646 gz = io.BytesIO(self.deflate(resp.read()))
647 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
648 resp.msg = old_resp.msg
649 return resp
650
651 https_request = http_request
652 https_response = http_response
653
654
655class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
656 def __init__(self, params, https_conn_class=None, *args, **kwargs):
657 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
658 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
659 self._params = params
660
661 def https_open(self, req):
662 return self.do_open(functools.partial(
663 _create_http_connection, self, self._https_conn_class, True),
664 req)
665
666
667def parse_iso8601(date_str, delimiter='T'):
668 """ Return a UNIX timestamp from the given date """
669
670 if date_str is None:
671 return None
672
673 m = re.search(
674 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
675 date_str)
676 if not m:
677 timezone = datetime.timedelta()
678 else:
679 date_str = date_str[:-len(m.group(0))]
680 if not m.group('sign'):
681 timezone = datetime.timedelta()
682 else:
683 sign = 1 if m.group('sign') == '+' else -1
684 timezone = datetime.timedelta(
685 hours=sign * int(m.group('hours')),
686 minutes=sign * int(m.group('minutes')))
687 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
688 dt = datetime.datetime.strptime(date_str, date_format) - timezone
689 return calendar.timegm(dt.timetuple())
690
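# Example (illustrative timestamps): the optional timezone offset is applied
# before converting to a UTC UNIX timestamp.
#
#     parse_iso8601('2014-12-07T13:30:00+0100')   # -> 1417955400
#     parse_iso8601('2014-12-07T12:30:00Z')       # -> 1417955400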
691
692def unified_strdate(date_str, day_first=True):
693 """Return a string with the date in the format YYYYMMDD"""
694
695 if date_str is None:
696 return None
697 upload_date = None
698 # Replace commas
699 date_str = date_str.replace(',', ' ')
700 # %z (UTC offset) is only supported in python>=3.2
701 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
702 # Remove AM/PM + timezone
703 date_str = re.sub(r'(?i)\s*(?:AM|PM)\s+[A-Z]+', '', date_str)
704
705 format_expressions = [
706 '%d %B %Y',
707 '%d %b %Y',
708 '%B %d %Y',
709 '%b %d %Y',
710 '%b %dst %Y %I:%M%p',
711 '%b %dnd %Y %I:%M%p',
712 '%b %dth %Y %I:%M%p',
713 '%Y %m %d',
714 '%Y-%m-%d',
715 '%Y/%m/%d',
716 '%Y/%m/%d %H:%M:%S',
717 '%Y-%m-%d %H:%M:%S',
718 '%Y-%m-%d %H:%M:%S.%f',
719 '%d.%m.%Y %H:%M',
720 '%d.%m.%Y %H.%M',
721 '%Y-%m-%dT%H:%M:%SZ',
722 '%Y-%m-%dT%H:%M:%S.%fZ',
723 '%Y-%m-%dT%H:%M:%S.%f0Z',
724 '%Y-%m-%dT%H:%M:%S',
725 '%Y-%m-%dT%H:%M:%S.%f',
726 '%Y-%m-%dT%H:%M',
727 ]
728 if day_first:
729 format_expressions.extend([
730 '%d.%m.%Y',
731 '%d/%m/%Y',
732 '%d/%m/%y',
733 '%d/%m/%Y %H:%M:%S',
734 ])
735 else:
736 format_expressions.extend([
737 '%m.%d.%Y',
738 '%m/%d/%Y',
739 '%m/%d/%y',
740 '%m/%d/%Y %H:%M:%S',
741 ])
742 for expression in format_expressions:
743 try:
744 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
745 except ValueError:
746 pass
747 if upload_date is None:
748 timetuple = email.utils.parsedate_tz(date_str)
749 if timetuple:
750 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
751 return upload_date
752
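# Examples (illustrative dates): the day_first flag decides how ambiguous
# numeric dates are read.
#
#     unified_strdate('December 21, 2010')          # -> '20101221'
#     unified_strdate('8/7/2009')                   # -> '20090708'
#     unified_strdate('8/7/2009', day_first=False)  # -> '20090807'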
753
754def determine_ext(url, default_ext='unknown_video'):
755 if url is None:
756 return default_ext
757 guess = url.partition('?')[0].rpartition('.')[2]
758 if re.match(r'^[A-Za-z0-9]+$', guess):
759 return guess
760 else:
761 return default_ext
762
763
764def subtitles_filename(filename, sub_lang, sub_format):
765 return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
766
767
768def date_from_str(date_str):
769 """
770 Return a datetime object from a string in the format YYYYMMDD or
771 (now|today)[+-][0-9](day|week|month|year)(s)?"""
772 today = datetime.date.today()
773 if date_str in ('now', 'today'):
774 return today
775 if date_str == 'yesterday':
776 return today - datetime.timedelta(days=1)
777 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
778 if match is not None:
779 sign = match.group('sign')
780 time = int(match.group('time'))
781 if sign == '-':
782 time = -time
783 unit = match.group('unit')
784 # A bad approximation?
785 if unit == 'month':
786 unit = 'day'
787 time *= 30
788 elif unit == 'year':
789 unit = 'day'
790 time *= 365
791 unit += 's'
792 delta = datetime.timedelta(**{unit: time})
793 return today + delta
794 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
795
796
797def hyphenate_date(date_str):
798 """
799 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
800 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
801 if match is not None:
802 return '-'.join(match.groups())
803 else:
804 return date_str
805
806
807class DateRange(object):
808 """Represents a time interval between two dates"""
809
810 def __init__(self, start=None, end=None):
811 """start and end must be strings in the format accepted by date"""
812 if start is not None:
813 self.start = date_from_str(start)
814 else:
815 self.start = datetime.datetime.min.date()
816 if end is not None:
817 self.end = date_from_str(end)
818 else:
819 self.end = datetime.datetime.max.date()
820 if self.start > self.end:
821 raise ValueError('Date range "%s": the start date must be before the end date' % self)
822
823 @classmethod
824 def day(cls, day):
825 """Returns a range that only contains the given day"""
826 return cls(day, day)
827
828 def __contains__(self, date):
829 """Check if the date is in the range"""
830 if not isinstance(date, datetime.date):
831 date = date_from_str(date)
832 return self.start <= date <= self.end
833
834 def __str__(self):
835 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
836
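# Example (illustrative): relative dates and ranges as used by the --date,
# --dateafter and --datebefore options.
#
#     date_from_str('now-1week')   # -> today's date minus 7 days
#     '20141215' in DateRange('20141201', '20141231')   # -> True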
837
838def platform_name():
839 """ Returns the platform name as a compat_str """
840 res = platform.platform()
841 if isinstance(res, bytes):
842 res = res.decode(preferredencoding())
843
844 assert isinstance(res, compat_str)
845 return res
846
847
848def _windows_write_string(s, out):
849 """ Returns True if the string was written using special methods,
850 False if it has yet to be written out."""
851 # Adapted from http://stackoverflow.com/a/3259271/35070
852
853 import ctypes
854 import ctypes.wintypes
855
856 WIN_OUTPUT_IDS = {
857 1: -11,
858 2: -12,
859 }
860
861 try:
862 fileno = out.fileno()
863 except AttributeError:
864 # If the output stream doesn't have a fileno, it's virtual
865 return False
866 except io.UnsupportedOperation:
867 # Some strange Windows pseudo files?
868 return False
869 if fileno not in WIN_OUTPUT_IDS:
870 return False
871
872 GetStdHandle = ctypes.WINFUNCTYPE(
873 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
874 (b"GetStdHandle", ctypes.windll.kernel32))
875 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
876
877 WriteConsoleW = ctypes.WINFUNCTYPE(
878 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
879 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
880 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
881 written = ctypes.wintypes.DWORD(0)
882
883 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
884 FILE_TYPE_CHAR = 0x0002
885 FILE_TYPE_REMOTE = 0x8000
886 GetConsoleMode = ctypes.WINFUNCTYPE(
887 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
888 ctypes.POINTER(ctypes.wintypes.DWORD))(
889 (b"GetConsoleMode", ctypes.windll.kernel32))
890 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
891
892 def not_a_console(handle):
893 if handle == INVALID_HANDLE_VALUE or handle is None:
894 return True
895 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
896 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
897
898 if not_a_console(h):
899 return False
900
901 def next_nonbmp_pos(s):
902 try:
903 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
904 except StopIteration:
905 return len(s)
906
907 while s:
908 count = min(next_nonbmp_pos(s), 1024)
909
910 ret = WriteConsoleW(
911 h, s, count if count else 2, ctypes.byref(written), None)
912 if ret == 0:
913 raise OSError('Failed to write string')
914 if not count: # We just wrote a non-BMP character
915 assert written.value == 2
916 s = s[1:]
917 else:
918 assert written.value > 0
919 s = s[written.value:]
920 return True
921
922
923def write_string(s, out=None, encoding=None):
924 if out is None:
925 out = sys.stderr
926 assert type(s) == compat_str
927
928 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
929 if _windows_write_string(s, out):
930 return
931
932 if ('b' in getattr(out, 'mode', '') or
933 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
934 byt = s.encode(encoding or preferredencoding(), 'ignore')
935 out.write(byt)
936 elif hasattr(out, 'buffer'):
937 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
938 byt = s.encode(enc, 'ignore')
939 out.buffer.write(byt)
940 else:
941 out.write(s)
942 out.flush()
943
944
945def bytes_to_intlist(bs):
946 if not bs:
947 return []
948 if isinstance(bs[0], int): # Python 3
949 return list(bs)
950 else:
951 return [ord(c) for c in bs]
952
953
954def intlist_to_bytes(xs):
955 if not xs:
956 return b''
957 return struct_pack('%dB' % len(xs), *xs)
958
959
960# Cross-platform file locking
961if sys.platform == 'win32':
962 import ctypes.wintypes
963 import msvcrt
964
965 class OVERLAPPED(ctypes.Structure):
966 _fields_ = [
967 ('Internal', ctypes.wintypes.LPVOID),
968 ('InternalHigh', ctypes.wintypes.LPVOID),
969 ('Offset', ctypes.wintypes.DWORD),
970 ('OffsetHigh', ctypes.wintypes.DWORD),
971 ('hEvent', ctypes.wintypes.HANDLE),
972 ]
973
974 kernel32 = ctypes.windll.kernel32
975 LockFileEx = kernel32.LockFileEx
976 LockFileEx.argtypes = [
977 ctypes.wintypes.HANDLE, # hFile
978 ctypes.wintypes.DWORD, # dwFlags
979 ctypes.wintypes.DWORD, # dwReserved
980 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
981 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
982 ctypes.POINTER(OVERLAPPED) # Overlapped
983 ]
984 LockFileEx.restype = ctypes.wintypes.BOOL
985 UnlockFileEx = kernel32.UnlockFileEx
986 UnlockFileEx.argtypes = [
987 ctypes.wintypes.HANDLE, # hFile
988 ctypes.wintypes.DWORD, # dwReserved
989 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
990 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
991 ctypes.POINTER(OVERLAPPED) # Overlapped
992 ]
993 UnlockFileEx.restype = ctypes.wintypes.BOOL
994 whole_low = 0xffffffff
995 whole_high = 0x7fffffff
996
997 def _lock_file(f, exclusive):
998 overlapped = OVERLAPPED()
999 overlapped.Offset = 0
1000 overlapped.OffsetHigh = 0
1001 overlapped.hEvent = 0
1002 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1003 handle = msvcrt.get_osfhandle(f.fileno())
1004 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1005 whole_low, whole_high, f._lock_file_overlapped_p):
1006 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1007
1008 def _unlock_file(f):
1009 assert f._lock_file_overlapped_p
1010 handle = msvcrt.get_osfhandle(f.fileno())
1011 if not UnlockFileEx(handle, 0,
1012 whole_low, whole_high, f._lock_file_overlapped_p):
1013 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1014
1015else:
1016 import fcntl
1017
1018 def _lock_file(f, exclusive):
1019 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1020
1021 def _unlock_file(f):
1022 fcntl.flock(f, fcntl.LOCK_UN)
1023
1024
1025class locked_file(object):
1026 def __init__(self, filename, mode, encoding=None):
1027 assert mode in ['r', 'a', 'w']
1028 self.f = io.open(filename, mode, encoding=encoding)
1029 self.mode = mode
1030
1031 def __enter__(self):
1032 exclusive = self.mode != 'r'
1033 try:
1034 _lock_file(self.f, exclusive)
1035 except IOError:
1036 self.f.close()
1037 raise
1038 return self
1039
1040 def __exit__(self, etype, value, traceback):
1041 try:
1042 _unlock_file(self.f)
1043 finally:
1044 self.f.close()
1045
1046 def __iter__(self):
1047 return iter(self.f)
1048
1049 def write(self, *args):
1050 return self.f.write(*args)
1051
1052 def read(self, *args):
1053 return self.f.read(*args)
1054
1055
1056def get_filesystem_encoding():
1057 encoding = sys.getfilesystemencoding()
1058 return encoding if encoding is not None else 'utf-8'
1059
1060
1061def shell_quote(args):
1062 quoted_args = []
1063 encoding = get_filesystem_encoding()
1064 for a in args:
1065 if isinstance(a, bytes):
1066 # We may get a filename encoded with 'encodeFilename'
1067 a = a.decode(encoding)
1068 quoted_args.append(pipes.quote(a))
1069 return ' '.join(quoted_args)
1070
1071
1072def takewhile_inclusive(pred, seq):
1073 """ Like itertools.takewhile, but include the latest evaluated element
1074 (i.e. the first element for which pred(e) is false) """
1075 for e in seq:
1076 yield e
1077 if not pred(e):
1078 return
1079
1080
1081def smuggle_url(url, data):
1082 """ Pass additional data in a URL for internal use. """
1083
1084 sdata = compat_urllib_parse.urlencode(
1085 {'__youtubedl_smuggle': json.dumps(data)})
1086 return url + '#' + sdata
1087
1088
1089def unsmuggle_url(smug_url, default=None):
1090 if '#__youtubedl_smuggle' not in smug_url:
1091 return smug_url, default
1092 url, _, sdata = smug_url.rpartition('#')
1093 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1094 data = json.loads(jsond)
1095 return url, data
1096
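# Example (illustrative payload key): the data survives a round trip through
# the URL fragment and the original URL is recovered unchanged.
#
#     url = smuggle_url('http://example.com/video', {'force_videoid': '42'})
#     unsmuggle_url(url)   # -> ('http://example.com/video', {'force_videoid': '42'})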
1097
1098def format_bytes(bytes):
1099 if bytes is None:
1100 return 'N/A'
1101 if type(bytes) is str:
1102 bytes = float(bytes)
1103 if bytes == 0.0:
1104 exponent = 0
1105 else:
1106 exponent = int(math.log(bytes, 1024.0))
1107 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1108 converted = float(bytes) / float(1024 ** exponent)
1109 return '%.2f%s' % (converted, suffix)
1110
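# Example values (illustrative): sizes are rendered with binary (1024-based)
# units and two decimals.
#
#     format_bytes(0)        # -> '0.00B'
#     format_bytes(1536)     # -> '1.50KiB'
#     format_bytes(10 ** 9)  # -> '953.67MiB'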
1111
1112def parse_filesize(s):
1113 if s is None:
1114 return None
1115
1116 # The lower-case forms are of course incorrect and unofficial,
1117 # but we support those too
1118 _UNIT_TABLE = {
1119 'B': 1,
1120 'b': 1,
1121 'KiB': 1024,
1122 'KB': 1000,
1123 'kB': 1024,
1124 'Kb': 1000,
1125 'MiB': 1024 ** 2,
1126 'MB': 1000 ** 2,
1127 'mB': 1024 ** 2,
1128 'Mb': 1000 ** 2,
1129 'GiB': 1024 ** 3,
1130 'GB': 1000 ** 3,
1131 'gB': 1024 ** 3,
1132 'Gb': 1000 ** 3,
1133 'TiB': 1024 ** 4,
1134 'TB': 1000 ** 4,
1135 'tB': 1024 ** 4,
1136 'Tb': 1000 ** 4,
1137 'PiB': 1024 ** 5,
1138 'PB': 1000 ** 5,
1139 'pB': 1024 ** 5,
1140 'Pb': 1000 ** 5,
1141 'EiB': 1024 ** 6,
1142 'EB': 1000 ** 6,
1143 'eB': 1024 ** 6,
1144 'Eb': 1000 ** 6,
1145 'ZiB': 1024 ** 7,
1146 'ZB': 1000 ** 7,
1147 'zB': 1024 ** 7,
1148 'Zb': 1000 ** 7,
1149 'YiB': 1024 ** 8,
1150 'YB': 1000 ** 8,
1151 'yB': 1024 ** 8,
1152 'Yb': 1000 ** 8,
1153 }
1154
1155 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1156 m = re.match(
1157 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1158 if not m:
1159 return None
1160
1161 num_str = m.group('num').replace(',', '.')
1162 mult = _UNIT_TABLE[m.group('unit')]
1163 return int(float(num_str) * mult)
1164
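# Example values (illustrative): decimal units use powers of 1000, binary
# units powers of 1024; a comma is accepted as decimal separator.
#
#     parse_filesize('20 MB')    # -> 20000000
#     parse_filesize('5 KiB')    # -> 5120
#     parse_filesize('1,5 GB')   # -> 1500000000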
1165
1166def get_term_width():
1167 columns = compat_getenv('COLUMNS', None)
1168 if columns:
1169 return int(columns)
1170
1171 try:
1172 sp = subprocess.Popen(
1173 ['stty', 'size'],
1174 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1175 out, err = sp.communicate()
1176 return int(out.split()[1])
1177 except:
1178 pass
1179 return None
1180
1181
1182def month_by_name(name):
1183 """ Return the number of a month by (locale-independently) English name """
1184
1185 ENGLISH_NAMES = [
1186 'January', 'February', 'March', 'April', 'May', 'June',
1187 'July', 'August', 'September', 'October', 'November', 'December']
1188 try:
1189 return ENGLISH_NAMES.index(name) + 1
1190 except ValueError:
1191 return None
1192
1193
1194def fix_xml_ampersands(xml_str):
1195 """Replace all the '&' by '&amp;' in XML"""
1196 return re.sub(
1197 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1198 '&amp;',
1199 xml_str)
1200
1201
1202def setproctitle(title):
1203 assert isinstance(title, compat_str)
1204 try:
1205 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1206 except OSError:
1207 return
1208 title_bytes = title.encode('utf-8')
1209 buf = ctypes.create_string_buffer(len(title_bytes))
1210 buf.value = title_bytes
1211 try:
1212 libc.prctl(15, buf, 0, 0, 0)
1213 except AttributeError:
1214 return # Strange libc, just skip this
1215
1216
1217def remove_start(s, start):
1218 if s.startswith(start):
1219 return s[len(start):]
1220 return s
1221
1222
1223def remove_end(s, end):
1224 if s.endswith(end):
1225 return s[:-len(end)]
1226 return s
1227
1228
1229def url_basename(url):
1230 path = compat_urlparse.urlparse(url).path
1231 return path.strip('/').split('/')[-1]
1232
1233
1234class HEADRequest(compat_urllib_request.Request):
1235 def get_method(self):
1236 return "HEAD"
1237
1238
1239def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1240 if get_attr:
1241 if v is not None:
1242 v = getattr(v, get_attr, None)
1243 if v == '':
1244 v = None
1245 return default if v is None else (int(v) * invscale // scale)
1246
1247
1248def str_or_none(v, default=None):
1249 return default if v is None else compat_str(v)
1250
1251
1252def str_to_int(int_str):
1253 """ A more relaxed version of int_or_none """
1254 if int_str is None:
1255 return None
1256 int_str = re.sub(r'[,\.\+]', '', int_str)
1257 return int(int_str)
1258
1259
1260def float_or_none(v, scale=1, invscale=1, default=None):
1261 return default if v is None else (float(v) * invscale / scale)
1262
1263
1264def parse_duration(s):
1265 if not isinstance(s, basestring if sys.version_info < (3, 0) else compat_str):
1266 return None
1267
1268 s = s.strip()
1269
1270 m = re.match(
1271 r'''(?ix)(?:P?T)?
1272 (?:
1273 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1274 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1275
1276 (?:
1277 (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
1278 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1279 )?
1280 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1281 )$''', s)
1282 if not m:
1283 return None
1284 res = 0
1285 if m.group('only_mins'):
1286 return float_or_none(m.group('only_mins'), invscale=60)
1287 if m.group('only_hours'):
1288 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1289 if m.group('secs'):
1290 res += int(m.group('secs'))
1291 if m.group('mins'):
1292 res += int(m.group('mins')) * 60
1293 if m.group('hours'):
1294 res += int(m.group('hours')) * 60 * 60
1295 if m.group('ms'):
1296 res += float(m.group('ms'))
1297 return res
1298
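# Example values (illustrative): both colon-separated and spelled-out
# durations are understood; the result is in seconds.
#
#     parse_duration('1:30:05')     # -> 5405
#     parse_duration('9 minutes')   # -> 540.0
#     parse_duration('3 min 10s')   # -> 190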
1299
1300def prepend_extension(filename, ext):
1301 name, real_ext = os.path.splitext(filename)
1302 return '{0}.{1}{2}'.format(name, ext, real_ext)
1303
1304
1305def check_executable(exe, args=[]):
1306 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1307 args can be a list of arguments for a short output (like -version) """
1308 try:
1309 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1310 except OSError:
1311 return False
1312 return exe
1313
1314
1315def get_exe_version(exe, args=['--version'],
1316 version_re=None, unrecognized='present'):
1317 """ Returns the version of the specified executable,
1318 or False if the executable is not present """
1319 try:
1320 out, _ = subprocess.Popen(
1321 [exe] + args,
1322 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1323 except OSError:
1324 return False
1325 if isinstance(out, bytes): # Python 2.x
1326 out = out.decode('ascii', 'ignore')
1327 return detect_exe_version(out, version_re, unrecognized)
1328
1329
1330def detect_exe_version(output, version_re=None, unrecognized='present'):
1331 assert isinstance(output, compat_str)
1332 if version_re is None:
1333 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1334 m = re.search(version_re, output)
1335 if m:
1336 return m.group(1)
1337 else:
1338 return unrecognized
1339
1340
1341class PagedList(object):
1342 def __len__(self):
1343 # This is only useful for tests
1344 return len(self.getslice())
1345
1346
1347class OnDemandPagedList(PagedList):
1348 def __init__(self, pagefunc, pagesize):
1349 self._pagefunc = pagefunc
1350 self._pagesize = pagesize
1351
1352 def getslice(self, start=0, end=None):
1353 res = []
1354 for pagenum in itertools.count(start // self._pagesize):
1355 firstid = pagenum * self._pagesize
1356 nextfirstid = pagenum * self._pagesize + self._pagesize
1357 if start >= nextfirstid:
1358 continue
1359
1360 page_results = list(self._pagefunc(pagenum))
1361
1362 startv = (
1363 start % self._pagesize
1364 if firstid <= start < nextfirstid
1365 else 0)
1366
1367 endv = (
1368 ((end - 1) % self._pagesize) + 1
1369 if (end is not None and firstid <= end <= nextfirstid)
1370 else None)
1371
1372 if startv != 0 or endv is not None:
1373 page_results = page_results[startv:endv]
1374 res.extend(page_results)
1375
1376 # A little optimization: if the current page is not "full", i.e. does
1377 # not contain page_size videos, then we can assume that this page
1378 # is the last one - there are no more ids on further pages,
1379 # so there is no need to query again.
1380 if len(page_results) + startv < self._pagesize:
1381 break
1382
1383 # If we got the whole page, but the next page is not interesting,
1384 # break out early as well
1385 if end == nextfirstid:
1386 break
1387 return res
1388
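# Usage sketch (illustrative page function): pages are fetched lazily and only
# as far as the requested slice needs.
#
#     fetch_page = lambda n: list(range(10))[n * 2:(n + 1) * 2]  # made-up source
#     OnDemandPagedList(fetch_page, 2).getslice(3, 7)   # -> [3, 4, 5, 6]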
1389
1390class InAdvancePagedList(PagedList):
1391 def __init__(self, pagefunc, pagecount, pagesize):
1392 self._pagefunc = pagefunc
1393 self._pagecount = pagecount
1394 self._pagesize = pagesize
1395
1396 def getslice(self, start=0, end=None):
1397 res = []
1398 start_page = start // self._pagesize
1399 end_page = (
1400 self._pagecount if end is None else (end // self._pagesize + 1))
1401 skip_elems = start - start_page * self._pagesize
1402 only_more = None if end is None else end - start
1403 for pagenum in range(start_page, end_page):
1404 page = list(self._pagefunc(pagenum))
1405 if skip_elems:
1406 page = page[skip_elems:]
1407 skip_elems = None
1408 if only_more is not None:
1409 if len(page) < only_more:
1410 only_more -= len(page)
1411 else:
1412 page = page[:only_more]
1413 res.extend(page)
1414 break
1415 res.extend(page)
1416 return res
1417
1418
1419def uppercase_escape(s):
1420 unicode_escape = codecs.getdecoder('unicode_escape')
1421 return re.sub(
1422 r'\\U[0-9a-fA-F]{8}',
1423 lambda m: unicode_escape(m.group(0))[0],
1424 s)
1425
1426
1427def escape_rfc3986(s):
1428 """Escape non-ASCII characters as suggested by RFC 3986"""
1429 if sys.version_info < (3, 0) and isinstance(s, unicode):
1430 s = s.encode('utf-8')
1431 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1432
1433
1434def escape_url(url):
1435 """Escape URL as suggested by RFC 3986"""
1436 url_parsed = compat_urllib_parse_urlparse(url)
1437 return url_parsed._replace(
1438 path=escape_rfc3986(url_parsed.path),
1439 params=escape_rfc3986(url_parsed.params),
1440 query=escape_rfc3986(url_parsed.query),
1441 fragment=escape_rfc3986(url_parsed.fragment)
1442 ).geturl()
1443
1444try:
1445 struct.pack('!I', 0)
1446except TypeError:
1447 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1448 def struct_pack(spec, *args):
1449 if isinstance(spec, compat_str):
1450 spec = spec.encode('ascii')
1451 return struct.pack(spec, *args)
1452
1453 def struct_unpack(spec, *args):
1454 if isinstance(spec, compat_str):
1455 spec = spec.encode('ascii')
1456 return struct.unpack(spec, *args)
1457else:
1458 struct_pack = struct.pack
1459 struct_unpack = struct.unpack
1460
1461
1462def read_batch_urls(batch_fd):
1463 def fixup(url):
1464 if not isinstance(url, compat_str):
1465 url = url.decode('utf-8', 'replace')
1466 BOM_UTF8 = '\xef\xbb\xbf'
1467 if url.startswith(BOM_UTF8):
1468 url = url[len(BOM_UTF8):]
1469 url = url.strip()
1470 if url.startswith(('#', ';', ']')):
1471 return False
1472 return url
1473
1474 with contextlib.closing(batch_fd) as fd:
1475 return [url for url in map(fixup, fd) if url]
1476
1477
1478def urlencode_postdata(*args, **kargs):
1479 return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
1480
1481
1482try:
1483 etree_iter = xml.etree.ElementTree.Element.iter
1484except AttributeError: # Python <=2.6
1485 etree_iter = lambda n: n.findall('.//*')
1486
1487
1488def parse_xml(s):
1489 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1490 def doctype(self, name, pubid, system):
1491 pass # Ignore doctypes
1492
1493 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1494 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1495 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1496 # Fix up XML parser in Python 2.x
1497 if sys.version_info < (3, 0):
1498 for n in etree_iter(tree):
1499 if n.text is not None:
1500 if not isinstance(n.text, compat_str):
1501 n.text = n.text.decode('utf-8')
1502 return tree
1503
1504
1505US_RATINGS = {
1506 'G': 0,
1507 'PG': 10,
1508 'PG-13': 13,
1509 'R': 16,
1510 'NC': 18,
1511}
1512
1513
1514def parse_age_limit(s):
1515 if s is None:
1516 return None
1517 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1518 return int(m.group('age')) if m else US_RATINGS.get(s, None)
1519
1520
1521def strip_jsonp(code):
1522 return re.sub(
1523 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1524
1525
1526def js_to_json(code):
1527 def fix_kv(m):
1528 v = m.group(0)
1529 if v in ('true', 'false', 'null'):
1530 return v
1531 if v.startswith('"'):
1532 return v
1533 if v.startswith("'"):
1534 v = v[1:-1]
1535 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1536 '\\\\': '\\\\',
1537 "\\'": "'",
1538 '"': '\\"',
1539 }[m.group(0)], v)
1540 return '"%s"' % v
1541
1542 res = re.sub(r'''(?x)
1543 "(?:[^"\\]*(?:\\\\|\\")?)*"|
1544 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1545 [a-zA-Z_][a-zA-Z_0-9]*
1546 ''', fix_kv, code)
1547 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1548 return res
1549
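# Example (illustrative snippet): single-quoted strings and bare keys are
# rewritten so that the result can be fed to json.loads; this is a best-effort
# transformation, not a full JavaScript parser.
#
#     js_to_json("{'abc': true}")   # -> '{"abc": true}'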
1550
1551def qualities(quality_ids):
1552 """ Get a numeric quality value out of a list of possible values """
1553 def q(qid):
1554 try:
1555 return quality_ids.index(qid)
1556 except ValueError:
1557 return -1
1558 return q
1559
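# Example (illustrative format ids): a higher list position means a higher
# preference; unknown ids sort below everything else.
#
#     q = qualities(['small', 'medium', 'hd720'])
#     q('hd720'), q('small'), q('4k')   # -> (2, 0, -1)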
1560
1561DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1562
1563
1564def limit_length(s, length):
1565 """ Add ellipses to overly long strings """
1566 if s is None:
1567 return None
1568 ELLIPSES = '...'
1569 if len(s) > length:
1570 return s[:length - len(ELLIPSES)] + ELLIPSES
1571 return s
1572
1573
1574def version_tuple(v):
1575 return tuple(int(e) for e in re.split(r'[-.]', v))
1576
1577
1578def is_outdated_version(version, limit, assume_new=True):
1579 if not version:
1580 return not assume_new
1581 try:
1582 return version_tuple(version) < version_tuple(limit)
1583 except ValueError:
1584 return not assume_new
1585
1586
1587def ytdl_is_updateable():
1588 """ Returns if youtube-dl can be updated with -U """
1589 from zipimport import zipimporter
1590
1591 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
1592
1593
1594def args_to_str(args):
1595 # Get a short string representation for a subprocess command
1596 return ' '.join(shlex_quote(a) for a in args)
1597
1598
1599def urlhandle_detect_ext(url_handle):
1600 try:
1601 url_handle.headers
1602 getheader = lambda h: url_handle.headers[h]
1603 except AttributeError: # Python < 3
1604 getheader = url_handle.info().getheader
1605
1606 cd = getheader('Content-Disposition')
1607 if cd:
1608 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1609 if m:
1610 e = determine_ext(m.group('filename'), default_ext=None)
1611 if e:
1612 return e
1613
1614 return getheader('Content-Type').split("/")[1]
1615
1616
1617def age_restricted(content_limit, age_limit):
1618 """ Returns True iff the content should be blocked """
1619
1620 if age_limit is None: # No limit set
1621 return False
1622 if content_limit is None:
1623 return False # Content available for everyone
1624 return age_limit < content_limit
1625
1626
1627def is_html(first_bytes):
1628 """ Detect whether a file contains HTML by examining its first bytes. """
1629
1630 BOMS = [
1631 (b'\xef\xbb\xbf', 'utf-8'),
1632 (b'\x00\x00\xfe\xff', 'utf-32-be'),
1633 (b'\xff\xfe\x00\x00', 'utf-32-le'),
1634 (b'\xff\xfe', 'utf-16-le'),
1635 (b'\xfe\xff', 'utf-16-be'),
1636 ]
1637 for bom, enc in BOMS:
1638 if first_bytes.startswith(bom):
1639 s = first_bytes[len(bom):].decode(enc, 'replace')
1640 break
1641 else:
1642 s = first_bytes.decode('utf-8', 'replace')
1643
1644 return re.match(r'^\s*<', s)
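# Example (illustrative byte strings): a BOM, if present, is honoured before
# sniffing for a leading '<'.
#
#     bool(is_html(b'\xef\xbb\xbf  <!DOCTYPE html><html>'))   # -> True
#     bool(is_html(b'{"formats": []}'))                       # -> False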