1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import calendar
7 import codecs
8 import contextlib
9 import ctypes
10 import datetime
11 import email.utils
12 import errno
13 import functools
14 import gzip
15 import itertools
16 import io
17 import json
18 import locale
19 import math
20 import operator
21 import os
22 import pipes
23 import platform
24 import re
25 import ssl
26 import socket
27 import struct
28 import subprocess
29 import sys
30 import tempfile
31 import traceback
32 import xml.etree.ElementTree
33 import zlib
34
35 from .compat import (
36 compat_basestring,
37 compat_chr,
38 compat_getenv,
39 compat_html_entities,
40 compat_http_client,
41 compat_parse_qs,
42 compat_socket_create_connection,
43 compat_str,
44 compat_urllib_error,
45 compat_urllib_parse,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
48 compat_urlparse,
49 shlex_quote,
50 )
51
52
53 # This is not clearly defined otherwise
54 compiled_regex_type = type(re.compile(''))
55
56 std_headers = {
57 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
58 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
59 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
60 'Accept-Encoding': 'gzip, deflate',
61 'Accept-Language': 'en-us,en;q=0.5',
62 }
63
64
65 def preferredencoding():
66 """Get preferred encoding.
67
68 Returns the best encoding scheme for the system, based on
69 locale.getpreferredencoding() and some further tweaks.
70 """
71 try:
72 pref = locale.getpreferredencoding()
73 'TEST'.encode(pref)
74 except:
75 pref = 'UTF-8'
76
77 return pref
78
79
80 def write_json_file(obj, fn):
81 """ Encode obj as JSON and write it to fn, atomically if possible """
82
83 fn = encodeFilename(fn)
84 if sys.version_info < (3, 0) and sys.platform != 'win32':
85 encoding = get_filesystem_encoding()
86 # os.path.basename returns a bytes object, but NamedTemporaryFile
87 # will fail if the filename contains non ascii characters unless we
88 # use a unicode object
89 path_basename = lambda f: os.path.basename(f).decode(encoding)
90 # the same for os.path.dirname
91 path_dirname = lambda f: os.path.dirname(f).decode(encoding)
92 else:
93 path_basename = os.path.basename
94 path_dirname = os.path.dirname
95
96 args = {
97 'suffix': '.tmp',
98 'prefix': path_basename(fn) + '.',
99 'dir': path_dirname(fn),
100 'delete': False,
101 }
102
103 # In Python 2.x, json.dump expects a bytestream.
104 # In Python 3.x, it writes to a character stream
105 if sys.version_info < (3, 0):
106 args['mode'] = 'wb'
107 else:
108 args.update({
109 'mode': 'w',
110 'encoding': 'utf-8',
111 })
112
113 tf = tempfile.NamedTemporaryFile(**args)
114
115 try:
116 with tf:
117 json.dump(obj, tf)
118 if sys.platform == 'win32':
119 # Need to remove existing file on Windows, else os.rename raises
120 # WindowsError or FileExistsError.
121 try:
122 os.unlink(fn)
123 except OSError:
124 pass
125 os.rename(tf.name, fn)
126 except:
127 try:
128 os.remove(tf.name)
129 except OSError:
130 pass
131 raise
132
133
134 if sys.version_info >= (2, 7):
135 def find_xpath_attr(node, xpath, key, val):
136 """ Find the xpath xpath[@key=val] """
137 assert re.match(r'^[a-zA-Z-]+$', key)
138 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
139 expr = xpath + "[@%s='%s']" % (key, val)
140 return node.find(expr)
141 else:
142 def find_xpath_attr(node, xpath, key, val):
143 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
144 # .//node does not match if a node is a direct child of . !
145 if isinstance(xpath, compat_str):
146 xpath = xpath.encode('ascii')
147
148 for f in node.findall(xpath):
149 if f.attrib.get(key) == val:
150 return f
151 return None
152
153 # On python2.6 the xml.etree.ElementTree.Element methods don't support
154 # the namespace parameter
155
156
157 def xpath_with_ns(path, ns_map):
158 components = [c.split(':') for c in path.split('/')]
159 replaced = []
160 for c in components:
161 if len(c) == 1:
162 replaced.append(c[0])
163 else:
164 ns, tag = c
165 replaced.append('{%s}%s' % (ns_map[ns], tag))
166 return '/'.join(replaced)
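
# Illustrative usage sketch (editor's addition, never called): shows how xpath_with_ns
# expands a prefixed path using a caller-supplied namespace map. The URI below is assumed.
def _xpath_with_ns_example():
    ns_map = {'media': 'http://example.com/ns'}
    expanded = xpath_with_ns('media:song/media:url', ns_map)
    assert expanded == '{http://example.com/ns}song/{http://example.com/ns}url'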
167
168
169 def xpath_text(node, xpath, name=None, fatal=False):
170 if sys.version_info < (2, 7): # Crazy 2.6
171 xpath = xpath.encode('ascii')
172
173 n = node.find(xpath)
174 if n is None or n.text is None:
175 if fatal:
176 name = xpath if name is None else name
177 raise ExtractorError('Could not find XML element %s' % name)
178 else:
179 return None
180 return n.text
181
182
183 def get_element_by_id(id, html):
184 """Return the content of the tag with the specified ID in the passed HTML document"""
185 return get_element_by_attribute("id", id, html)
186
187
188 def get_element_by_attribute(attribute, value, html):
189 """Return the content of the tag with the specified attribute in the passed HTML document"""
190
191 m = re.search(r'''(?xs)
192 <([a-zA-Z0-9:._-]+)
193 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
194 \s+%s=['"]?%s['"]?
195 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
196 \s*>
197 (?P<content>.*?)
198 </\1>
199 ''' % (re.escape(attribute), re.escape(value)), html)
200
201 if not m:
202 return None
203 res = m.group('content')
204
205 if res.startswith('"') or res.startswith("'"):
206 res = res[1:-1]
207
208 return unescapeHTML(res)
209
210
211 def clean_html(html):
212 """Clean an HTML snippet into a readable string"""
213
214 if html is None: # Convenience for sanitizing descriptions etc.
215 return html
216
217 # Newline vs <br />
218 html = html.replace('\n', ' ')
219 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
220 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
221 # Strip html tags
222 html = re.sub('<.*?>', '', html)
223 # Replace html entities
224 html = unescapeHTML(html)
225 return html.strip()
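
# Illustrative usage sketch (editor's addition, never called): <br> becomes a newline,
# remaining tags are stripped and entities are decoded. The input string is assumed.
def _clean_html_example():
    assert clean_html('<p>Hello &amp; goodbye<br>world</p>') == 'Hello & goodbye\nworld'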
226
227
228 def sanitize_open(filename, open_mode):
229 """Try to open the given filename, and slightly tweak it if this fails.
230
231 Attempts to open the given filename. If this fails, it tries to change
232 the filename slightly, step by step, until it's either able to open it
233 or it fails and raises a final exception, like the standard open()
234 function.
235
236 It returns the tuple (stream, definitive_file_name).
237 """
238 try:
239 if filename == '-':
240 if sys.platform == 'win32':
241 import msvcrt
242 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
243 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
244 stream = open(encodeFilename(filename), open_mode)
245 return (stream, filename)
246 except (IOError, OSError) as err:
247 if err.errno in (errno.EACCES,):
248 raise
249
250 # In case of error, try to remove win32 forbidden chars
251 alt_filename = os.path.join(*(
252 re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
253 for path_part in os.path.split(filename)
254 ))
255 if alt_filename == filename:
256 raise
257 else:
258 # An exception here should be caught in the caller
259 stream = open(encodeFilename(filename), open_mode)
260 return (stream, alt_filename)
261
262
263 def timeconvert(timestr):
264 """Convert RFC 2822 defined time string into system timestamp"""
265 timestamp = None
266 timetuple = email.utils.parsedate_tz(timestr)
267 if timetuple is not None:
268 timestamp = email.utils.mktime_tz(timetuple)
269 return timestamp
270
271
272 def sanitize_filename(s, restricted=False, is_id=False):
273 """Sanitizes a string so it could be used as part of a filename.
274 If restricted is set, use a stricter subset of allowed characters.
275 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
276 """
277 def replace_insane(char):
278 if char == '?' or ord(char) < 32 or ord(char) == 127:
279 return ''
280 elif char == '"':
281 return '' if restricted else '\''
282 elif char == ':':
283 return '_-' if restricted else ' -'
284 elif char in '\\/|*<>':
285 return '_'
286 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
287 return '_'
288 if restricted and ord(char) > 127:
289 return '_'
290 return char
291
292 # Handle timestamps
293 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
294 result = ''.join(map(replace_insane, s))
295 if not is_id:
296 while '__' in result:
297 result = result.replace('__', '_')
298 result = result.strip('_')
299 # Common case of "Foreign band name - English song title"
300 if restricted and result.startswith('-_'):
301 result = result[2:]
302 if not result:
303 result = '_'
304 return result
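
# Illustrative usage sketch (editor's addition, never called): the expected values follow
# from replace_insane() above; restricted mode maps spaces to '_' and ':' to '_-'.
def _sanitize_filename_example():
    assert sanitize_filename('AC/DC: Back In Black') == 'AC_DC - Back In Black'
    assert sanitize_filename('AC/DC: Back In Black', restricted=True) == 'AC_DC_-_Back_In_Black'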
305
306
307 def orderedSet(iterable):
308 """ Remove all duplicates from the input iterable """
309 res = []
310 for el in iterable:
311 if el not in res:
312 res.append(el)
313 return res
314
315
316 def _htmlentity_transform(entity):
317 """Transforms an HTML entity to a character."""
318 # Known non-numeric HTML entity
319 if entity in compat_html_entities.name2codepoint:
320 return compat_chr(compat_html_entities.name2codepoint[entity])
321
322 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
323 if mobj is not None:
324 numstr = mobj.group(1)
325 if numstr.startswith('x'):
326 base = 16
327 numstr = '0%s' % numstr
328 else:
329 base = 10
330 return compat_chr(int(numstr, base))
331
332 # Unknown entity in name, return its literal representation
333 return ('&%s;' % entity)
334
335
336 def unescapeHTML(s):
337 if s is None:
338 return None
339 assert type(s) == compat_str
340
341 return re.sub(
342 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
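
# Illustrative usage sketch (editor's addition, never called): both named and numeric
# entities are resolved through _htmlentity_transform(). The input string is assumed.
def _unescapeHTML_example():
    assert unescapeHTML('Fish &amp; Chips &#38; more') == 'Fish & Chips & more'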
343
344
345 def encodeFilename(s, for_subprocess=False):
346 """
347 @param s The name of the file
348 """
349
350 assert type(s) == compat_str
351
352 # Python 3 has a Unicode API
353 if sys.version_info >= (3, 0):
354 return s
355
356 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
357 # Pass '' directly to use Unicode APIs on Windows 2000 and up
358 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
359 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
360 if not for_subprocess:
361 return s
362 else:
363 # For subprocess calls, encode with locale encoding
364 # Refer to http://stackoverflow.com/a/9951851/35070
365 encoding = preferredencoding()
366 else:
367 encoding = sys.getfilesystemencoding()
368 if encoding is None:
369 encoding = 'utf-8'
370 return s.encode(encoding, 'ignore')
371
372
373 def encodeArgument(s):
374 if not isinstance(s, compat_str):
375 # Legacy code that uses byte strings
376 # Uncomment the following line after fixing all post processors
377 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
378 s = s.decode('ascii')
379 return encodeFilename(s, True)
380
381
382 def decodeOption(optval):
383 if optval is None:
384 return optval
385 if isinstance(optval, bytes):
386 optval = optval.decode(preferredencoding())
387
388 assert isinstance(optval, compat_str)
389 return optval
390
391
392 def formatSeconds(secs):
393 if secs > 3600:
394 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
395 elif secs > 60:
396 return '%d:%02d' % (secs // 60, secs % 60)
397 else:
398 return '%d' % secs
399
400
401 def make_HTTPS_handler(params, **kwargs):
402 opts_no_check_certificate = params.get('nocheckcertificate', False)
403 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
404 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
405 if opts_no_check_certificate:
406 context.check_hostname = False
407 context.verify_mode = ssl.CERT_NONE
408 try:
409 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
410 except TypeError:
411 # Python 2.7.8
412 # (create_default_context present but HTTPSHandler has no context=)
413 pass
414
415 if sys.version_info < (3, 2):
416 return YoutubeDLHTTPSHandler(params, **kwargs)
417 else: # Python < 3.4
418 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
419 context.verify_mode = (ssl.CERT_NONE
420 if opts_no_check_certificate
421 else ssl.CERT_REQUIRED)
422 context.set_default_verify_paths()
423 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
424
425
426 class ExtractorError(Exception):
427 """Error during info extraction."""
428
429 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
430 """ tb, if given, is the original traceback (so that it can be printed out).
431 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
432 """
433
434 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
435 expected = True
436 if video_id is not None:
437 msg = video_id + ': ' + msg
438 if cause:
439 msg += ' (caused by %r)' % cause
440 if not expected:
441 if ytdl_is_updateable():
442 update_cmd = 'type youtube-dl -U to update'
443 else:
444 update_cmd = 'see https://yt-dl.org/update on how to update'
445 msg += '; please report this issue on https://yt-dl.org/bug .'
446 msg += ' Make sure you are using the latest version; %s.' % update_cmd
447 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
448 super(ExtractorError, self).__init__(msg)
449
450 self.traceback = tb
451 self.exc_info = sys.exc_info() # preserve original exception
452 self.cause = cause
453 self.video_id = video_id
454
455 def format_traceback(self):
456 if self.traceback is None:
457 return None
458 return ''.join(traceback.format_tb(self.traceback))
459
460
461 class UnsupportedError(ExtractorError):
462 def __init__(self, url):
463 super(UnsupportedError, self).__init__(
464 'Unsupported URL: %s' % url, expected=True)
465 self.url = url
466
467
468 class RegexNotFoundError(ExtractorError):
469 """Error when a regex didn't match"""
470 pass
471
472
473 class DownloadError(Exception):
474 """Download Error exception.
475
476 This exception may be thrown by FileDownloader objects if they are not
477 configured to continue on errors. They will contain the appropriate
478 error message.
479 """
480
481 def __init__(self, msg, exc_info=None):
482 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
483 super(DownloadError, self).__init__(msg)
484 self.exc_info = exc_info
485
486
487 class SameFileError(Exception):
488 """Same File exception.
489
490 This exception will be thrown by FileDownloader objects if they detect
491 multiple files would have to be downloaded to the same file on disk.
492 """
493 pass
494
495
496 class PostProcessingError(Exception):
497 """Post Processing exception.
498
499 This exception may be raised by PostProcessor's .run() method to
500 indicate an error in the postprocessing task.
501 """
502
503 def __init__(self, msg):
504 self.msg = msg
505
506
507 class MaxDownloadsReached(Exception):
508 """ --max-downloads limit has been reached. """
509 pass
510
511
512 class UnavailableVideoError(Exception):
513 """Unavailable Format exception.
514
515 This exception will be thrown when a video is requested
516 in a format that is not available for that video.
517 """
518 pass
519
520
521 class ContentTooShortError(Exception):
522 """Content Too Short exception.
523
524 This exception may be raised by FileDownloader objects when a file they
525 download is too small for what the server announced first, indicating
526 the connection was probably interrupted.
527 """
528 # Both in bytes
529 downloaded = None
530 expected = None
531
532 def __init__(self, downloaded, expected):
533 self.downloaded = downloaded
534 self.expected = expected
535
536
537 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
538 hc = http_class(*args, **kwargs)
539 source_address = ydl_handler._params.get('source_address')
540 if source_address is not None:
541 sa = (source_address, 0)
542 if hasattr(hc, 'source_address'): # Python 2.7+
543 hc.source_address = sa
544 else: # Python 2.6
545 def _hc_connect(self, *args, **kwargs):
546 sock = compat_socket_create_connection(
547 (self.host, self.port), self.timeout, sa)
548 if is_https:
549 self.sock = ssl.wrap_socket(
550 sock, self.key_file, self.cert_file,
551 ssl_version=ssl.PROTOCOL_TLSv1)
552 else:
553 self.sock = sock
554 hc.connect = functools.partial(_hc_connect, hc)
555
556 return hc
557
558
559 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
560 """Handler for HTTP requests and responses.
561
562 This class, when installed with an OpenerDirector, automatically adds
563 the standard headers to every HTTP request and handles gzipped and
564 deflated responses from web servers. If compression is to be avoided in
565 a particular request, the original request in the program code only has
566 to include the HTTP header "Youtubedl-No-Compression", which will be
567 removed before making the real request.
568
569 Part of this code was copied from:
570
571 http://techknack.net/python-urllib2-handlers/
572
573 Andrew Rowls, the author of that code, agreed to release it to the
574 public domain.
575 """
576
577 def __init__(self, params, *args, **kwargs):
578 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
579 self._params = params
580
581 def http_open(self, req):
582 return self.do_open(functools.partial(
583 _create_http_connection, self, compat_http_client.HTTPConnection, False),
584 req)
585
586 @staticmethod
587 def deflate(data):
588 try:
589 return zlib.decompress(data, -zlib.MAX_WBITS)
590 except zlib.error:
591 return zlib.decompress(data)
592
593 @staticmethod
594 def addinfourl_wrapper(stream, headers, url, code):
595 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
596 return compat_urllib_request.addinfourl(stream, headers, url, code)
597 ret = compat_urllib_request.addinfourl(stream, headers, url)
598 ret.code = code
599 return ret
600
601 def http_request(self, req):
602 for h, v in std_headers.items():
603 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
604 # The dict keys are capitalized because of this bug by urllib
605 if h.capitalize() not in req.headers:
606 req.add_header(h, v)
607 if 'Youtubedl-no-compression' in req.headers:
608 if 'Accept-encoding' in req.headers:
609 del req.headers['Accept-encoding']
610 del req.headers['Youtubedl-no-compression']
611
612 if sys.version_info < (2, 7) and '#' in req.get_full_url():
613 # Python 2.6 is brain-dead when it comes to fragments
614 req._Request__original = req._Request__original.partition('#')[0]
615 req._Request__r_type = req._Request__r_type.partition('#')[0]
616
617 return req
618
619 def http_response(self, req, resp):
620 old_resp = resp
621 # gzip
622 if resp.headers.get('Content-encoding', '') == 'gzip':
623 content = resp.read()
624 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
625 try:
626 uncompressed = io.BytesIO(gz.read())
627 except IOError as original_ioerror:
628 # There may be junk at the end of the file
629 # See http://stackoverflow.com/q/4928560/35070 for details
630 for i in range(1, 1024):
631 try:
632 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
633 uncompressed = io.BytesIO(gz.read())
634 except IOError:
635 continue
636 break
637 else:
638 raise original_ioerror
639 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
640 resp.msg = old_resp.msg
641 # deflate
642 if resp.headers.get('Content-encoding', '') == 'deflate':
643 gz = io.BytesIO(self.deflate(resp.read()))
644 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
645 resp.msg = old_resp.msg
646 return resp
647
648 https_request = http_request
649 https_response = http_response
650
651
652 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
653 def __init__(self, params, https_conn_class=None, *args, **kwargs):
654 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
655 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
656 self._params = params
657
658 def https_open(self, req):
659 kwargs = {}
660 if hasattr(self, '_context'): # python > 2.6
661 kwargs['context'] = self._context
662 if hasattr(self, '_check_hostname'): # python 3.x
663 kwargs['check_hostname'] = self._check_hostname
664 return self.do_open(functools.partial(
665 _create_http_connection, self, self._https_conn_class, True),
666 req, **kwargs)
667
668
669 def parse_iso8601(date_str, delimiter='T'):
670 """ Return a UNIX timestamp from the given date """
671
672 if date_str is None:
673 return None
674
675 m = re.search(
676 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
677 date_str)
678 if not m:
679 timezone = datetime.timedelta()
680 else:
681 date_str = date_str[:-len(m.group(0))]
682 if not m.group('sign'):
683 timezone = datetime.timedelta()
684 else:
685 sign = 1 if m.group('sign') == '+' else -1
686 timezone = datetime.timedelta(
687 hours=sign * int(m.group('hours')),
688 minutes=sign * int(m.group('minutes')))
689 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
690 dt = datetime.datetime.strptime(date_str, date_format) - timezone
691 return calendar.timegm(dt.timetuple())
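
# Illustrative usage sketch (editor's addition, never called): a trailing 'Z' means UTC,
# so the result is the plain UNIX timestamp of the given wall-clock time.
def _parse_iso8601_example():
    assert parse_iso8601('2015-02-10T13:00:00Z') == 1423573200
    assert parse_iso8601(None) is None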
692
693
694 def unified_strdate(date_str, day_first=True):
695 """Return a string with the date in the format YYYYMMDD"""
696
697 if date_str is None:
698 return None
699 upload_date = None
700 # Replace commas
701 date_str = date_str.replace(',', ' ')
702 # %z (UTC offset) is only supported in python>=3.2
703 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
704 # Remove AM/PM + timezone
705 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
706
707 format_expressions = [
708 '%d %B %Y',
709 '%d %b %Y',
710 '%B %d %Y',
711 '%b %d %Y',
712 '%b %dst %Y %I:%M%p',
713 '%b %dnd %Y %I:%M%p',
714 '%b %dth %Y %I:%M%p',
715 '%Y %m %d',
716 '%Y-%m-%d',
717 '%Y/%m/%d',
718 '%Y/%m/%d %H:%M:%S',
719 '%Y-%m-%d %H:%M:%S',
720 '%Y-%m-%d %H:%M:%S.%f',
721 '%d.%m.%Y %H:%M',
722 '%d.%m.%Y %H.%M',
723 '%Y-%m-%dT%H:%M:%SZ',
724 '%Y-%m-%dT%H:%M:%S.%fZ',
725 '%Y-%m-%dT%H:%M:%S.%f0Z',
726 '%Y-%m-%dT%H:%M:%S',
727 '%Y-%m-%dT%H:%M:%S.%f',
728 '%Y-%m-%dT%H:%M',
729 ]
730 if day_first:
731 format_expressions.extend([
732 '%d.%m.%Y',
733 '%d/%m/%Y',
734 '%d/%m/%y',
735 '%d/%m/%Y %H:%M:%S',
736 ])
737 else:
738 format_expressions.extend([
739 '%m.%d.%Y',
740 '%m/%d/%Y',
741 '%m/%d/%y',
742 '%m/%d/%Y %H:%M:%S',
743 ])
744 for expression in format_expressions:
745 try:
746 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
747 except ValueError:
748 pass
749 if upload_date is None:
750 timetuple = email.utils.parsedate_tz(date_str)
751 if timetuple:
752 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
753 return upload_date
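
# Illustrative usage sketch (editor's addition, never called): day_first decides how an
# ambiguous numeric date is read; the textual form is unambiguous.
def _unified_strdate_example():
    assert unified_strdate('December 21, 2014') == '20141221'
    assert unified_strdate('11/12/2014') == '20141211'                   # day first (default)
    assert unified_strdate('11/12/2014', day_first=False) == '20141112'  # month first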
754
755
756 def determine_ext(url, default_ext='unknown_video'):
757 if url is None:
758 return default_ext
759 guess = url.partition('?')[0].rpartition('.')[2]
760 if re.match(r'^[A-Za-z0-9]+$', guess):
761 return guess
762 else:
763 return default_ext
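
# Illustrative usage sketch (editor's addition, never called): the extension is taken from
# the path before any query string; unrecognizable URLs fall back to the default.
def _determine_ext_example():
    assert determine_ext('http://example.com/video.mp4?start=10') == 'mp4'
    assert determine_ext(None) == 'unknown_video'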
764
765
766 def subtitles_filename(filename, sub_lang, sub_format):
767 return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
768
769
770 def date_from_str(date_str):
771 """
772 Return a datetime object from a string in the format YYYYMMDD or
773 (now|today)[+-][0-9](day|week|month|year)(s)?"""
774 today = datetime.date.today()
775 if date_str in ('now', 'today'):
776 return today
777 if date_str == 'yesterday':
778 return today - datetime.timedelta(days=1)
779 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
780 if match is not None:
781 sign = match.group('sign')
782 time = int(match.group('time'))
783 if sign == '-':
784 time = -time
785 unit = match.group('unit')
786 # A rough approximation (month and year lengths vary)
787 if unit == 'month':
788 unit = 'day'
789 time *= 30
790 elif unit == 'year':
791 unit = 'day'
792 time *= 365
793 unit += 's'
794 delta = datetime.timedelta(**{unit: time})
795 return today + delta
796 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
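
# Illustrative usage sketch (editor's addition, never called): both the absolute YYYYMMDD
# form and the relative now/today form described in the docstring are accepted.
def _date_from_str_example():
    assert date_from_str('20150210') == datetime.date(2015, 2, 10)
    assert date_from_str('now-1week') == datetime.date.today() - datetime.timedelta(weeks=1)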
797
798
799 def hyphenate_date(date_str):
800 """
801 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
802 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
803 if match is not None:
804 return '-'.join(match.groups())
805 else:
806 return date_str
807
808
809 class DateRange(object):
810 """Represents a time interval between two dates"""
811
812 def __init__(self, start=None, end=None):
813 """start and end must be strings in the format accepted by date"""
814 if start is not None:
815 self.start = date_from_str(start)
816 else:
817 self.start = datetime.datetime.min.date()
818 if end is not None:
819 self.end = date_from_str(end)
820 else:
821 self.end = datetime.datetime.max.date()
822 if self.start > self.end:
823 raise ValueError('Date range: "%s": the start date must be before the end date' % self)
824
825 @classmethod
826 def day(cls, day):
827 """Returns a range that only contains the given day"""
828 return cls(day, day)
829
830 def __contains__(self, date):
831 """Check if the date is in the range"""
832 if not isinstance(date, datetime.date):
833 date = date_from_str(date)
834 return self.start <= date <= self.end
835
836 def __str__(self):
837 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
838
839
840 def platform_name():
841 """ Returns the platform name as a compat_str """
842 res = platform.platform()
843 if isinstance(res, bytes):
844 res = res.decode(preferredencoding())
845
846 assert isinstance(res, compat_str)
847 return res
848
849
850 def _windows_write_string(s, out):
851 """ Returns True if the string was written using special methods,
852 False if it has yet to be written out."""
853 # Adapted from http://stackoverflow.com/a/3259271/35070
854
855 import ctypes
856 import ctypes.wintypes
857
858 WIN_OUTPUT_IDS = {
859 1: -11,
860 2: -12,
861 }
862
863 try:
864 fileno = out.fileno()
865 except AttributeError:
866 # If the output stream doesn't have a fileno, it's virtual
867 return False
868 except io.UnsupportedOperation:
869 # Some strange Windows pseudo files?
870 return False
871 if fileno not in WIN_OUTPUT_IDS:
872 return False
873
874 GetStdHandle = ctypes.WINFUNCTYPE(
875 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
876 (b"GetStdHandle", ctypes.windll.kernel32))
877 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
878
879 WriteConsoleW = ctypes.WINFUNCTYPE(
880 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
881 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
882 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
883 written = ctypes.wintypes.DWORD(0)
884
885 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
886 FILE_TYPE_CHAR = 0x0002
887 FILE_TYPE_REMOTE = 0x8000
888 GetConsoleMode = ctypes.WINFUNCTYPE(
889 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
890 ctypes.POINTER(ctypes.wintypes.DWORD))(
891 (b"GetConsoleMode", ctypes.windll.kernel32))
892 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
893
894 def not_a_console(handle):
895 if handle == INVALID_HANDLE_VALUE or handle is None:
896 return True
897 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
898 or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
899
900 if not_a_console(h):
901 return False
902
903 def next_nonbmp_pos(s):
904 try:
905 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
906 except StopIteration:
907 return len(s)
908
909 while s:
910 count = min(next_nonbmp_pos(s), 1024)
911
912 ret = WriteConsoleW(
913 h, s, count if count else 2, ctypes.byref(written), None)
914 if ret == 0:
915 raise OSError('Failed to write string')
916 if not count: # We just wrote a non-BMP character
917 assert written.value == 2
918 s = s[1:]
919 else:
920 assert written.value > 0
921 s = s[written.value:]
922 return True
923
924
925 def write_string(s, out=None, encoding=None):
926 if out is None:
927 out = sys.stderr
928 assert type(s) == compat_str
929
930 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
931 if _windows_write_string(s, out):
932 return
933
934 if ('b' in getattr(out, 'mode', '') or
935 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
936 byt = s.encode(encoding or preferredencoding(), 'ignore')
937 out.write(byt)
938 elif hasattr(out, 'buffer'):
939 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
940 byt = s.encode(enc, 'ignore')
941 out.buffer.write(byt)
942 else:
943 out.write(s)
944 out.flush()
945
946
947 def bytes_to_intlist(bs):
948 if not bs:
949 return []
950 if isinstance(bs[0], int): # Python 3
951 return list(bs)
952 else:
953 return [ord(c) for c in bs]
954
955
956 def intlist_to_bytes(xs):
957 if not xs:
958 return b''
959 return struct_pack('%dB' % len(xs), *xs)
960
961
962 # Cross-platform file locking
963 if sys.platform == 'win32':
964 import ctypes.wintypes
965 import msvcrt
966
967 class OVERLAPPED(ctypes.Structure):
968 _fields_ = [
969 ('Internal', ctypes.wintypes.LPVOID),
970 ('InternalHigh', ctypes.wintypes.LPVOID),
971 ('Offset', ctypes.wintypes.DWORD),
972 ('OffsetHigh', ctypes.wintypes.DWORD),
973 ('hEvent', ctypes.wintypes.HANDLE),
974 ]
975
976 kernel32 = ctypes.windll.kernel32
977 LockFileEx = kernel32.LockFileEx
978 LockFileEx.argtypes = [
979 ctypes.wintypes.HANDLE, # hFile
980 ctypes.wintypes.DWORD, # dwFlags
981 ctypes.wintypes.DWORD, # dwReserved
982 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
983 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
984 ctypes.POINTER(OVERLAPPED) # Overlapped
985 ]
986 LockFileEx.restype = ctypes.wintypes.BOOL
987 UnlockFileEx = kernel32.UnlockFileEx
988 UnlockFileEx.argtypes = [
989 ctypes.wintypes.HANDLE, # hFile
990 ctypes.wintypes.DWORD, # dwReserved
991 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
992 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
993 ctypes.POINTER(OVERLAPPED) # Overlapped
994 ]
995 UnlockFileEx.restype = ctypes.wintypes.BOOL
996 whole_low = 0xffffffff
997 whole_high = 0x7fffffff
998
999 def _lock_file(f, exclusive):
1000 overlapped = OVERLAPPED()
1001 overlapped.Offset = 0
1002 overlapped.OffsetHigh = 0
1003 overlapped.hEvent = 0
1004 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1005 handle = msvcrt.get_osfhandle(f.fileno())
1006 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1007 whole_low, whole_high, f._lock_file_overlapped_p):
1008 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1009
1010 def _unlock_file(f):
1011 assert f._lock_file_overlapped_p
1012 handle = msvcrt.get_osfhandle(f.fileno())
1013 if not UnlockFileEx(handle, 0,
1014 whole_low, whole_high, f._lock_file_overlapped_p):
1015 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1016
1017 else:
1018 import fcntl
1019
1020 def _lock_file(f, exclusive):
1021 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1022
1023 def _unlock_file(f):
1024 fcntl.flock(f, fcntl.LOCK_UN)
1025
1026
1027 class locked_file(object):
1028 def __init__(self, filename, mode, encoding=None):
1029 assert mode in ['r', 'a', 'w']
1030 self.f = io.open(filename, mode, encoding=encoding)
1031 self.mode = mode
1032
1033 def __enter__(self):
1034 exclusive = self.mode != 'r'
1035 try:
1036 _lock_file(self.f, exclusive)
1037 except IOError:
1038 self.f.close()
1039 raise
1040 return self
1041
1042 def __exit__(self, etype, value, traceback):
1043 try:
1044 _unlock_file(self.f)
1045 finally:
1046 self.f.close()
1047
1048 def __iter__(self):
1049 return iter(self.f)
1050
1051 def write(self, *args):
1052 return self.f.write(*args)
1053
1054 def read(self, *args):
1055 return self.f.read(*args)
1056
1057
1058 def get_filesystem_encoding():
1059 encoding = sys.getfilesystemencoding()
1060 return encoding if encoding is not None else 'utf-8'
1061
1062
1063 def shell_quote(args):
1064 quoted_args = []
1065 encoding = get_filesystem_encoding()
1066 for a in args:
1067 if isinstance(a, bytes):
1068 # We may get a filename encoded with 'encodeFilename'
1069 a = a.decode(encoding)
1070 quoted_args.append(pipes.quote(a))
1071 return ' '.join(quoted_args)
1072
1073
1074 def takewhile_inclusive(pred, seq):
1075 """ Like itertools.takewhile, but include the latest evaluated element
1076 (the first element for which pred(e) is false) """
1077 for e in seq:
1078 yield e
1079 if not pred(e):
1080 return
1081
1082
1083 def smuggle_url(url, data):
1084 """ Pass additional data in a URL for internal use. """
1085
1086 sdata = compat_urllib_parse.urlencode(
1087 {'__youtubedl_smuggle': json.dumps(data)})
1088 return url + '#' + sdata
1089
1090
1091 def unsmuggle_url(smug_url, default=None):
1092 if '#__youtubedl_smuggle' not in smug_url:
1093 return smug_url, default
1094 url, _, sdata = smug_url.rpartition('#')
1095 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1096 data = json.loads(jsond)
1097 return url, data
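
# Illustrative usage sketch (editor's addition, never called): smuggle_url/unsmuggle_url
# round-trip arbitrary JSON-serializable data through the URL fragment.
def _smuggle_url_example():
    smugged = smuggle_url('http://example.com/video', {'referrer': 'http://embedder.example'})
    url, data = unsmuggle_url(smugged)
    assert url == 'http://example.com/video'
    assert data == {'referrer': 'http://embedder.example'}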
1098
1099
1100 def format_bytes(bytes):
1101 if bytes is None:
1102 return 'N/A'
1103 if type(bytes) is str:
1104 bytes = float(bytes)
1105 if bytes == 0.0:
1106 exponent = 0
1107 else:
1108 exponent = int(math.log(bytes, 1024.0))
1109 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1110 converted = float(bytes) / float(1024 ** exponent)
1111 return '%.2f%s' % (converted, suffix)
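
# Illustrative usage sketch (editor's addition, never called): sizes are rendered with
# binary (1024-based) suffixes and two decimals.
def _format_bytes_example():
    assert format_bytes(1536) == '1.50KiB'
    assert format_bytes(None) == 'N/A'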
1112
1113
1114 def parse_filesize(s):
1115 if s is None:
1116 return None
1117
1118 # The lower-case forms are of course incorrect and unofficial,
1119 # but we support those too
1120 _UNIT_TABLE = {
1121 'B': 1,
1122 'b': 1,
1123 'KiB': 1024,
1124 'KB': 1000,
1125 'kB': 1024,
1126 'Kb': 1000,
1127 'MiB': 1024 ** 2,
1128 'MB': 1000 ** 2,
1129 'mB': 1024 ** 2,
1130 'Mb': 1000 ** 2,
1131 'GiB': 1024 ** 3,
1132 'GB': 1000 ** 3,
1133 'gB': 1024 ** 3,
1134 'Gb': 1000 ** 3,
1135 'TiB': 1024 ** 4,
1136 'TB': 1000 ** 4,
1137 'tB': 1024 ** 4,
1138 'Tb': 1000 ** 4,
1139 'PiB': 1024 ** 5,
1140 'PB': 1000 ** 5,
1141 'pB': 1024 ** 5,
1142 'Pb': 1000 ** 5,
1143 'EiB': 1024 ** 6,
1144 'EB': 1000 ** 6,
1145 'eB': 1024 ** 6,
1146 'Eb': 1000 ** 6,
1147 'ZiB': 1024 ** 7,
1148 'ZB': 1000 ** 7,
1149 'zB': 1024 ** 7,
1150 'Zb': 1000 ** 7,
1151 'YiB': 1024 ** 8,
1152 'YB': 1000 ** 8,
1153 'yB': 1024 ** 8,
1154 'Yb': 1000 ** 8,
1155 }
1156
1157 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1158 m = re.match(
1159 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1160 if not m:
1161 return None
1162
1163 num_str = m.group('num').replace(',', '.')
1164 mult = _UNIT_TABLE[m.group('unit')]
1165 return int(float(num_str) * mult)
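
# Illustrative usage sketch (editor's addition, never called): binary suffixes use 1024,
# decimal suffixes use 1000, and ',' is accepted as a decimal separator.
def _parse_filesize_example():
    assert parse_filesize('5 MiB') == 5 * 1024 ** 2
    assert parse_filesize('1,5GB') == 1500000000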
1166
1167
1168 def get_term_width():
1169 columns = compat_getenv('COLUMNS', None)
1170 if columns:
1171 return int(columns)
1172
1173 try:
1174 sp = subprocess.Popen(
1175 ['stty', 'size'],
1176 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1177 out, err = sp.communicate()
1178 return int(out.split()[1])
1179 except:
1180 pass
1181 return None
1182
1183
1184 def month_by_name(name):
1185 """ Return the number of a month by (locale-independently) English name """
1186
1187 ENGLISH_NAMES = [
1188 'January', 'February', 'March', 'April', 'May', 'June',
1189 'July', 'August', 'September', 'October', 'November', 'December']
1190 try:
1191 return ENGLISH_NAMES.index(name) + 1
1192 except ValueError:
1193 return None
1194
1195
1196 def fix_xml_ampersands(xml_str):
1197 """Replace all the '&' by '&amp;' in XML"""
1198 return re.sub(
1199 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1200 '&amp;',
1201 xml_str)
1202
1203
1204 def setproctitle(title):
1205 assert isinstance(title, compat_str)
1206 try:
1207 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1208 except OSError:
1209 return
1210 title_bytes = title.encode('utf-8')
1211 buf = ctypes.create_string_buffer(len(title_bytes))
1212 buf.value = title_bytes
1213 try:
1214 libc.prctl(15, buf, 0, 0, 0)
1215 except AttributeError:
1216 return # Strange libc, just skip this
1217
1218
1219 def remove_start(s, start):
1220 if s.startswith(start):
1221 return s[len(start):]
1222 return s
1223
1224
1225 def remove_end(s, end):
1226 if s.endswith(end):
1227 return s[:-len(end)]
1228 return s
1229
1230
1231 def url_basename(url):
1232 path = compat_urlparse.urlparse(url).path
1233 return path.strip('/').split('/')[-1]
1234
1235
1236 class HEADRequest(compat_urllib_request.Request):
1237 def get_method(self):
1238 return "HEAD"
1239
1240
1241 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1242 if get_attr:
1243 if v is not None:
1244 v = getattr(v, get_attr, None)
1245 if v == '':
1246 v = None
1247 return default if v is None else (int(v) * invscale // scale)
1248
1249
1250 def str_or_none(v, default=None):
1251 return default if v is None else compat_str(v)
1252
1253
1254 def str_to_int(int_str):
1255 """ A more relaxed version of int_or_none """
1256 if int_str is None:
1257 return None
1258 int_str = re.sub(r'[,\.\+]', '', int_str)
1259 return int(int_str)
1260
1261
1262 def float_or_none(v, scale=1, invscale=1, default=None):
1263 return default if v is None else (float(v) * invscale / scale)
1264
1265
1266 def parse_duration(s):
1267 if not isinstance(s, compat_basestring):
1268 return None
1269
1270 s = s.strip()
1271
1272 m = re.match(
1273 r'''(?ix)(?:P?T)?
1274 (?:
1275 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1276 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1277
1278 (?:
1279 (?:
1280 (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1281 (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1282 )?
1283 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1284 )?
1285 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1286 )$''', s)
1287 if not m:
1288 return None
1289 res = 0
1290 if m.group('only_mins'):
1291 return float_or_none(m.group('only_mins'), invscale=60)
1292 if m.group('only_hours'):
1293 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1294 if m.group('secs'):
1295 res += int(m.group('secs'))
1296 if m.group('mins'):
1297 res += int(m.group('mins')) * 60
1298 if m.group('hours'):
1299 res += int(m.group('hours')) * 60 * 60
1300 if m.group('days'):
1301 res += int(m.group('days')) * 24 * 60 * 60
1302 if m.group('ms'):
1303 res += float(m.group('ms'))
1304 return res
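
# Illustrative usage sketch (editor's addition, never called): both colon-separated and
# "<number> <unit>" forms are normalized to seconds.
def _parse_duration_example():
    assert parse_duration('1:02:03') == 3723
    assert parse_duration('90 min') == 5400.0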
1305
1306
1307 def prepend_extension(filename, ext):
1308 name, real_ext = os.path.splitext(filename)
1309 return '{0}.{1}{2}'.format(name, ext, real_ext)
1310
1311
1312 def check_executable(exe, args=[]):
1313 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1314 args can be a list of arguments for a short output (like -version) """
1315 try:
1316 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1317 except OSError:
1318 return False
1319 return exe
1320
1321
1322 def get_exe_version(exe, args=['--version'],
1323 version_re=None, unrecognized='present'):
1324 """ Returns the version of the specified executable,
1325 or False if the executable is not present """
1326 try:
1327 out, _ = subprocess.Popen(
1328 [exe] + args,
1329 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1330 except OSError:
1331 return False
1332 if isinstance(out, bytes): # Python 2.x
1333 out = out.decode('ascii', 'ignore')
1334 return detect_exe_version(out, version_re, unrecognized)
1335
1336
1337 def detect_exe_version(output, version_re=None, unrecognized='present'):
1338 assert isinstance(output, compat_str)
1339 if version_re is None:
1340 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1341 m = re.search(version_re, output)
1342 if m:
1343 return m.group(1)
1344 else:
1345 return unrecognized
1346
1347
1348 class PagedList(object):
1349 def __len__(self):
1350 # This is only useful for tests
1351 return len(self.getslice())
1352
1353
1354 class OnDemandPagedList(PagedList):
1355 def __init__(self, pagefunc, pagesize):
1356 self._pagefunc = pagefunc
1357 self._pagesize = pagesize
1358
1359 def getslice(self, start=0, end=None):
1360 res = []
1361 for pagenum in itertools.count(start // self._pagesize):
1362 firstid = pagenum * self._pagesize
1363 nextfirstid = pagenum * self._pagesize + self._pagesize
1364 if start >= nextfirstid:
1365 continue
1366
1367 page_results = list(self._pagefunc(pagenum))
1368
1369 startv = (
1370 start % self._pagesize
1371 if firstid <= start < nextfirstid
1372 else 0)
1373
1374 endv = (
1375 ((end - 1) % self._pagesize) + 1
1376 if (end is not None and firstid <= end <= nextfirstid)
1377 else None)
1378
1379 if startv != 0 or endv is not None:
1380 page_results = page_results[startv:endv]
1381 res.extend(page_results)
1382
1383 # A little optimization - if the current page is not "full", i.e. does
1384 # not contain page_size videos, then we can assume that this page
1385 # is the last one - there are no more ids on further pages,
1386 # i.e. no need to query again.
1387 if len(page_results) + startv < self._pagesize:
1388 break
1389
1390 # If we got the whole page, but the next page is not interesting,
1391 # break out early as well
1392 if end == nextfirstid:
1393 break
1394 return res
1395
1396
1397 class InAdvancePagedList(PagedList):
1398 def __init__(self, pagefunc, pagecount, pagesize):
1399 self._pagefunc = pagefunc
1400 self._pagecount = pagecount
1401 self._pagesize = pagesize
1402
1403 def getslice(self, start=0, end=None):
1404 res = []
1405 start_page = start // self._pagesize
1406 end_page = (
1407 self._pagecount if end is None else (end // self._pagesize + 1))
1408 skip_elems = start - start_page * self._pagesize
1409 only_more = None if end is None else end - start
1410 for pagenum in range(start_page, end_page):
1411 page = list(self._pagefunc(pagenum))
1412 if skip_elems:
1413 page = page[skip_elems:]
1414 skip_elems = None
1415 if only_more is not None:
1416 if len(page) < only_more:
1417 only_more -= len(page)
1418 else:
1419 page = page[:only_more]
1420 res.extend(page)
1421 break
1422 res.extend(page)
1423 return res
1424
1425
1426 def uppercase_escape(s):
1427 unicode_escape = codecs.getdecoder('unicode_escape')
1428 return re.sub(
1429 r'\\U[0-9a-fA-F]{8}',
1430 lambda m: unicode_escape(m.group(0))[0],
1431 s)
1432
1433
1434 def escape_rfc3986(s):
1435 """Escape non-ASCII characters as suggested by RFC 3986"""
1436 if sys.version_info < (3, 0) and isinstance(s, compat_str):
1437 s = s.encode('utf-8')
1438 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1439
1440
1441 def escape_url(url):
1442 """Escape URL as suggested by RFC 3986"""
1443 url_parsed = compat_urllib_parse_urlparse(url)
1444 return url_parsed._replace(
1445 path=escape_rfc3986(url_parsed.path),
1446 params=escape_rfc3986(url_parsed.params),
1447 query=escape_rfc3986(url_parsed.query),
1448 fragment=escape_rfc3986(url_parsed.fragment)
1449 ).geturl()
1450
1451 try:
1452 struct.pack('!I', 0)
1453 except TypeError:
1454 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1455 def struct_pack(spec, *args):
1456 if isinstance(spec, compat_str):
1457 spec = spec.encode('ascii')
1458 return struct.pack(spec, *args)
1459
1460 def struct_unpack(spec, *args):
1461 if isinstance(spec, compat_str):
1462 spec = spec.encode('ascii')
1463 return struct.unpack(spec, *args)
1464 else:
1465 struct_pack = struct.pack
1466 struct_unpack = struct.unpack
1467
1468
1469 def read_batch_urls(batch_fd):
1470 def fixup(url):
1471 if not isinstance(url, compat_str):
1472 url = url.decode('utf-8', 'replace')
1473 BOM_UTF8 = '\xef\xbb\xbf'
1474 if url.startswith(BOM_UTF8):
1475 url = url[len(BOM_UTF8):]
1476 url = url.strip()
1477 if url.startswith(('#', ';', ']')):
1478 return False
1479 return url
1480
1481 with contextlib.closing(batch_fd) as fd:
1482 return [url for url in map(fixup, fd) if url]
1483
1484
1485 def urlencode_postdata(*args, **kargs):
1486 return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
1487
1488
1489 try:
1490 etree_iter = xml.etree.ElementTree.Element.iter
1491 except AttributeError: # Python <=2.6
1492 etree_iter = lambda n: n.findall('.//*')
1493
1494
1495 def parse_xml(s):
1496 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1497 def doctype(self, name, pubid, system):
1498 pass # Ignore doctypes
1499
1500 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1501 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1502 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1503 # Fix up XML parser in Python 2.x
1504 if sys.version_info < (3, 0):
1505 for n in etree_iter(tree):
1506 if n.text is not None:
1507 if not isinstance(n.text, compat_str):
1508 n.text = n.text.decode('utf-8')
1509 return tree
1510
1511
1512 US_RATINGS = {
1513 'G': 0,
1514 'PG': 10,
1515 'PG-13': 13,
1516 'R': 16,
1517 'NC': 18,
1518 }
1519
1520
1521 def parse_age_limit(s):
1522 if s is None:
1523 return None
1524 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1525 return int(m.group('age')) if m else US_RATINGS.get(s, None)
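
# Illustrative usage sketch (editor's addition, never called): bare "<age>+" strings and
# US MPAA-style ratings both map to a numeric age limit.
def _parse_age_limit_example():
    assert parse_age_limit('18+') == 18
    assert parse_age_limit('PG-13') == 13
    assert parse_age_limit(None) is None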
1526
1527
1528 def strip_jsonp(code):
1529 return re.sub(
1530 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1531
1532
1533 def js_to_json(code):
1534 def fix_kv(m):
1535 v = m.group(0)
1536 if v in ('true', 'false', 'null'):
1537 return v
1538 if v.startswith('"'):
1539 return v
1540 if v.startswith("'"):
1541 v = v[1:-1]
1542 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1543 '\\\\': '\\\\',
1544 "\\'": "'",
1545 '"': '\\"',
1546 }[m.group(0)], v)
1547 return '"%s"' % v
1548
1549 res = re.sub(r'''(?x)
1550 "(?:[^"\\]*(?:\\\\|\\")?)*"|
1551 '(?:[^'\\]*(?:\\\\|\\')?)*'|
1552 [a-zA-Z_][.a-zA-Z_0-9]*
1553 ''', fix_kv, code)
1554 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1555 return res
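
# Illustrative usage sketch (editor's addition, never called): unquoted keys and
# single-quoted strings become valid JSON; trailing commas in arrays are dropped.
def _js_to_json_example():
    assert json.loads(js_to_json("{abc: 'def'}")) == {'abc': 'def'}
    assert json.loads(js_to_json('[1, 2,]')) == [1, 2]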
1556
1557
1558 def qualities(quality_ids):
1559 """ Get a numeric quality value out of a list of possible values """
1560 def q(qid):
1561 try:
1562 return quality_ids.index(qid)
1563 except ValueError:
1564 return -1
1565 return q
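
# Illustrative usage sketch (editor's addition, never called): higher list position means
# higher quality; unknown identifiers sort below everything else.
def _qualities_example():
    q = qualities(['small', 'medium', 'hd720'])
    assert q('hd720') == 2
    assert q('unknown') == -1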
1566
1567
1568 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1569
1570
1571 def limit_length(s, length):
1572 """ Add ellipses to overly long strings """
1573 if s is None:
1574 return None
1575 ELLIPSES = '...'
1576 if len(s) > length:
1577 return s[:length - len(ELLIPSES)] + ELLIPSES
1578 return s
1579
1580
1581 def version_tuple(v):
1582 return tuple(int(e) for e in re.split(r'[-.]', v))
1583
1584
1585 def is_outdated_version(version, limit, assume_new=True):
1586 if not version:
1587 return not assume_new
1588 try:
1589 return version_tuple(version) < version_tuple(limit)
1590 except ValueError:
1591 return not assume_new
1592
1593
1594 def ytdl_is_updateable():
1595 """ Returns if youtube-dl can be updated with -U """
1596 from zipimport import zipimporter
1597
1598 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
1599
1600
1601 def args_to_str(args):
1602 # Get a short string representation for a subprocess command
1603 return ' '.join(shlex_quote(a) for a in args)
1604
1605
1606 def urlhandle_detect_ext(url_handle):
1607 try:
1608 url_handle.headers
1609 getheader = lambda h: url_handle.headers[h]
1610 except AttributeError: # Python < 3
1611 getheader = url_handle.info().getheader
1612
1613 cd = getheader('Content-Disposition')
1614 if cd:
1615 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1616 if m:
1617 e = determine_ext(m.group('filename'), default_ext=None)
1618 if e:
1619 return e
1620
1621 return getheader('Content-Type').split("/")[1]
1622
1623
1624 def age_restricted(content_limit, age_limit):
1625 """ Returns True iff the content should be blocked """
1626
1627 if age_limit is None: # No limit set
1628 return False
1629 if content_limit is None:
1630 return False # Content available for everyone
1631 return age_limit < content_limit
1632
1633
1634 def is_html(first_bytes):
1635 """ Detect whether a file contains HTML by examining its first bytes. """
1636
1637 BOMS = [
1638 (b'\xef\xbb\xbf', 'utf-8'),
1639 (b'\x00\x00\xfe\xff', 'utf-32-be'),
1640 (b'\xff\xfe\x00\x00', 'utf-32-le'),
1641 (b'\xff\xfe', 'utf-16-le'),
1642 (b'\xfe\xff', 'utf-16-be'),
1643 ]
1644 for bom, enc in BOMS:
1645 if first_bytes.startswith(bom):
1646 s = first_bytes[len(bom):].decode(enc, 'replace')
1647 break
1648 else:
1649 s = first_bytes.decode('utf-8', 'replace')
1650
1651 return re.match(r'^\s*<', s)
1652
1653
1654 def determine_protocol(info_dict):
1655 protocol = info_dict.get('protocol')
1656 if protocol is not None:
1657 return protocol
1658
1659 url = info_dict['url']
1660 if url.startswith('rtmp'):
1661 return 'rtmp'
1662 elif url.startswith('mms'):
1663 return 'mms'
1664 elif url.startswith('rtsp'):
1665 return 'rtsp'
1666
1667 ext = determine_ext(url)
1668 if ext == 'm3u8':
1669 return 'm3u8'
1670 elif ext == 'f4m':
1671 return 'f4m'
1672
1673 return compat_urllib_parse_urlparse(url).scheme
1674
1675
1676 def render_table(header_row, data):
1677 """ Render a list of rows, each as a list of values """
1678 table = [header_row] + data
1679 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
1680 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
1681 return '\n'.join(format_str % tuple(row) for row in table)
1682
1683
1684 def _match_one(filter_part, dct):
1685 COMPARISON_OPERATORS = {
1686 '<': operator.lt,
1687 '<=': operator.le,
1688 '>': operator.gt,
1689 '>=': operator.ge,
1690 '=': operator.eq,
1691 '!=': operator.ne,
1692 }
1693 operator_rex = re.compile(r'''(?x)\s*
1694 (?P<key>[a-z_]+)
1695 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1696 (?:
1697 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1698 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1699 )
1700 \s*$
1701 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1702 m = operator_rex.search(filter_part)
1703 if m:
1704 op = COMPARISON_OPERATORS[m.group('op')]
1705 if m.group('strval') is not None:
1706 if m.group('op') not in ('=', '!='):
1707 raise ValueError(
1708 'Operator %s does not support string values!' % m.group('op'))
1709 comparison_value = m.group('strval')
1710 else:
1711 try:
1712 comparison_value = int(m.group('intval'))
1713 except ValueError:
1714 comparison_value = parse_filesize(m.group('intval'))
1715 if comparison_value is None:
1716 comparison_value = parse_filesize(m.group('intval') + 'B')
1717 if comparison_value is None:
1718 raise ValueError(
1719 'Invalid integer value %r in filter part %r' % (
1720 m.group('intval'), filter_part))
1721 actual_value = dct.get(m.group('key'))
1722 if actual_value is None:
1723 return m.group('none_inclusive')
1724 return op(actual_value, comparison_value)
1725
1726 UNARY_OPERATORS = {
1727 '': lambda v: v is not None,
1728 '!': lambda v: v is None,
1729 }
1730 operator_rex = re.compile(r'''(?x)\s*
1731 (?P<op>%s)\s*(?P<key>[a-z_]+)
1732 \s*$
1733 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1734 m = operator_rex.search(filter_part)
1735 if m:
1736 op = UNARY_OPERATORS[m.group('op')]
1737 actual_value = dct.get(m.group('key'))
1738 return op(actual_value)
1739
1740 raise ValueError('Invalid filter part %r' % filter_part)
1741
1742
1743 def match_str(filter_str, dct):
1744 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
1745
1746 return all(
1747 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
1748
1749
1750 def match_filter_func(filter_str):
1751 def _match_func(info_dict):
1752 if match_str(filter_str, info_dict):
1753 return None
1754 else:
1755 video_title = info_dict.get('title', info_dict.get('id', 'video'))
1756 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
1757 return _match_func
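
# Illustrative usage sketch (editor's addition, never called): the generic video filter
# syntax used by _match_one/match_str above; '<?' also passes when the field is missing.
def _match_filter_example():
    video = {'title': 'demo', 'like_count': 190, 'dislike_count': 10}
    assert match_str('like_count > 100 & dislike_count <? 50', video) is True
    assert match_str('dislike_count <? 50', {'title': 'demo'}) is True
    reject = match_filter_func('like_count > 100')
    assert reject(video) is None                       # None means the video passes
    assert 'does not pass filter' in reject({'title': 'demo', 'like_count': 5})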