1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import calendar
7 import codecs
8 import contextlib
9 import ctypes
10 import datetime
11 import email.utils
12 import errno
13 import functools
14 import gzip
15 import itertools
16 import io
17 import json
18 import locale
19 import math
20 import operator
21 import os
22 import pipes
23 import platform
24 import re
25 import ssl
26 import socket
27 import struct
28 import subprocess
29 import sys
30 import tempfile
31 import traceback
32 import xml.etree.ElementTree
33 import zlib
34
35 from .compat import (
36 compat_basestring,
37 compat_chr,
38 compat_html_entities,
39 compat_http_client,
40 compat_parse_qs,
41 compat_socket_create_connection,
42 compat_str,
43 compat_urllib_error,
44 compat_urllib_parse,
45 compat_urllib_parse_urlparse,
46 compat_urllib_request,
47 compat_urlparse,
48 shlex_quote,
49 )
50
51
52 # This is not clearly defined otherwise
53 compiled_regex_type = type(re.compile(''))
54
55 std_headers = {
56 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
57 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
58 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
59 'Accept-Encoding': 'gzip, deflate',
60 'Accept-Language': 'en-us,en;q=0.5',
61 }
62
63
64 ENGLISH_MONTH_NAMES = [
65 'January', 'February', 'March', 'April', 'May', 'June',
66 'July', 'August', 'September', 'October', 'November', 'December']
67
68
69 def preferredencoding():
70 """Get preferred encoding.
71
72 Returns the best encoding scheme for the system, based on
73 locale.getpreferredencoding() and some further tweaks.
74 """
75 try:
76 pref = locale.getpreferredencoding()
77 'TEST'.encode(pref)
78 except:
79 pref = 'UTF-8'
80
81 return pref
82
83
84 def write_json_file(obj, fn):
85 """ Encode obj as JSON and write it to fn, atomically if possible """
86
87 fn = encodeFilename(fn)
88 if sys.version_info < (3, 0) and sys.platform != 'win32':
89 encoding = get_filesystem_encoding()
90 # os.path.basename returns a bytes object, but NamedTemporaryFile
91 # will fail if the filename contains non ascii characters unless we
92 # use a unicode object
93 path_basename = lambda f: os.path.basename(f).decode(encoding)
94 # the same for os.path.dirname
95 path_dirname = lambda f: os.path.dirname(f).decode(encoding)
96 else:
97 path_basename = os.path.basename
98 path_dirname = os.path.dirname
99
100 args = {
101 'suffix': '.tmp',
102 'prefix': path_basename(fn) + '.',
103 'dir': path_dirname(fn),
104 'delete': False,
105 }
106
107 # In Python 2.x, json.dump expects a bytestream.
108 # In Python 3.x, it writes to a character stream
109 if sys.version_info < (3, 0):
110 args['mode'] = 'wb'
111 else:
112 args.update({
113 'mode': 'w',
114 'encoding': 'utf-8',
115 })
116
117 tf = tempfile.NamedTemporaryFile(**args)
118
119 try:
120 with tf:
121 json.dump(obj, tf)
122 if sys.platform == 'win32':
123 # Need to remove existing file on Windows, else os.rename raises
124 # WindowsError or FileExistsError.
125 try:
126 os.unlink(fn)
127 except OSError:
128 pass
129 os.rename(tf.name, fn)
130 except:
131 try:
132 os.remove(tf.name)
133 except OSError:
134 pass
135 raise
136
137
138 if sys.version_info >= (2, 7):
139 def find_xpath_attr(node, xpath, key, val):
140 """ Find the xpath xpath[@key=val] """
141 assert re.match(r'^[a-zA-Z-]+$', key)
142 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
143 expr = xpath + "[@%s='%s']" % (key, val)
144 return node.find(expr)
145 else:
146 def find_xpath_attr(node, xpath, key, val):
147 # Here comes the crazy part: In 2.6, if the xpath is a unicode,
148 # .//node does not match if a node is a direct child of . !
149 if isinstance(xpath, compat_str):
150 xpath = xpath.encode('ascii')
151
152 for f in node.findall(xpath):
153 if f.attrib.get(key) == val:
154 return f
155 return None
156
157 # On python2.6 the xml.etree.ElementTree.Element methods don't support
158 # the namespace parameter
159
160
161 def xpath_with_ns(path, ns_map):
162 components = [c.split(':') for c in path.split('/')]
163 replaced = []
164 for c in components:
165 if len(c) == 1:
166 replaced.append(c[0])
167 else:
168 ns, tag = c
169 replaced.append('{%s}%s' % (ns_map[ns], tag))
170 return '/'.join(replaced)
171
172
173 def xpath_text(node, xpath, name=None, fatal=False):
174 if sys.version_info < (2, 7): # Crazy 2.6
175 xpath = xpath.encode('ascii')
176
177 n = node.find(xpath)
178 if n is None or n.text is None:
179 if fatal:
180 name = xpath if name is None else name
181 raise ExtractorError('Could not find XML element %s' % name)
182 else:
183 return None
184 return n.text
185
186
187 def get_element_by_id(id, html):
188 """Return the content of the tag with the specified ID in the passed HTML document"""
189 return get_element_by_attribute("id", id, html)
190
191
192 def get_element_by_attribute(attribute, value, html):
193 """Return the content of the tag with the specified attribute in the passed HTML document"""
194
195 m = re.search(r'''(?xs)
196 <([a-zA-Z0-9:._-]+)
197 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
198 \s+%s=['"]?%s['"]?
199 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
200 \s*>
201 (?P<content>.*?)
202 </\1>
203 ''' % (re.escape(attribute), re.escape(value)), html)
204
205 if not m:
206 return None
207 res = m.group('content')
208
209 if res.startswith('"') or res.startswith("'"):
210 res = res[1:-1]
211
212 return unescapeHTML(res)
213
214
215 def clean_html(html):
216 """Clean an HTML snippet into a readable string"""
217
218 if html is None: # Convenience for sanitizing descriptions etc.
219 return html
220
221 # Newline vs <br />
222 html = html.replace('\n', ' ')
223 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
224 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
225 # Strip html tags
226 html = re.sub('<.*?>', '', html)
227 # Replace html entities
228 html = unescapeHTML(html)
229 return html.strip()
230
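# Illustrative sketch (hypothetical snippet, not from the original module) of
# what clean_html should yield: <br /> becomes a newline, tags are stripped and
# entities are unescaped:
#   clean_html('<p>foo<br/>bar &amp; baz</p>')  ->  'foo\nbar & baz'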
231
232 def sanitize_open(filename, open_mode):
233 """Try to open the given filename, and slightly tweak it if this fails.
234
235 Attempts to open the given filename. If this fails, it tries to change
236 the filename slightly, step by step, until it's either able to open it
237 or it fails and raises a final exception, like the standard open()
238 function.
239
240 It returns the tuple (stream, definitive_file_name).
241 """
242 try:
243 if filename == '-':
244 if sys.platform == 'win32':
245 import msvcrt
246 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
247 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
248 stream = open(encodeFilename(filename), open_mode)
249 return (stream, filename)
250 except (IOError, OSError) as err:
251 if err.errno in (errno.EACCES,):
252 raise
253
254 # In case of error, try to remove win32 forbidden chars
255 alt_filename = sanitize_path(filename)
256 if alt_filename == filename:
257 raise
258 else:
259 # An exception here should be caught in the caller
260 stream = open(encodeFilename(alt_filename), open_mode)
261 return (stream, alt_filename)
262
263
264 def timeconvert(timestr):
265 """Convert RFC 2822 defined time string into system timestamp"""
266 timestamp = None
267 timetuple = email.utils.parsedate_tz(timestr)
268 if timetuple is not None:
269 timestamp = email.utils.mktime_tz(timetuple)
270 return timestamp
271
272
273 def sanitize_filename(s, restricted=False, is_id=False):
274 """Sanitizes a string so it could be used as part of a filename.
275 If restricted is set, use a stricter subset of allowed characters.
276 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
277 """
278 def replace_insane(char):
279 if char == '?' or ord(char) < 32 or ord(char) == 127:
280 return ''
281 elif char == '"':
282 return '' if restricted else '\''
283 elif char == ':':
284 return '_-' if restricted else ' -'
285 elif char in '\\/|*<>':
286 return '_'
287 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
288 return '_'
289 if restricted and ord(char) > 127:
290 return '_'
291 return char
292
293 # Handle timestamps
294 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
295 result = ''.join(map(replace_insane, s))
296 if not is_id:
297 while '__' in result:
298 result = result.replace('__', '_')
299 result = result.strip('_')
300 # Common case of "Foreign band name - English song title"
301 if restricted and result.startswith('-_'):
302 result = result[2:]
303 if result.startswith('-'):
304 result = '_' + result[len('-'):]
305 result = result.lstrip('.')
306 if not result:
307 result = '_'
308 return result
309
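# Illustrative sketch (hypothetical title, not from the original module) of what
# sanitize_filename should yield: ':' becomes '_-', spaces and '/' become '_',
# and '?' is dropped:
#   sanitize_filename('New: Video/Title?', restricted=True)  ->  'New_-_Video_Title'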
310
311 def sanitize_path(s):
312 """Sanitizes and normalizes path on Windows"""
313 if sys.platform != 'win32':
314 return s
315 drive, _ = os.path.splitdrive(s)
316 unc, _ = os.path.splitunc(s)
317 unc_or_drive = unc or drive
318 norm_path = os.path.normpath(remove_start(s, unc_or_drive)).split(os.path.sep)
319 if unc_or_drive:
320 norm_path.pop(0)
321 sanitized_path = [
322 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
323 for path_part in norm_path]
324 if unc_or_drive:
325 sanitized_path.insert(0, unc_or_drive + os.path.sep)
326 return os.path.join(*sanitized_path)
327
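# Illustrative sketch (hypothetical path; Windows only, other platforms return
# the input unchanged): forbidden characters should be replaced with '#', while
# '.' and '..' path components are kept as-is:
#   sanitize_path(r'..\foo?bar')  ->  r'..\foo#bar'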
328
329 def orderedSet(iterable):
330 """ Remove all duplicates from the input iterable """
331 res = []
332 for el in iterable:
333 if el not in res:
334 res.append(el)
335 return res
336
337
338 def _htmlentity_transform(entity):
339 """Transforms an HTML entity to a character."""
340 # Known non-numeric HTML entity
341 if entity in compat_html_entities.name2codepoint:
342 return compat_chr(compat_html_entities.name2codepoint[entity])
343
344 mobj = re.match(r'#(x?[0-9]+)', entity)
345 if mobj is not None:
346 numstr = mobj.group(1)
347 if numstr.startswith('x'):
348 base = 16
349 numstr = '0%s' % numstr
350 else:
351 base = 10
352 return compat_chr(int(numstr, base))
353
354 # Unknown entity in name, return its literal representation
355 return ('&%s;' % entity)
356
357
358 def unescapeHTML(s):
359 if s is None:
360 return None
361 assert type(s) == compat_str
362
363 return re.sub(
364 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
365
366
367 def encodeFilename(s, for_subprocess=False):
368 """
369 @param s The name of the file
370 """
371
372 assert type(s) == compat_str
373
374 # Python 3 has a Unicode API
375 if sys.version_info >= (3, 0):
376 return s
377
378 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
379 # Pass the Unicode string directly to use Unicode APIs on Windows 2000 and up
380 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
381 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
382 if not for_subprocess:
383 return s
384 else:
385 # For subprocess calls, encode with locale encoding
386 # Refer to http://stackoverflow.com/a/9951851/35070
387 encoding = preferredencoding()
388 else:
389 encoding = sys.getfilesystemencoding()
390 if encoding is None:
391 encoding = 'utf-8'
392 return s.encode(encoding, 'ignore')
393
394
395 def encodeArgument(s):
396 if not isinstance(s, compat_str):
397 # Legacy code that uses byte strings
398 # Uncomment the following line after fixing all post processors
399 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
400 s = s.decode('ascii')
401 return encodeFilename(s, True)
402
403
404 def decodeOption(optval):
405 if optval is None:
406 return optval
407 if isinstance(optval, bytes):
408 optval = optval.decode(preferredencoding())
409
410 assert isinstance(optval, compat_str)
411 return optval
412
413
414 def formatSeconds(secs):
415 if secs > 3600:
416 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
417 elif secs > 60:
418 return '%d:%02d' % (secs // 60, secs % 60)
419 else:
420 return '%d' % secs
421
422
423 def make_HTTPS_handler(params, **kwargs):
424 opts_no_check_certificate = params.get('nocheckcertificate', False)
425 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
426 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
427 if opts_no_check_certificate:
428 context.check_hostname = False
429 context.verify_mode = ssl.CERT_NONE
430 try:
431 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
432 except TypeError:
433 # Python 2.7.8
434 # (create_default_context present but HTTPSHandler has no context=)
435 pass
436
437 if sys.version_info < (3, 2):
438 return YoutubeDLHTTPSHandler(params, **kwargs)
439 else: # Python < 3.4
440 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
441 context.verify_mode = (ssl.CERT_NONE
442 if opts_no_check_certificate
443 else ssl.CERT_REQUIRED)
444 context.set_default_verify_paths()
445 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
446
447
448 class ExtractorError(Exception):
449 """Error during info extraction."""
450
451 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
452 """ tb, if given, is the original traceback (so that it can be printed out).
453 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
454 """
455
456 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
457 expected = True
458 if video_id is not None:
459 msg = video_id + ': ' + msg
460 if cause:
461 msg += ' (caused by %r)' % cause
462 if not expected:
463 if ytdl_is_updateable():
464 update_cmd = 'type youtube-dl -U to update'
465 else:
466 update_cmd = 'see https://yt-dl.org/update on how to update'
467 msg += '; please report this issue on https://yt-dl.org/bug .'
468 msg += ' Make sure you are using the latest version; %s.' % update_cmd
469 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
470 super(ExtractorError, self).__init__(msg)
471
472 self.traceback = tb
473 self.exc_info = sys.exc_info() # preserve original exception
474 self.cause = cause
475 self.video_id = video_id
476
477 def format_traceback(self):
478 if self.traceback is None:
479 return None
480 return ''.join(traceback.format_tb(self.traceback))
481
482
483 class UnsupportedError(ExtractorError):
484 def __init__(self, url):
485 super(UnsupportedError, self).__init__(
486 'Unsupported URL: %s' % url, expected=True)
487 self.url = url
488
489
490 class RegexNotFoundError(ExtractorError):
491 """Error when a regex didn't match"""
492 pass
493
494
495 class DownloadError(Exception):
496 """Download Error exception.
497
498 This exception may be thrown by FileDownloader objects if they are not
499 configured to continue on errors. They will contain the appropriate
500 error message.
501 """
502
503 def __init__(self, msg, exc_info=None):
504 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
505 super(DownloadError, self).__init__(msg)
506 self.exc_info = exc_info
507
508
509 class SameFileError(Exception):
510 """Same File exception.
511
512 This exception will be thrown by FileDownloader objects if they detect
513 multiple files would have to be downloaded to the same file on disk.
514 """
515 pass
516
517
518 class PostProcessingError(Exception):
519 """Post Processing exception.
520
521 This exception may be raised by PostProcessor's .run() method to
522 indicate an error in the postprocessing task.
523 """
524
525 def __init__(self, msg):
526 self.msg = msg
527
528
529 class MaxDownloadsReached(Exception):
530 """ --max-downloads limit has been reached. """
531 pass
532
533
534 class UnavailableVideoError(Exception):
535 """Unavailable Format exception.
536
537 This exception will be thrown when a video is requested
538 in a format that is not available for that video.
539 """
540 pass
541
542
543 class ContentTooShortError(Exception):
544 """Content Too Short exception.
545
546 This exception may be raised by FileDownloader objects when a file they
547 download is too small for what the server announced first, indicating
548 the connection was probably interrupted.
549 """
550 # Both in bytes
551 downloaded = None
552 expected = None
553
554 def __init__(self, downloaded, expected):
555 self.downloaded = downloaded
556 self.expected = expected
557
558
559 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
560 hc = http_class(*args, **kwargs)
561 source_address = ydl_handler._params.get('source_address')
562 if source_address is not None:
563 sa = (source_address, 0)
564 if hasattr(hc, 'source_address'): # Python 2.7+
565 hc.source_address = sa
566 else: # Python 2.6
567 def _hc_connect(self, *args, **kwargs):
568 sock = compat_socket_create_connection(
569 (self.host, self.port), self.timeout, sa)
570 if is_https:
571 self.sock = ssl.wrap_socket(
572 sock, self.key_file, self.cert_file,
573 ssl_version=ssl.PROTOCOL_TLSv1)
574 else:
575 self.sock = sock
576 hc.connect = functools.partial(_hc_connect, hc)
577
578 return hc
579
580
581 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
582 """Handler for HTTP requests and responses.
583
584 This class, when installed with an OpenerDirector, automatically adds
585 the standard headers to every HTTP request and handles gzipped and
586 deflated responses from web servers. If compression is to be avoided in
587 a particular request, the original request in the program code only has
588 to include the HTTP header "Youtubedl-No-Compression", which will be
589 removed before making the real request.
590
591 Part of this code was copied from:
592
593 http://techknack.net/python-urllib2-handlers/
594
595 Andrew Rowls, the author of that code, agreed to release it to the
596 public domain.
597 """
598
599 def __init__(self, params, *args, **kwargs):
600 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
601 self._params = params
602
603 def http_open(self, req):
604 return self.do_open(functools.partial(
605 _create_http_connection, self, compat_http_client.HTTPConnection, False),
606 req)
607
608 @staticmethod
609 def deflate(data):
610 try:
611 return zlib.decompress(data, -zlib.MAX_WBITS)
612 except zlib.error:
613 return zlib.decompress(data)
614
615 @staticmethod
616 def addinfourl_wrapper(stream, headers, url, code):
617 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
618 return compat_urllib_request.addinfourl(stream, headers, url, code)
619 ret = compat_urllib_request.addinfourl(stream, headers, url)
620 ret.code = code
621 return ret
622
623 def http_request(self, req):
624 for h, v in std_headers.items():
625 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
626 # urllib capitalizes the dict keys because of that bug
627 if h.capitalize() not in req.headers:
628 req.add_header(h, v)
629 if 'Youtubedl-no-compression' in req.headers:
630 if 'Accept-encoding' in req.headers:
631 del req.headers['Accept-encoding']
632 del req.headers['Youtubedl-no-compression']
633
634 if sys.version_info < (2, 7) and '#' in req.get_full_url():
635 # Python 2.6 is brain-dead when it comes to fragments
636 req._Request__original = req._Request__original.partition('#')[0]
637 req._Request__r_type = req._Request__r_type.partition('#')[0]
638
639 return req
640
641 def http_response(self, req, resp):
642 old_resp = resp
643 # gzip
644 if resp.headers.get('Content-encoding', '') == 'gzip':
645 content = resp.read()
646 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
647 try:
648 uncompressed = io.BytesIO(gz.read())
649 except IOError as original_ioerror:
650 # There may be junk at the end of the file
651 # See http://stackoverflow.com/q/4928560/35070 for details
652 for i in range(1, 1024):
653 try:
654 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
655 uncompressed = io.BytesIO(gz.read())
656 except IOError:
657 continue
658 break
659 else:
660 raise original_ioerror
661 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
662 resp.msg = old_resp.msg
663 # deflate
664 if resp.headers.get('Content-encoding', '') == 'deflate':
665 gz = io.BytesIO(self.deflate(resp.read()))
666 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
667 resp.msg = old_resp.msg
668 return resp
669
670 https_request = http_request
671 https_response = http_response
672
673
674 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
675 def __init__(self, params, https_conn_class=None, *args, **kwargs):
676 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
677 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
678 self._params = params
679
680 def https_open(self, req):
681 kwargs = {}
682 if hasattr(self, '_context'): # python > 2.6
683 kwargs['context'] = self._context
684 if hasattr(self, '_check_hostname'): # python 3.x
685 kwargs['check_hostname'] = self._check_hostname
686 return self.do_open(functools.partial(
687 _create_http_connection, self, self._https_conn_class, True),
688 req, **kwargs)
689
690
691 def parse_iso8601(date_str, delimiter='T', timezone=None):
692 """ Return a UNIX timestamp from the given date """
693
694 if date_str is None:
695 return None
696
697 if timezone is None:
698 m = re.search(
699 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
700 date_str)
701 if not m:
702 timezone = datetime.timedelta()
703 else:
704 date_str = date_str[:-len(m.group(0))]
705 if not m.group('sign'):
706 timezone = datetime.timedelta()
707 else:
708 sign = 1 if m.group('sign') == '+' else -1
709 timezone = datetime.timedelta(
710 hours=sign * int(m.group('hours')),
711 minutes=sign * int(m.group('minutes')))
712 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
713 dt = datetime.datetime.strptime(date_str, date_format) - timezone
714 return calendar.timegm(dt.timetuple())
715
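# Illustrative sketch (hypothetical timestamps, not from the original module):
# the UTC offset is applied before converting, so both of these should return 0:
#   parse_iso8601('1970-01-01T00:00:00Z')       ->  0
#   parse_iso8601('1970-01-01T01:00:00+01:00')  ->  0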
716
717 def unified_strdate(date_str, day_first=True):
718 """Return a string with the date in the format YYYYMMDD"""
719
720 if date_str is None:
721 return None
722 upload_date = None
723 # Replace commas
724 date_str = date_str.replace(',', ' ')
725 # %z (UTC offset) is only supported in python>=3.2
726 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
727 # Remove AM/PM + timezone
728 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
729
730 format_expressions = [
731 '%d %B %Y',
732 '%d %b %Y',
733 '%B %d %Y',
734 '%b %d %Y',
735 '%b %dst %Y %I:%M%p',
736 '%b %dnd %Y %I:%M%p',
737 '%b %dth %Y %I:%M%p',
738 '%Y %m %d',
739 '%Y-%m-%d',
740 '%Y/%m/%d',
741 '%Y/%m/%d %H:%M:%S',
742 '%Y-%m-%d %H:%M:%S',
743 '%Y-%m-%d %H:%M:%S.%f',
744 '%d.%m.%Y %H:%M',
745 '%d.%m.%Y %H.%M',
746 '%Y-%m-%dT%H:%M:%SZ',
747 '%Y-%m-%dT%H:%M:%S.%fZ',
748 '%Y-%m-%dT%H:%M:%S.%f0Z',
749 '%Y-%m-%dT%H:%M:%S',
750 '%Y-%m-%dT%H:%M:%S.%f',
751 '%Y-%m-%dT%H:%M',
752 ]
753 if day_first:
754 format_expressions.extend([
755 '%d.%m.%Y',
756 '%d/%m/%Y',
757 '%d/%m/%y',
758 '%d/%m/%Y %H:%M:%S',
759 ])
760 else:
761 format_expressions.extend([
762 '%m.%d.%Y',
763 '%m/%d/%Y',
764 '%m/%d/%y',
765 '%m/%d/%Y %H:%M:%S',
766 ])
767 for expression in format_expressions:
768 try:
769 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
770 except ValueError:
771 pass
772 if upload_date is None:
773 timetuple = email.utils.parsedate_tz(date_str)
774 if timetuple:
775 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
776 return upload_date
777
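# Illustrative sketch (hypothetical dates, not from the original module):
# ambiguous day/month order should follow the day_first flag:
#   unified_strdate('December 21, 2010')          ->  '20101221'
#   unified_strdate('8/7/2009')                   ->  '20090708'  (day first)
#   unified_strdate('8/7/2009', day_first=False)  ->  '20090807'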
778
779 def determine_ext(url, default_ext='unknown_video'):
780 if url is None:
781 return default_ext
782 guess = url.partition('?')[0].rpartition('.')[2]
783 if re.match(r'^[A-Za-z0-9]+$', guess):
784 return guess
785 else:
786 return default_ext
787
788
789 def subtitles_filename(filename, sub_lang, sub_format):
790 return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
791
792
793 def date_from_str(date_str):
794 """
795 Return a datetime object from a string in the format YYYYMMDD or
796 (now|today)[+-][0-9](day|week|month|year)(s)?"""
797 today = datetime.date.today()
798 if date_str in ('now', 'today'):
799 return today
800 if date_str == 'yesterday':
801 return today - datetime.timedelta(days=1)
802 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
803 if match is not None:
804 sign = match.group('sign')
805 time = int(match.group('time'))
806 if sign == '-':
807 time = -time
808 unit = match.group('unit')
809 # A bad approximation?
810 if unit == 'month':
811 unit = 'day'
812 time *= 30
813 elif unit == 'year':
814 unit = 'day'
815 time *= 365
816 unit += 's'
817 delta = datetime.timedelta(**{unit: time})
818 return today + delta
819 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
820
821
822 def hyphenate_date(date_str):
823 """
824 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
825 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
826 if match is not None:
827 return '-'.join(match.groups())
828 else:
829 return date_str
830
831
832 class DateRange(object):
833 """Represents a time interval between two dates"""
834
835 def __init__(self, start=None, end=None):
836 """start and end must be strings in the format accepted by date"""
837 if start is not None:
838 self.start = date_from_str(start)
839 else:
840 self.start = datetime.datetime.min.date()
841 if end is not None:
842 self.end = date_from_str(end)
843 else:
844 self.end = datetime.datetime.max.date()
845 if self.start > self.end:
846 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
847
848 @classmethod
849 def day(cls, day):
850 """Returns a range that only contains the given day"""
851 return cls(day, day)
852
853 def __contains__(self, date):
854 """Check if the date is in the range"""
855 if not isinstance(date, datetime.date):
856 date = date_from_str(date)
857 return self.start <= date <= self.end
858
859 def __str__(self):
860 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
861
862
863 def platform_name():
864 """ Returns the platform name as a compat_str """
865 res = platform.platform()
866 if isinstance(res, bytes):
867 res = res.decode(preferredencoding())
868
869 assert isinstance(res, compat_str)
870 return res
871
872
873 def _windows_write_string(s, out):
874 """ Returns True if the string was written using special methods,
875 False if it has yet to be written out."""
876 # Adapted from http://stackoverflow.com/a/3259271/35070
877
878 import ctypes
879 import ctypes.wintypes
880
881 WIN_OUTPUT_IDS = {
882 1: -11,
883 2: -12,
884 }
885
886 try:
887 fileno = out.fileno()
888 except AttributeError:
889 # If the output stream doesn't have a fileno, it's virtual
890 return False
891 except io.UnsupportedOperation:
892 # Some strange Windows pseudo files?
893 return False
894 if fileno not in WIN_OUTPUT_IDS:
895 return False
896
897 GetStdHandle = ctypes.WINFUNCTYPE(
898 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
899 (b"GetStdHandle", ctypes.windll.kernel32))
900 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
901
902 WriteConsoleW = ctypes.WINFUNCTYPE(
903 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
904 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
905 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
906 written = ctypes.wintypes.DWORD(0)
907
908 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
909 FILE_TYPE_CHAR = 0x0002
910 FILE_TYPE_REMOTE = 0x8000
911 GetConsoleMode = ctypes.WINFUNCTYPE(
912 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
913 ctypes.POINTER(ctypes.wintypes.DWORD))(
914 (b"GetConsoleMode", ctypes.windll.kernel32))
915 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
916
917 def not_a_console(handle):
918 if handle == INVALID_HANDLE_VALUE or handle is None:
919 return True
920 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
921 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
922
923 if not_a_console(h):
924 return False
925
926 def next_nonbmp_pos(s):
927 try:
928 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
929 except StopIteration:
930 return len(s)
931
932 while s:
933 count = min(next_nonbmp_pos(s), 1024)
934
935 ret = WriteConsoleW(
936 h, s, count if count else 2, ctypes.byref(written), None)
937 if ret == 0:
938 raise OSError('Failed to write string')
939 if not count: # We just wrote a non-BMP character
940 assert written.value == 2
941 s = s[1:]
942 else:
943 assert written.value > 0
944 s = s[written.value:]
945 return True
946
947
948 def write_string(s, out=None, encoding=None):
949 if out is None:
950 out = sys.stderr
951 assert type(s) == compat_str
952
953 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
954 if _windows_write_string(s, out):
955 return
956
957 if ('b' in getattr(out, 'mode', '') or
958 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
959 byt = s.encode(encoding or preferredencoding(), 'ignore')
960 out.write(byt)
961 elif hasattr(out, 'buffer'):
962 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
963 byt = s.encode(enc, 'ignore')
964 out.buffer.write(byt)
965 else:
966 out.write(s)
967 out.flush()
968
969
970 def bytes_to_intlist(bs):
971 if not bs:
972 return []
973 if isinstance(bs[0], int): # Python 3
974 return list(bs)
975 else:
976 return [ord(c) for c in bs]
977
978
979 def intlist_to_bytes(xs):
980 if not xs:
981 return b''
982 return struct_pack('%dB' % len(xs), *xs)
983
984
985 # Cross-platform file locking
986 if sys.platform == 'win32':
987 import ctypes.wintypes
988 import msvcrt
989
990 class OVERLAPPED(ctypes.Structure):
991 _fields_ = [
992 ('Internal', ctypes.wintypes.LPVOID),
993 ('InternalHigh', ctypes.wintypes.LPVOID),
994 ('Offset', ctypes.wintypes.DWORD),
995 ('OffsetHigh', ctypes.wintypes.DWORD),
996 ('hEvent', ctypes.wintypes.HANDLE),
997 ]
998
999 kernel32 = ctypes.windll.kernel32
1000 LockFileEx = kernel32.LockFileEx
1001 LockFileEx.argtypes = [
1002 ctypes.wintypes.HANDLE, # hFile
1003 ctypes.wintypes.DWORD, # dwFlags
1004 ctypes.wintypes.DWORD, # dwReserved
1005 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1006 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1007 ctypes.POINTER(OVERLAPPED) # Overlapped
1008 ]
1009 LockFileEx.restype = ctypes.wintypes.BOOL
1010 UnlockFileEx = kernel32.UnlockFileEx
1011 UnlockFileEx.argtypes = [
1012 ctypes.wintypes.HANDLE, # hFile
1013 ctypes.wintypes.DWORD, # dwReserved
1014 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1015 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1016 ctypes.POINTER(OVERLAPPED) # Overlapped
1017 ]
1018 UnlockFileEx.restype = ctypes.wintypes.BOOL
1019 whole_low = 0xffffffff
1020 whole_high = 0x7fffffff
1021
1022 def _lock_file(f, exclusive):
1023 overlapped = OVERLAPPED()
1024 overlapped.Offset = 0
1025 overlapped.OffsetHigh = 0
1026 overlapped.hEvent = 0
1027 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1028 handle = msvcrt.get_osfhandle(f.fileno())
1029 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1030 whole_low, whole_high, f._lock_file_overlapped_p):
1031 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1032
1033 def _unlock_file(f):
1034 assert f._lock_file_overlapped_p
1035 handle = msvcrt.get_osfhandle(f.fileno())
1036 if not UnlockFileEx(handle, 0,
1037 whole_low, whole_high, f._lock_file_overlapped_p):
1038 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1039
1040 else:
1041 import fcntl
1042
1043 def _lock_file(f, exclusive):
1044 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1045
1046 def _unlock_file(f):
1047 fcntl.flock(f, fcntl.LOCK_UN)
1048
1049
1050 class locked_file(object):
1051 def __init__(self, filename, mode, encoding=None):
1052 assert mode in ['r', 'a', 'w']
1053 self.f = io.open(filename, mode, encoding=encoding)
1054 self.mode = mode
1055
1056 def __enter__(self):
1057 exclusive = self.mode != 'r'
1058 try:
1059 _lock_file(self.f, exclusive)
1060 except IOError:
1061 self.f.close()
1062 raise
1063 return self
1064
1065 def __exit__(self, etype, value, traceback):
1066 try:
1067 _unlock_file(self.f)
1068 finally:
1069 self.f.close()
1070
1071 def __iter__(self):
1072 return iter(self.f)
1073
1074 def write(self, *args):
1075 return self.f.write(*args)
1076
1077 def read(self, *args):
1078 return self.f.read(*args)
1079
1080
1081 def get_filesystem_encoding():
1082 encoding = sys.getfilesystemencoding()
1083 return encoding if encoding is not None else 'utf-8'
1084
1085
1086 def shell_quote(args):
1087 quoted_args = []
1088 encoding = get_filesystem_encoding()
1089 for a in args:
1090 if isinstance(a, bytes):
1091 # We may get a filename encoded with 'encodeFilename'
1092 a = a.decode(encoding)
1093 quoted_args.append(pipes.quote(a))
1094 return ' '.join(quoted_args)
1095
1096
1097 def takewhile_inclusive(pred, seq):
1098 """ Like itertools.takewhile, but include the latest evaluated element
1099 (the first element so that Not pred(e)) """
1100 for e in seq:
1101 yield e
1102 if not pred(e):
1103 return
1104
1105
1106 def smuggle_url(url, data):
1107 """ Pass additional data in a URL for internal use. """
1108
1109 sdata = compat_urllib_parse.urlencode(
1110 {'__youtubedl_smuggle': json.dumps(data)})
1111 return url + '#' + sdata
1112
1113
1114 def unsmuggle_url(smug_url, default=None):
1115 if '#__youtubedl_smuggle' not in smug_url:
1116 return smug_url, default
1117 url, _, sdata = smug_url.rpartition('#')
1118 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1119 data = json.loads(jsond)
1120 return url, data
1121
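# Illustrative sketch (hypothetical URL and data, not from the original module):
# data smuggled into the fragment should survive a round trip:
#   url = smuggle_url('http://example.com/video', {'force_rtmp': True})
#   unsmuggle_url(url)  ->  ('http://example.com/video', {'force_rtmp': True})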
1122
1123 def format_bytes(bytes):
1124 if bytes is None:
1125 return 'N/A'
1126 if type(bytes) is str:
1127 bytes = float(bytes)
1128 if bytes == 0.0:
1129 exponent = 0
1130 else:
1131 exponent = int(math.log(bytes, 1024.0))
1132 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1133 converted = float(bytes) / float(1024 ** exponent)
1134 return '%.2f%s' % (converted, suffix)
1135
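# Illustrative sketch: format_bytes uses binary (1024-based) suffixes, e.g.
#   format_bytes(1024)  ->  '1.00KiB'
#   format_bytes(1536)  ->  '1.50KiB'
#   format_bytes(None)  ->  'N/A'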
1136
1137 def parse_filesize(s):
1138 if s is None:
1139 return None
1140
1141 # The lower-case forms are of course incorrect and unofficial,
1142 # but we support those too
1143 _UNIT_TABLE = {
1144 'B': 1,
1145 'b': 1,
1146 'KiB': 1024,
1147 'KB': 1000,
1148 'kB': 1024,
1149 'Kb': 1000,
1150 'MiB': 1024 ** 2,
1151 'MB': 1000 ** 2,
1152 'mB': 1024 ** 2,
1153 'Mb': 1000 ** 2,
1154 'GiB': 1024 ** 3,
1155 'GB': 1000 ** 3,
1156 'gB': 1024 ** 3,
1157 'Gb': 1000 ** 3,
1158 'TiB': 1024 ** 4,
1159 'TB': 1000 ** 4,
1160 'tB': 1024 ** 4,
1161 'Tb': 1000 ** 4,
1162 'PiB': 1024 ** 5,
1163 'PB': 1000 ** 5,
1164 'pB': 1024 ** 5,
1165 'Pb': 1000 ** 5,
1166 'EiB': 1024 ** 6,
1167 'EB': 1000 ** 6,
1168 'eB': 1024 ** 6,
1169 'Eb': 1000 ** 6,
1170 'ZiB': 1024 ** 7,
1171 'ZB': 1000 ** 7,
1172 'zB': 1024 ** 7,
1173 'Zb': 1000 ** 7,
1174 'YiB': 1024 ** 8,
1175 'YB': 1000 ** 8,
1176 'yB': 1024 ** 8,
1177 'Yb': 1000 ** 8,
1178 }
1179
1180 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1181 m = re.match(
1182 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1183 if not m:
1184 return None
1185
1186 num_str = m.group('num').replace(',', '.')
1187 mult = _UNIT_TABLE[m.group('unit')]
1188 return int(float(num_str) * mult)
1189
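# Illustrative sketch (hypothetical strings, not from the original module):
# binary vs. decimal suffixes, with ',' accepted as a decimal separator:
#   parse_filesize('5 MiB')   ->  5242880
#   parse_filesize('1,5 GB')  ->  1500000000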
1190
1191 def month_by_name(name):
1192 """ Return the number of a month by (locale-independently) English name """
1193
1194 try:
1195 return ENGLISH_MONTH_NAMES.index(name) + 1
1196 except ValueError:
1197 return None
1198
1199
1200 def month_by_abbreviation(abbrev):
1201 """ Return the number of a month by (locale-independently) English
1202 abbreviations """
1203
1204 try:
1205 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1206 except ValueError:
1207 return None
1208
1209
1210 def fix_xml_ampersands(xml_str):
1211 """Replace all the '&' by '&amp;' in XML"""
1212 return re.sub(
1213 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1214 '&amp;',
1215 xml_str)
1216
1217
1218 def setproctitle(title):
1219 assert isinstance(title, compat_str)
1220 try:
1221 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1222 except OSError:
1223 return
1224 title_bytes = title.encode('utf-8')
1225 buf = ctypes.create_string_buffer(len(title_bytes))
1226 buf.value = title_bytes
1227 try:
1228 libc.prctl(15, buf, 0, 0, 0)
1229 except AttributeError:
1230 return # Strange libc, just skip this
1231
1232
1233 def remove_start(s, start):
1234 if s.startswith(start):
1235 return s[len(start):]
1236 return s
1237
1238
1239 def remove_end(s, end):
1240 if s.endswith(end):
1241 return s[:-len(end)]
1242 return s
1243
1244
1245 def url_basename(url):
1246 path = compat_urlparse.urlparse(url).path
1247 return path.strip('/').split('/')[-1]
1248
1249
1250 class HEADRequest(compat_urllib_request.Request):
1251 def get_method(self):
1252 return "HEAD"
1253
1254
1255 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1256 if get_attr:
1257 if v is not None:
1258 v = getattr(v, get_attr, None)
1259 if v == '':
1260 v = None
1261 return default if v is None else (int(v) * invscale // scale)
1262
1263
1264 def str_or_none(v, default=None):
1265 return default if v is None else compat_str(v)
1266
1267
1268 def str_to_int(int_str):
1269 """ A more relaxed version of int_or_none """
1270 if int_str is None:
1271 return None
1272 int_str = re.sub(r'[,\.\+]', '', int_str)
1273 return int(int_str)
1274
1275
1276 def float_or_none(v, scale=1, invscale=1, default=None):
1277 return default if v is None else (float(v) * invscale / scale)
1278
1279
1280 def parse_duration(s):
1281 if not isinstance(s, compat_basestring):
1282 return None
1283
1284 s = s.strip()
1285
1286 m = re.match(
1287 r'''(?ix)(?:P?T)?
1288 (?:
1289 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1290 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1291
1292 \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
1293 (?:
1294 (?:
1295 (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1296 (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1297 )?
1298 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1299 )?
1300 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1301 )$''', s)
1302 if not m:
1303 return None
1304 res = 0
1305 if m.group('only_mins'):
1306 return float_or_none(m.group('only_mins'), invscale=60)
1307 if m.group('only_hours'):
1308 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1309 if m.group('secs'):
1310 res += int(m.group('secs'))
1311 if m.group('mins_reversed'):
1312 res += int(m.group('mins_reversed')) * 60
1313 if m.group('mins'):
1314 res += int(m.group('mins')) * 60
1315 if m.group('hours'):
1316 res += int(m.group('hours')) * 60 * 60
1317 if m.group('hours_reversed'):
1318 res += int(m.group('hours_reversed')) * 60 * 60
1319 if m.group('days'):
1320 res += int(m.group('days')) * 24 * 60 * 60
1321 if m.group('ms'):
1322 res += float(m.group('ms'))
1323 return res
1324
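# Illustrative sketch (hypothetical strings, not from the original module) of
# formats parse_duration understands; the result is in seconds:
#   parse_duration('1:23:45')      ->  5025
#   parse_duration('9 min 30 s')   ->  570
#   parse_duration('3.5 minutes')  ->  210.0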
1325
1326 def prepend_extension(filename, ext):
1327 name, real_ext = os.path.splitext(filename)
1328 return '{0}.{1}{2}'.format(name, ext, real_ext)
1329
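# Illustrative sketch (hypothetical filename, not from the original module):
#   prepend_extension('video.mp4', 'temp')  ->  'video.temp.mp4'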
1330
1331 def check_executable(exe, args=[]):
1332 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1333 args can be a list of arguments for a short output (like -version) """
1334 try:
1335 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1336 except OSError:
1337 return False
1338 return exe
1339
1340
1341 def get_exe_version(exe, args=['--version'],
1342 version_re=None, unrecognized='present'):
1343 """ Returns the version of the specified executable,
1344 or False if the executable is not present """
1345 try:
1346 out, _ = subprocess.Popen(
1347 [exe] + args,
1348 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1349 except OSError:
1350 return False
1351 if isinstance(out, bytes): # Python 2.x
1352 out = out.decode('ascii', 'ignore')
1353 return detect_exe_version(out, version_re, unrecognized)
1354
1355
1356 def detect_exe_version(output, version_re=None, unrecognized='present'):
1357 assert isinstance(output, compat_str)
1358 if version_re is None:
1359 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1360 m = re.search(version_re, output)
1361 if m:
1362 return m.group(1)
1363 else:
1364 return unrecognized
1365
1366
1367 class PagedList(object):
1368 def __len__(self):
1369 # This is only useful for tests
1370 return len(self.getslice())
1371
1372
1373 class OnDemandPagedList(PagedList):
1374 def __init__(self, pagefunc, pagesize):
1375 self._pagefunc = pagefunc
1376 self._pagesize = pagesize
1377
1378 def getslice(self, start=0, end=None):
1379 res = []
1380 for pagenum in itertools.count(start // self._pagesize):
1381 firstid = pagenum * self._pagesize
1382 nextfirstid = pagenum * self._pagesize + self._pagesize
1383 if start >= nextfirstid:
1384 continue
1385
1386 page_results = list(self._pagefunc(pagenum))
1387
1388 startv = (
1389 start % self._pagesize
1390 if firstid <= start < nextfirstid
1391 else 0)
1392
1393 endv = (
1394 ((end - 1) % self._pagesize) + 1
1395 if (end is not None and firstid <= end <= nextfirstid)
1396 else None)
1397
1398 if startv != 0 or endv is not None:
1399 page_results = page_results[startv:endv]
1400 res.extend(page_results)
1401
1402 # A little optimization: if the current page is not "full", i.e. does
1403 # not contain page_size videos, then we can assume that this page
1404 # is the last one - there are no more ids on further pages,
1405 # so there is no need to query again.
1406 if len(page_results) + startv < self._pagesize:
1407 break
1408
1409 # If we got the whole page, but the next page is not interesting,
1410 # break out early as well
1411 if end == nextfirstid:
1412 break
1413 return res
1414
1415
1416 class InAdvancePagedList(PagedList):
1417 def __init__(self, pagefunc, pagecount, pagesize):
1418 self._pagefunc = pagefunc
1419 self._pagecount = pagecount
1420 self._pagesize = pagesize
1421
1422 def getslice(self, start=0, end=None):
1423 res = []
1424 start_page = start // self._pagesize
1425 end_page = (
1426 self._pagecount if end is None else (end // self._pagesize + 1))
1427 skip_elems = start - start_page * self._pagesize
1428 only_more = None if end is None else end - start
1429 for pagenum in range(start_page, end_page):
1430 page = list(self._pagefunc(pagenum))
1431 if skip_elems:
1432 page = page[skip_elems:]
1433 skip_elems = None
1434 if only_more is not None:
1435 if len(page) < only_more:
1436 only_more -= len(page)
1437 else:
1438 page = page[:only_more]
1439 res.extend(page)
1440 break
1441 res.extend(page)
1442 return res
1443
1444
1445 def uppercase_escape(s):
1446 unicode_escape = codecs.getdecoder('unicode_escape')
1447 return re.sub(
1448 r'\\U[0-9a-fA-F]{8}',
1449 lambda m: unicode_escape(m.group(0))[0],
1450 s)
1451
1452
1453 def escape_rfc3986(s):
1454 """Escape non-ASCII characters as suggested by RFC 3986"""
1455 if sys.version_info < (3, 0) and isinstance(s, compat_str):
1456 s = s.encode('utf-8')
1457 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1458
1459
1460 def escape_url(url):
1461 """Escape URL as suggested by RFC 3986"""
1462 url_parsed = compat_urllib_parse_urlparse(url)
1463 return url_parsed._replace(
1464 path=escape_rfc3986(url_parsed.path),
1465 params=escape_rfc3986(url_parsed.params),
1466 query=escape_rfc3986(url_parsed.query),
1467 fragment=escape_rfc3986(url_parsed.fragment)
1468 ).geturl()
1469
1470 try:
1471 struct.pack('!I', 0)
1472 except TypeError:
1473 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1474 def struct_pack(spec, *args):
1475 if isinstance(spec, compat_str):
1476 spec = spec.encode('ascii')
1477 return struct.pack(spec, *args)
1478
1479 def struct_unpack(spec, *args):
1480 if isinstance(spec, compat_str):
1481 spec = spec.encode('ascii')
1482 return struct.unpack(spec, *args)
1483 else:
1484 struct_pack = struct.pack
1485 struct_unpack = struct.unpack
1486
1487
1488 def read_batch_urls(batch_fd):
1489 def fixup(url):
1490 if not isinstance(url, compat_str):
1491 url = url.decode('utf-8', 'replace')
1492 BOM_UTF8 = '\xef\xbb\xbf'
1493 if url.startswith(BOM_UTF8):
1494 url = url[len(BOM_UTF8):]
1495 url = url.strip()
1496 if url.startswith(('#', ';', ']')):
1497 return False
1498 return url
1499
1500 with contextlib.closing(batch_fd) as fd:
1501 return [url for url in map(fixup, fd) if url]
1502
1503
1504 def urlencode_postdata(*args, **kargs):
1505 return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
1506
1507
1508 try:
1509 etree_iter = xml.etree.ElementTree.Element.iter
1510 except AttributeError: # Python <=2.6
1511 etree_iter = lambda n: n.findall('.//*')
1512
1513
1514 def parse_xml(s):
1515 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1516 def doctype(self, name, pubid, system):
1517 pass # Ignore doctypes
1518
1519 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1520 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1521 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1522 # Fix up XML parser in Python 2.x
1523 if sys.version_info < (3, 0):
1524 for n in etree_iter(tree):
1525 if n.text is not None:
1526 if not isinstance(n.text, compat_str):
1527 n.text = n.text.decode('utf-8')
1528 return tree
1529
1530
1531 US_RATINGS = {
1532 'G': 0,
1533 'PG': 10,
1534 'PG-13': 13,
1535 'R': 16,
1536 'NC': 18,
1537 }
1538
1539
1540 def parse_age_limit(s):
1541 if s is None:
1542 return None
1543 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1544 return int(m.group('age')) if m else US_RATINGS.get(s, None)
1545
1546
1547 def strip_jsonp(code):
1548 return re.sub(
1549 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1550
1551
1552 def js_to_json(code):
1553 def fix_kv(m):
1554 v = m.group(0)
1555 if v in ('true', 'false', 'null'):
1556 return v
1557 if v.startswith('"'):
1558 return v
1559 if v.startswith("'"):
1560 v = v[1:-1]
1561 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1562 '\\\\': '\\\\',
1563 "\\'": "'",
1564 '"': '\\"',
1565 }[m.group(0)], v)
1566 return '"%s"' % v
1567
1568 res = re.sub(r'''(?x)
1569 "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
1570 '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
1571 [a-zA-Z_][.a-zA-Z_0-9]*
1572 ''', fix_kv, code)
1573 res = re.sub(r',(\s*\])', lambda m: m.group(1), res)
1574 return res
1575
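# Illustrative sketch (hypothetical snippets, not from the original module):
# single quotes, bare keys and trailing commas in arrays should be converted
# into valid JSON:
#   js_to_json("{'key': true}")  ->  '{"key": true}'
#   js_to_json('{abc: "def"}')   ->  '{"abc": "def"}'
#   js_to_json('[1, 2, 3,]')     ->  '[1, 2, 3]'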
1576
1577 def qualities(quality_ids):
1578 """ Get a numeric quality value out of a list of possible values """
1579 def q(qid):
1580 try:
1581 return quality_ids.index(qid)
1582 except ValueError:
1583 return -1
1584 return q
1585
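# Illustrative sketch (hypothetical format ids, not from the original module):
# a higher index means better quality, unknown ids sort below everything:
#   q = qualities(['flv', 'sd', 'hd'])
#   q('hd')  ->  2
#   q('4k')  ->  -1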
1586
1587 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1588
1589
1590 def limit_length(s, length):
1591 """ Add ellipses to overly long strings """
1592 if s is None:
1593 return None
1594 ELLIPSES = '...'
1595 if len(s) > length:
1596 return s[:length - len(ELLIPSES)] + ELLIPSES
1597 return s
1598
1599
1600 def version_tuple(v):
1601 return tuple(int(e) for e in re.split(r'[-.]', v))
1602
1603
1604 def is_outdated_version(version, limit, assume_new=True):
1605 if not version:
1606 return not assume_new
1607 try:
1608 return version_tuple(version) < version_tuple(limit)
1609 except ValueError:
1610 return not assume_new
1611
1612
1613 def ytdl_is_updateable():
1614 """ Returns if youtube-dl can be updated with -U """
1615 from zipimport import zipimporter
1616
1617 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
1618
1619
1620 def args_to_str(args):
1621 # Get a short string representation for a subprocess command
1622 return ' '.join(shlex_quote(a) for a in args)
1623
1624
1625 def mimetype2ext(mt):
1626 _, _, res = mt.rpartition('/')
1627
1628 return {
1629 'x-ms-wmv': 'wmv',
1630 'x-mp4-fragmented': 'mp4',
1631 }.get(res, res)
1632
1633
1634 def urlhandle_detect_ext(url_handle):
1635 try:
1636 url_handle.headers
1637 getheader = lambda h: url_handle.headers[h]
1638 except AttributeError: # Python < 3
1639 getheader = url_handle.info().getheader
1640
1641 cd = getheader('Content-Disposition')
1642 if cd:
1643 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1644 if m:
1645 e = determine_ext(m.group('filename'), default_ext=None)
1646 if e:
1647 return e
1648
1649 return mimetype2ext(getheader('Content-Type'))
1650
1651
1652 def age_restricted(content_limit, age_limit):
1653 """ Returns True iff the content should be blocked """
1654
1655 if age_limit is None: # No limit set
1656 return False
1657 if content_limit is None:
1658 return False # Content available for everyone
1659 return age_limit < content_limit
1660
1661
1662 def is_html(first_bytes):
1663 """ Detect whether a file contains HTML by examining its first bytes. """
1664
1665 BOMS = [
1666 (b'\xef\xbb\xbf', 'utf-8'),
1667 (b'\x00\x00\xfe\xff', 'utf-32-be'),
1668 (b'\xff\xfe\x00\x00', 'utf-32-le'),
1669 (b'\xff\xfe', 'utf-16-le'),
1670 (b'\xfe\xff', 'utf-16-be'),
1671 ]
1672 for bom, enc in BOMS:
1673 if first_bytes.startswith(bom):
1674 s = first_bytes[len(bom):].decode(enc, 'replace')
1675 break
1676 else:
1677 s = first_bytes.decode('utf-8', 'replace')
1678
1679 return re.match(r'^\s*<', s)
1680
1681
1682 def determine_protocol(info_dict):
1683 protocol = info_dict.get('protocol')
1684 if protocol is not None:
1685 return protocol
1686
1687 url = info_dict['url']
1688 if url.startswith('rtmp'):
1689 return 'rtmp'
1690 elif url.startswith('mms'):
1691 return 'mms'
1692 elif url.startswith('rtsp'):
1693 return 'rtsp'
1694
1695 ext = determine_ext(url)
1696 if ext == 'm3u8':
1697 return 'm3u8'
1698 elif ext == 'f4m':
1699 return 'f4m'
1700
1701 return compat_urllib_parse_urlparse(url).scheme
1702
1703
1704 def render_table(header_row, data):
1705 """ Render a list of rows, each as a list of values """
1706 table = [header_row] + data
1707 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
1708 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
1709 return '\n'.join(format_str % tuple(row) for row in table)
1710
1711
1712 def _match_one(filter_part, dct):
1713 COMPARISON_OPERATORS = {
1714 '<': operator.lt,
1715 '<=': operator.le,
1716 '>': operator.gt,
1717 '>=': operator.ge,
1718 '=': operator.eq,
1719 '!=': operator.ne,
1720 }
1721 operator_rex = re.compile(r'''(?x)\s*
1722 (?P<key>[a-z_]+)
1723 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1724 (?:
1725 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1726 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1727 )
1728 \s*$
1729 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1730 m = operator_rex.search(filter_part)
1731 if m:
1732 op = COMPARISON_OPERATORS[m.group('op')]
1733 if m.group('strval') is not None:
1734 if m.group('op') not in ('=', '!='):
1735 raise ValueError(
1736 'Operator %s does not support string values!' % m.group('op'))
1737 comparison_value = m.group('strval')
1738 else:
1739 try:
1740 comparison_value = int(m.group('intval'))
1741 except ValueError:
1742 comparison_value = parse_filesize(m.group('intval'))
1743 if comparison_value is None:
1744 comparison_value = parse_filesize(m.group('intval') + 'B')
1745 if comparison_value is None:
1746 raise ValueError(
1747 'Invalid integer value %r in filter part %r' % (
1748 m.group('intval'), filter_part))
1749 actual_value = dct.get(m.group('key'))
1750 if actual_value is None:
1751 return m.group('none_inclusive')
1752 return op(actual_value, comparison_value)
1753
1754 UNARY_OPERATORS = {
1755 '': lambda v: v is not None,
1756 '!': lambda v: v is None,
1757 }
1758 operator_rex = re.compile(r'''(?x)\s*
1759 (?P<op>%s)\s*(?P<key>[a-z_]+)
1760 \s*$
1761 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1762 m = operator_rex.search(filter_part)
1763 if m:
1764 op = UNARY_OPERATORS[m.group('op')]
1765 actual_value = dct.get(m.group('key'))
1766 return op(actual_value)
1767
1768 raise ValueError('Invalid filter part %r' % filter_part)
1769
1770
1771 def match_str(filter_str, dct):
1772 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
1773
1774 return all(
1775 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
1776
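# Illustrative sketch (hypothetical filter and info dict, not from the original
# module): '&' combines conditions, all of which must pass:
#   match_str('duration > 30 & like_count > 100',
#             {'duration': 60, 'like_count': 500})  ->  True
#   match_str('duration < 30', {'duration': 60})    ->  False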
1777
1778 def match_filter_func(filter_str):
1779 def _match_func(info_dict):
1780 if match_str(filter_str, info_dict):
1781 return None
1782 else:
1783 video_title = info_dict.get('title', info_dict.get('id', 'video'))
1784 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
1785 return _match_func
1786
1787
1788 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
1789 def __init__(self, proxies=None):
1790 # Set default handlers
1791 for type in ('http', 'https'):
1792 setattr(self, '%s_open' % type,
1793 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
1794 meth(r, proxy, type))
1795 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
1796
1797 def proxy_open(self, req, proxy, type):
1798 req_proxy = req.headers.get('Ytdl-request-proxy')
1799 if req_proxy is not None:
1800 proxy = req_proxy
1801 del req.headers['Ytdl-request-proxy']
1802
1803 if proxy == '__noproxy__':
1804 return None # No Proxy
1805 return compat_urllib_request.ProxyHandler.proxy_open(
1806 self, req, proxy, type)