]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
Remove the --max-quality option
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import calendar
7 import codecs
8 import contextlib
9 import ctypes
10 import datetime
11 import email.utils
12 import errno
13 import functools
14 import gzip
15 import itertools
16 import io
17 import json
18 import locale
19 import math
20 import operator
21 import os
22 import pipes
23 import platform
24 import re
25 import ssl
26 import socket
27 import struct
28 import subprocess
29 import sys
30 import tempfile
31 import traceback
32 import xml.etree.ElementTree
33 import zlib
34
35 from .compat import (
36 compat_basestring,
37 compat_chr,
38 compat_html_entities,
39 compat_http_client,
40 compat_parse_qs,
41 compat_socket_create_connection,
42 compat_str,
43 compat_urllib_error,
44 compat_urllib_parse,
45 compat_urllib_parse_urlparse,
46 compat_urllib_request,
47 compat_urlparse,
48 shlex_quote,
49 )
50
51
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers added to every request (see YoutubeDLHandler.http_request)
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}

# Full English month names, January first (1-based month number - 1 indexes this list)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
67
68
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify the reported encoding actually works before trusting it
        'TEST'.encode(pref)
    except Exception:
        return 'UTF-8'
    return pref
82
83
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible.

    The data is written to a temporary file in the same directory and then
    renamed over fn, so readers never observe a half-written file.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object.
        # Fix: the lambdas previously ignored their argument and closed over
        # fn instead, which only worked by accident (they were only ever
        # called with fn).
        path_basename = lambda f: os.path.basename(f).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        # Keep the temp file next to the target so os.rename stays atomic
        # (same filesystem)
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**args)

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best effort: remove the leftover temp file, then re-raise
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
136
137
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # Only simple attribute names/values are allowed, so they can be
        # embedded directly into the xpath expression below
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + "[@%s='%s']" % (key, val)
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] (2.6 fallback: linear scan) """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        # 2.6 ElementTree has no [@attr='val'] predicate support
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
159
160
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' xpath components into '{uri}tag' using ns_map."""
    expanded = []
    for component in path.split('/'):
        parts = component.split(':')
        if len(parts) == 1:
            expanded.append(parts[0])
        else:
            prefix, tag = parts
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
171
172
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first xpath match, or None.

    With fatal=True a missing element (or one without text) raises
    ExtractorError instead of returning None.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None and found.text is not None:
        return found.text
    if not fatal:
        return None
    raise ExtractorError(
        'Could not find XML element %s' % (xpath if name is None else name))
185
186
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper: an id is just an attribute match
    return get_element_by_attribute("id", id, html)
190
191
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)
    if m is None:
        return None

    content = m.group('content')
    # Drop a symmetric pair of wrapping quotes, if present
    if content[:1] in ('"', "'"):
        content = content[1:-1]
    return unescapeHTML(content)
213
214
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Literal newlines are layout noise; <br> and </p><p> are real breaks
    html = html.replace('\n', ' ')
    for pattern, repl in (
            (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
            (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
            # Strip html tags
            ('<.*?>', ''),
    ):
        html = re.sub(pattern, repl, html)
    # Replace html entities
    return unescapeHTML(html).strip()
230
231
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                # Put stdout into binary mode so raw media bytes are not mangled
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            # On Python 3 write to the underlying binary buffer
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission problems cannot be fixed by renaming
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
262
263
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Unparseable input
        return None
    return email.utils.mktime_tz(parsed)
271
272
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return char

    # Keep timestamps like 0:12:34 readable by mapping ':' to '_' up front
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(c) for c in s)
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
309
310
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # No-op elsewhere: other platforms are permissive about path characters
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        # splitdrive does not recognise UNC shares before 2.7
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace characters Windows forbids inside a path component (and a
    # trailing '.', also invalid) with '#'; keep '.'/'..' navigation parts
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
327
328
def sanitize_url_path_consecutive_slashes(url):
    """Collapses consecutive slashes in URLs' path"""
    parts = list(compat_urlparse.urlparse(url))
    # Index 2 is the path component of the 6-tuple
    parts[2] = re.sub(r'/{2,}', '/', parts[2])
    return compat_urlparse.urlunparse(parts)
334
335
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # List membership (not a set) so unhashable elements keep working
    result = []
    for item in iterable:
        if item in result:
            continue
        result.append(item)
    return result
343
344
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    known = compat_html_entities.name2codepoint
    if entity in known:
        return compat_chr(known[entity])

    # Numeric entity: decimal (#160) or hexadecimal (#xA0)
    numeric = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if numeric is not None:
        numstr = numeric.group(1)
        if numstr.startswith('x'):
            return compat_chr(int('0' + numstr, 16))
        return compat_chr(int(numstr, 10))

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
363
364
def unescapeHTML(s):
    """Replace every &entity; occurrence in s with its character."""
    if s is None:
        return None
    assert type(s) == compat_str

    def _replace(m):
        return _htmlentity_transform(m.group(1))

    return re.sub(r'&([^;]+);', _replace, s)
372
373
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess On py2 Windows, encode with the locale encoding
                          instead of returning unicode, for command lines
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        if not for_subprocess:
            return s
        else:
            # For subprocess calls, encode with locale encoding
            # Refer to http://stackoverflow.com/a/9951851/35070
            encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
400
401
def encodeArgument(s):
    """Encode a command-line argument for subprocess use (see encodeFilename)."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
409
410
def decodeOption(optval):
    """Decode a command-line option value to text; None passes through."""
    if optval is None:
        return None
    decoded = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval

    assert isinstance(decoded, compat_str)
    return decoded
419
420
def formatSeconds(secs):
    """Format a duration in seconds as [H:]M:SS-style text.

    Boundary fix: use >= so that exactly one hour renders as '1:00:00'
    (previously '60:00') and exactly one minute as '1:00' (previously '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
428
429
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler suited to this Python's ssl support.

    Certificate verification is disabled when params['nocheckcertificate']
    is set.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # No SSLContext support in HTTPSHandler on these versions
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
453
454
def bug_reports_message():
    """Build the standard 'please report this issue' message suffix."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return (
        '; please report this issue on https://yt-dl.org/bug .'
        ' Make sure you are using the latest version; %s.'
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    ) % update_cmd
464
465
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network-type failures are always "expected" (not a youtube-dl bug)
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True

        parts = [msg if video_id is None else video_id + ': ' + msg]
        if cause:
            parts.append(' (caused by %r)' % cause)
        if not expected:
            parts.append(bug_reports_message())
        super(ExtractorError, self).__init__(''.join(parts))

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
493
494
class UnsupportedError(ExtractorError):
    """Raised when no extractor can handle the given URL; always 'expected'."""
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url
500
501
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match the downloaded page data"""
    pass
505
506
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Kept so callers can inspect/re-report the underlying failure
        self.exc_info = exc_info
519
520
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
528
529
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Also pass msg to Exception so str(exc) and tracebacks show the
        # message (previously str() of this exception was empty).
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
539
540
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
544
545
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
553
554
class ContentTooShortError(Exception):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file is smaller
    than the size the server announced, indicating the connection was
    probably interrupted.
    """
    # Byte counts: what actually arrived vs. what the server promised
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.expected = expected
        self.downloaded = downloaded
569
570
571 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
572 hc = http_class(*args, **kwargs)
573 source_address = ydl_handler._params.get('source_address')
574 if source_address is not None:
575 sa = (source_address, 0)
576 if hasattr(hc, 'source_address'): # Python 2.7+
577 hc.source_address = sa
578 else: # Python 2.6
579 def _hc_connect(self, *args, **kwargs):
580 sock = compat_socket_create_connection(
581 (self.host, self.port), self.timeout, sa)
582 if is_https:
583 self.sock = ssl.wrap_socket(
584 sock, self.key_file, self.cert_file,
585 ssl_version=ssl.PROTOCOL_TLSv1)
586 else:
587 self.sock = sock
588 hc.connect = functools.partial(_hc_connect, hc)
589
590 return hc
591
592
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        # Options dict, consulted by _create_http_connection (e.g. source_address)
        self._params = params

    def http_open(self, req):
        # Route connection creation through _create_http_connection so
        # handler options are applied
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        # Accept both raw deflate streams and zlib-wrapped ones
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Older addinfourl has no 'code' constructor argument; set it manually
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            # Marker header: strip compression support for this request only
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response
684
685
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS counterpart of YoutubeDLHandler: routes connection creation
    through _create_http_connection, forwarding SSL context options."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        # Allow callers to substitute the connection class (e.g. for tests)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        # Forward context/check_hostname only where the base handler set them
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
            req, **kwargs)
701
702
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    if timezone is None:
        # Detect a trailing 'Z' or numeric UTC offset (and optional fraction)
        tz_m = re.search(
            r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if tz_m is None:
            timezone = datetime.timedelta()
        else:
            date_str = date_str[:-len(tz_m.group(0))]
            if tz_m.group('sign'):
                direction = 1 if tz_m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=direction * int(tz_m.group('hours')),
                    minutes=direction * int(tz_m.group('minutes')))
            else:
                timezone = datetime.timedelta()
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    parsed = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(parsed.timetuple())
727
728
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    # Ambiguous all-numeric dates are interpreted per day_first
    format_expressions.extend([
        '%d-%m-%Y',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%d/%m/%Y %H:%M:%S',
    ] if day_first else [
        '%m-%d-%Y',
        '%m.%d.%Y',
        '%m/%d/%Y',
        '%m/%d/%y',
        '%m/%d/%Y %H:%M:%S',
    ])

    # Scan every expression; a later match deliberately overrides an
    # earlier one (preserves the original last-match-wins behaviour)
    upload_date = None
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
792
793
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL, falling back to default_ext."""
    if url is None:
        return default_ext
    # Strip the query string, then take whatever follows the last dot
    candidate = url.partition('?')[0].rpartition('.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
802
803
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name: <base>.<lang>.<format>."""
    return '.'.join((filename.rsplit('.', 1)[0], sub_lang, sub_format))
806
807
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    m = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if m is None:
        # Plain YYYYMMDD
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()
    amount = int(m.group('time'))
    if m.group('sign') == '-':
        amount = -amount
    unit = m.group('unit')
    # A bad aproximation? months/years become 30/365 days
    if unit == 'month':
        unit, amount = 'day', amount * 30
    elif unit == 'year':
        unit, amount = 'day', amount * 365
    return today + datetime.timedelta(**{unit + 's': amount})
835
836
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    m = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    # Anything that is not exactly eight digits passes through untouched
    return '-'.join(m.groups()) if m is not None else date_str
845
846
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the widest possible interval
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if isinstance(date, datetime.date):
            return self.start <= date <= self.end
        return self.start <= date_from_str(date) <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
876
877
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Python 2 may hand back bytes; normalize to text
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
886
887
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Map fileno -> GetStdHandle id (1/stdout -> -11, 2/stderr -> -12)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # WriteConsoleW only works on a real console, not redirected output
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write BMP characters in chunks; a non-BMP character is written
        # alone as its two UTF-16 code units
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
961
962
def write_string(s, out=None, encoding=None):
    """Write the text s to out (default: sys.stderr), handling console and
    byte-stream quirks across platforms and Python versions."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        # Prefer the Windows console API so non-ANSI characters display correctly
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        # Text stream over a binary buffer: encode ourselves so we control
        # the encoding and the 'ignore' error policy
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
983
984
def bytes_to_intlist(bs):
    """Turn a byte string into a list of integer byte values."""
    if not bs:
        return []
    # On Python 3 indexing bytes already yields ints; on Python 2 it yields
    # one-character strings that need ord()
    return list(bs) if isinstance(bs[0], int) else [ord(c) for c in bs]
992
993
def intlist_to_bytes(xs):
    """Pack a list of integer byte values (0-255) back into a byte string."""
    if not xs:
        return b''
    # struct_pack is presumably provided elsewhere in this module / the
    # compat layer (not visible in this chunk) -- TODO confirm
    return struct_pack('%dB' % len(xs), *xs)
998
999
# Cross-platform file locking: defines _lock_file/_unlock_file using
# LockFileEx/UnlockFileEx on Windows and fcntl.flock elsewhere
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED struct required by LockFileEx/UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Byte range covering (practically) the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED pointer alive on the file object for the unlock
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)
1064
class locked_file(object):
    """File wrapper that holds an OS-level advisory lock while used as a
    context manager: shared for reading, exclusive for writing/appending.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Readers can share the lock; any writing mode needs exclusivity.
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1094
1095
def get_filesystem_encoding():
    """Name of the filesystem encoding, falling back to UTF-8 when the
    interpreter cannot determine one."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1099
1100
def shell_quote(args):
    """Quote a list of arguments so they read as one shell command line."""
    encoding = get_filesystem_encoding()

    def _quote_one(arg):
        # Filenames may arrive as bytes (via encodeFilename); decode first.
        if isinstance(arg, bytes):
            arg = arg.decode(encoding)
        return pipes.quote(arg)

    return ' '.join(_quote_one(arg) for arg in args)
1110
1111
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The payload rides in the fragment, which servers never see.
    payload = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, payload)
1118
1119
def unsmuggle_url(smug_url, default=None):
    """Inverse of smuggle_url(): return (url, data), or (url, default)
    when no smuggled payload is present."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, encoded = smug_url.rpartition('#')
    data = json.loads(compat_parse_qs(encoded)['__youtubedl_smuggle'][0])
    return url, data
1127
1128
def format_bytes(bytes):
    """Format a (non-negative) byte count as a human-readable string,
    e.g. 1536 -> '1.50KiB'.

    Accepts ints, floats or numeric strings; returns 'N/A' for None.
    """
    if bytes is None:
        return 'N/A'
    # isinstance (instead of the old `type(bytes) is str`) also accepts
    # str subclasses.
    if isinstance(bytes, str):
        bytes = float(bytes)
    SUFFIXES = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        # Clamp so absurdly large values cannot index past the suffix
        # table (the old code raised IndexError at >= 1024 ** 9).
        exponent = min(int(math.log(bytes, 1024.0)), len(SUFFIXES) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, SUFFIXES[exponent])
1141
1142
def parse_filesize(s):
    """Parse a human-readable file size like '5.6Mb' or '1,5 GiB' into a
    number of bytes, or None if the string is unparsable."""
    if s is None:
        return None

    # Historical quirk kept for compatibility: 'XiB' and 'xB' (lower-case
    # first letter) are binary (1024-based), while 'XB' and 'Xb' are
    # decimal (1000-based).  The lower-case forms are of course incorrect
    # and inofficial, but we support those too.
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for exp, letter in enumerate('KMGTPEZY', 1):
        _UNIT_TABLE[letter + 'iB'] = 1024 ** exp
        _UNIT_TABLE[letter + 'B'] = 1000 ** exp
        _UNIT_TABLE[letter.lower() + 'B'] = 1024 ** exp
        _UNIT_TABLE[letter + 'b'] = 1000 ** exp

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if not m:
        return None

    # A comma decimal separator (e.g. '1,5') is treated like a dot.
    number = float(m.group('num').replace(',', '.'))
    return int(number * _UNIT_TABLE[m.group('unit')])
1195
1196
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1204
1205
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    if abbrev in abbreviations:
        return abbreviations.index(abbrev) + 1
    return None
1214
1215
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # A '&' that already starts a known entity or a numeric character
    # reference is left untouched.
    bare_amp = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_amp.sub('&amp;', xml_str)
1222
1223
def setproctitle(title):
    """Best-effort: set the process name via glibc prctl(); a no-op on
    systems without libc.so.6 or without prctl."""
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        # Not a glibc system - silently give up.
        return
    encoded = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(encoded))
    buf.value = encoded
    try:
        # 15 is PR_SET_NAME on Linux.
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1237
1238
def remove_start(s, start):
    """Return s without the prefix 'start' (unchanged when absent)."""
    return s[len(start):] if s.startswith(start) else s
1243
1244
def remove_end(s, end):
    """Return s without the suffix 'end' (unchanged when absent).

    Guards against an empty 'end': every string ends with '', and the old
    unguarded slice s[:-0] would then wrongly return the empty string.
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1249
1250
def url_basename(url):
    """Last path component of a URL, e.g. '.../a/b.mp4?x=1' -> 'b.mp4'."""
    path = compat_urlparse.urlparse(url).path
    return path.rstrip('/').rpartition('/')[2]
1254
1255
class HEADRequest(compat_urllib_request.Request):
    # A Request that issues an HTTP HEAD instead of the default GET.
    def get_method(self):
        return "HEAD"
1259
1260
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int scaled by invscale/scale; 'default' for None/''.

    When get_attr is given, v is first replaced by getattr(v, get_attr).
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    # The empty string counts as "no value", just like None.
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1268
1269
def str_or_none(v, default=None):
    """compat_str(v), or 'default' when v is None."""
    if v is None:
        return default
    return compat_str(v)
1272
1273
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and stray '+' / '.' before converting.
    cleaned = re.sub(r'[,\.\+]', '', int_str)
    return int(cleaned)
1280
1281
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float scaled by invscale/scale; 'default' when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1284
1285
def parse_duration(s):
    """Parse a duration string into seconds (int, or float when there are
    milliseconds or fractional minutes/hours); None if unparsable.

    Accepted forms include '123', '1:23:45', '3 min', '2.5 hours',
    '5h30m10s', '2d 1h' and ISO-8601-style 'PT1H30M'.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None
    res = 0
    # The standalone 'X mins' / 'X hours' forms may carry a decimal point,
    # so they are converted (possibly to float) and returned immediately.
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    # Otherwise sum up whichever of days/hours/minutes/seconds matched.
    if m.group('secs'):
        res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
    if m.group('mins'):
        res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
    if m.group('days'):
        res += int(m.group('days')) * 24 * 60 * 60
    if m.group('ms'):
        # Milliseconds turn the result into a float.
        res += float(m.group('ms'))
    return res
1330
1331
def prepend_extension(filename, ext):
    """Insert ext before the real extension: ('a.mp4', 'tmp') -> 'a.tmp.mp4'."""
    name, real_ext = os.path.splitext(filename)
    return '%s.%s%s' % (name, ext, real_ext)
1335
1336
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    cmd = [exe] + args
    try:
        proc = subprocess.Popen(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        # Spawning failed, so the executable is not available.
        return False
    return exe
1345
1346
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        proc = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out = proc.communicate()[0]
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1360
1361
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's --version output,
    returning 'unrecognized' when no version can be found."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
1371
1372
class PagedList(object):
    # Abstract base class: subclasses must provide getslice(start, end).
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1377
1378
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages lazily via pagefunc(pagenum), for
    sources where the total number of pages is not known in advance."""
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the items [start:end), querying only the pages needed."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            # First/next index covered by this page.
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # startv/endv are the slice bounds *within* this page.
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1420
1421
class InAdvancePagedList(PagedList):
    """Paged list for sources where the total page count is known upfront."""
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the items [start:end), fetching only the pages in range."""
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Number of leading items to drop from the first fetched page.
        skip_elems = start - start_page * self._pagesize
        # Remaining number of items wanted (None = everything).
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page already satisfies the request; trim and stop.
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1449
1450
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' (32-bit) escape sequences found in s."""
    decode = codecs.getdecoder('unicode_escape')

    def _replace(m):
        return decode(m.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _replace, s)
1457
1458
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # quote() on Python 2 needs bytes input for non-ASCII text.
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # Characters in the safe-list (RFC 3986 reserved and sub-delims plus
    # '%') are kept verbatim; everything else gets percent-encoded.
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1464
1465
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # Escape each textual component; scheme and netloc stay untouched.
    fields = ('path', 'params', 'query', 'fragment')
    escaped = dict(
        (field, escape_rfc3986(getattr(parts, field))) for field in fields)
    return parts._replace(**escaped).geturl()
1475
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    # for the format spec, so wrap pack/unpack to encode str specs first.
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Modern interpreters accept str format specs directly.
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1492
1493
def read_batch_urls(batch_fd):
    """Return the list of URLs in the batch file object batch_fd.

    Each line is decoded, stripped of a UTF-8 BOM and surrounding
    whitespace; empty lines and comment lines (starting with '#', ';'
    or ']') are skipped.  The file object is closed afterwards.
    """
    BOM_UTF8 = '\xef\xbb\xbf'

    def _sanitize(line):
        if not isinstance(line, compat_str):
            line = line.decode('utf-8', 'replace')
        if line.startswith(BOM_UTF8):
            line = line[len(BOM_UTF8):]
        line = line.strip()
        if line.startswith(('#', ';', ']')):
            return False
        return line

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(_sanitize, fd) if url]
1508
1509
def urlencode_postdata(*args, **kargs):
    """urlencode(...) encoded to ASCII bytes, ready to be sent as POST data."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1512
1513
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    # Element.iter() appeared in 2.7; fall back to findall('.//*'), which
    # yields every descendant (but, unlike iter(), not the element itself).
    def etree_iter(n):
        return n.findall('.//*')
1518
1519
def parse_xml(s):
    """Parse the XML document in the string s, ignoring any doctype.

    On Python 2 the tree's text nodes are normalized to unicode.
    """
    class _DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=_DoctypeIgnoringBuilder())
    # Passing a custom parser to XML() requires Python >= 2.7.
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None and not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')
    return tree
1535
1536
# US (MPAA-style) content ratings mapped to minimum viewer ages,
# as consumed by parse_age_limit() below.
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1544
1545
def parse_age_limit(s):
    """Parse an age limit like '18', '16+' or a US rating name ('PG-13')
    into an int, or None when s is None or unrecognized."""
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
1551
1552
def strip_jsonp(code):
    """Strip a JSONP wrapper, e.g. 'callback({...});' -> '{...}'.
    Unwrapped input is returned unchanged."""
    JSONP_RE = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(JSONP_RE, r'\1', code)
1556
1557
def js_to_json(code):
    """Convert a JavaScript object literal into (mostly) valid JSON.

    Handles single-quoted strings, bare identifier keys and trailing commas.
    """
    def _fix_token(m):
        token = m.group(0)
        # Keywords and double-quoted strings are already valid JSON.
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            # Re-quote single-quoted strings, translating the escapes.
            token = token[1:-1]
            token = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], token)
        # Bare identifiers (object keys) also end up double-quoted here.
        return '"%s"' % token

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', _fix_token, code)
    # Drop trailing commas before ']' or '}'.
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
    return res
1581
1582
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality rank; unknown ids rank -1.
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1591
1592
# Default output filename template: "<title>-<id>.<ext>".
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1594
1595
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    # The result, ellipses included, fits within 'length' characters.
    return s[:length - len(ELLIPSES)] + ELLIPSES
1604
1605
def version_tuple(v):
    """'2015.01.01-4' -> (2015, 1, 1, 4); raises ValueError on non-numeric
    components."""
    return tuple(map(int, re.split(r'[-.]', v)))
1608
1609
def is_outdated_version(version, limit, assume_new=True):
    """True when 'version' is older than 'limit'; for missing or
    unparsable versions the answer follows assume_new."""
    unknown_is_outdated = not assume_new
    if not version:
        return unknown_is_outdated
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return unknown_is_outdated
1617
1618
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    # Self-update works when running from a zip bundle or a frozen binary.
    if hasattr(sys, 'frozen'):
        return True
    return isinstance(globals().get('__loader__'), zipimporter)
1624
1625
def args_to_str(args):
    """Short, shell-quoted string representation of a subprocess command."""
    return ' '.join(map(shlex_quote, args))
1629
1630
def mimetype2ext(mt):
    """Map a MIME type to a file extension, e.g. 'video/mp4' -> 'mp4'."""
    # Subtypes whose name differs from the customary extension.
    SPECIAL_CASES = {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
    }
    subtype = mt.rpartition('/')[2]
    return SPECIAL_CASES.get(subtype, subtype)
1638
1639
def urlhandle_detect_ext(url_handle):
    """Guess a file extension for a urllib response object: first from the
    Content-Disposition filename, then from the Content-Type header."""
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
1656
1657
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # With no age limit configured, or content available for everyone,
    # nothing is blocked.
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
1666
1667
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Order matters: the UTF-32 BOMs must be tried before their UTF-16
    # prefixes.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    decoded = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if decoded is None:
        # No BOM: assume UTF-8.
        decoded = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', decoded)
1686
1687
def determine_protocol(info_dict):
    """Infer the download protocol of an info dict from its explicit
    'protocol' field, URL scheme prefix or file extension."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    # Streaming schemes are recognizable from the URL prefix...
    for scheme in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(scheme):
            return scheme

    # ...while HLS/HDS manifests are recognized by their extension.
    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
1708
1709
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Pad every column but the last to its widest cell, plus one space.
    widths = [max(len(compat_str(cell)) for cell in column)
              for column in zip(*table)]
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in table)
1716
1717
def _match_one(filter_part, dct):
    """Evaluate a single filter expression against dct.

    Supported forms: comparisons like 'duration > 600' or
    'uploader = someone' (with an optional '?' making a missing key
    match), and unary presence tests like 'is_live' / '!is_live'.
    Raises ValueError for syntactically invalid filter parts.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            # String comparisons only make sense for (in)equality.
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # Not a plain integer: try '50k' / '1.2MiB'-style sizes,
                # first as written, then with an assumed 'B' suffix.
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # A trailing '?' on the operator makes a missing key match.
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
1775
1776
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # Filter parts are joined by '&'; all of them must match.
    parts = filter_str.split('&')
    return all(_match_one(part, dct) for part in parts)
1782
1783
def match_filter_func(filter_str):
    """Build a match-filter callback: returns None for videos that pass
    filter_str, or a human-readable skip message otherwise."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
1792
1793
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that honours a per-request 'Ytdl-request-proxy'
    header, falling back to the globally configured proxies otherwise."""
    def __init__(self, proxies=None):
        # Set default handlers
        # proxy/type/meth are bound as lambda defaults on purpose, to
        # avoid Python's late-binding closure over the loop variable.
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                    meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A request may carry its own proxy in the Ytdl-request-proxy
        # header, which overrides the handler default and is then removed
        # so it is never sent over the wire.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)