1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import calendar
7 import codecs
8 import contextlib
9 import ctypes
10 import datetime
11 import email.utils
12 import errno
13 import functools
14 import gzip
15 import itertools
16 import io
17 import json
18 import locale
19 import math
20 import operator
21 import os
22 import pipes
23 import platform
24 import re
25 import ssl
26 import socket
27 import struct
28 import subprocess
29 import sys
30 import tempfile
31 import traceback
32 import xml.etree.ElementTree
33 import zlib
34
35 from .compat import (
36 compat_basestring,
37 compat_chr,
38 compat_html_entities,
39 compat_http_client,
40 compat_kwargs,
41 compat_parse_qs,
42 compat_socket_create_connection,
43 compat_str,
44 compat_urllib_error,
45 compat_urllib_parse,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
48 compat_urlparse,
49 shlex_quote,
50 )
51
52
53 # This is not clearly defined otherwise
54 compiled_regex_type = type(re.compile(''))
55
56 std_headers = {
57 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
58 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
59 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
60 'Accept-Encoding': 'gzip, deflate',
61 'Accept-Language': 'en-us,en;q=0.5',
62 }
63
64
65 ENGLISH_MONTH_NAMES = [
66 'January', 'February', 'March', 'April', 'May', 'June',
67 'July', 'August', 'September', 'October', 'November', 'December']
68
69
70 def preferredencoding():
71 """Get preferred encoding.
72
73 Returns the best encoding scheme for the system, based on
74 locale.getpreferredencoding() and some further tweaks.
75 """
76 try:
77 pref = locale.getpreferredencoding()
78 'TEST'.encode(pref)
79 except Exception:
80 pref = 'UTF-8'
81
82 return pref
83
84
85 def write_json_file(obj, fn):
86 """ Encode obj as JSON and write it to fn, atomically if possible """
87
88 fn = encodeFilename(fn)
89 if sys.version_info < (3, 0) and sys.platform != 'win32':
90 encoding = get_filesystem_encoding()
91 # os.path.basename returns a bytes object, but NamedTemporaryFile
92 # will fail if the filename contains non-ASCII characters unless we
93 # pass it a unicode object
94 path_basename = lambda f: os.path.basename(f).decode(encoding)
95 # the same for os.path.dirname
96 path_dirname = lambda f: os.path.dirname(f).decode(encoding)
97 else:
98 path_basename = os.path.basename
99 path_dirname = os.path.dirname
100
101 args = {
102 'suffix': '.tmp',
103 'prefix': path_basename(fn) + '.',
104 'dir': path_dirname(fn),
105 'delete': False,
106 }
107
108 # In Python 2.x, json.dump expects a bytestream.
109 # In Python 3.x, it writes to a character stream
110 if sys.version_info < (3, 0):
111 args['mode'] = 'wb'
112 else:
113 args.update({
114 'mode': 'w',
115 'encoding': 'utf-8',
116 })
117
118 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
119
120 try:
121 with tf:
122 json.dump(obj, tf)
123 if sys.platform == 'win32':
124 # Need to remove existing file on Windows, else os.rename raises
125 # WindowsError or FileExistsError.
126 try:
127 os.unlink(fn)
128 except OSError:
129 pass
130 os.rename(tf.name, fn)
131 except Exception:
132 try:
133 os.remove(tf.name)
134 except OSError:
135 pass
136 raise
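# Illustrative usage (example added for clarity; the file name is made up):
#     write_json_file({'id': 'abc123', 'title': 'Test'}, 'video.info.json')
# serializes the dict to a temporary file in the same directory and then
# renames it into place, so the target is replaced atomically where possible.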
137
138
139 if sys.version_info >= (2, 7):
140 def find_xpath_attr(node, xpath, key, val):
141 """ Find the xpath xpath[@key=val] """
142 assert re.match(r'^[a-zA-Z-]+$', key)
143 assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
144 expr = xpath + "[@%s='%s']" % (key, val)
145 return node.find(expr)
146 else:
147 def find_xpath_attr(node, xpath, key, val):
148 # Here comes the crazy part: in Python 2.6, if the xpath is a unicode
149 # string, .//node does not match nodes that are direct children of . !
150 if isinstance(xpath, compat_str):
151 xpath = xpath.encode('ascii')
152
153 for f in node.findall(xpath):
154 if f.attrib.get(key) == val:
155 return f
156 return None
157
158 # On python2.6 the xml.etree.ElementTree.Element methods don't support
159 # the namespace parameter
160
161
162 def xpath_with_ns(path, ns_map):
163 components = [c.split(':') for c in path.split('/')]
164 replaced = []
165 for c in components:
166 if len(c) == 1:
167 replaced.append(c[0])
168 else:
169 ns, tag = c
170 replaced.append('{%s}%s' % (ns_map[ns], tag))
171 return '/'.join(replaced)
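# Illustrative usage (example added for clarity; the TTML namespace URI is
# the one used elsewhere in this module):
#     xpath_with_ns('.//ttml:p', {'ttml': 'http://www.w3.org/ns/ttml'})
#     # -> './/{http://www.w3.org/ns/ttml}p'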
172
173
174 def xpath_text(node, xpath, name=None, fatal=False):
175 if sys.version_info < (2, 7): # Crazy 2.6
176 xpath = xpath.encode('ascii')
177
178 n = node.find(xpath)
179 if n is None or n.text is None:
180 if fatal:
181 name = xpath if name is None else name
182 raise ExtractorError('Could not find XML element %s' % name)
183 else:
184 return None
185 return n.text
186
187
188 def get_element_by_id(id, html):
189 """Return the content of the tag with the specified ID in the passed HTML document"""
190 return get_element_by_attribute("id", id, html)
191
192
193 def get_element_by_attribute(attribute, value, html):
194 """Return the content of the tag with the specified attribute in the passed HTML document"""
195
196 m = re.search(r'''(?xs)
197 <([a-zA-Z0-9:._-]+)
198 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
199 \s+%s=['"]?%s['"]?
200 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
201 \s*>
202 (?P<content>.*?)
203 </\1>
204 ''' % (re.escape(attribute), re.escape(value)), html)
205
206 if not m:
207 return None
208 res = m.group('content')
209
210 if res.startswith('"') or res.startswith("'"):
211 res = res[1:-1]
212
213 return unescapeHTML(res)
214
215
216 def clean_html(html):
217 """Clean an HTML snippet into a readable string"""
218
219 if html is None: # Convenience for sanitizing descriptions etc.
220 return html
221
222 # Newline vs <br />
223 html = html.replace('\n', ' ')
224 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
225 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
226 # Strip html tags
227 html = re.sub('<.*?>', '', html)
228 # Replace html entities
229 html = unescapeHTML(html)
230 return html.strip()
231
232
233 def sanitize_open(filename, open_mode):
234 """Try to open the given filename, and slightly tweak it if this fails.
235
236 Attempts to open the given filename. If this fails, it tries to change
237 the filename slightly, step by step, until it's either able to open it
238 or it fails and raises a final exception, like the standard open()
239 function.
240
241 It returns the tuple (stream, definitive_file_name).
242 """
243 try:
244 if filename == '-':
245 if sys.platform == 'win32':
246 import msvcrt
247 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
248 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
249 stream = open(encodeFilename(filename), open_mode)
250 return (stream, filename)
251 except (IOError, OSError) as err:
252 if err.errno in (errno.EACCES,):
253 raise
254
255 # In case of error, try to remove win32 forbidden chars
256 alt_filename = sanitize_path(filename)
257 if alt_filename == filename:
258 raise
259 else:
260 # An exception here should be caught in the caller
261 stream = open(encodeFilename(alt_filename), open_mode)
262 return (stream, alt_filename)
263
264
265 def timeconvert(timestr):
266 """Convert RFC 2822 defined time string into system timestamp"""
267 timestamp = None
268 timetuple = email.utils.parsedate_tz(timestr)
269 if timetuple is not None:
270 timestamp = email.utils.mktime_tz(timetuple)
271 return timestamp
272
273
274 def sanitize_filename(s, restricted=False, is_id=False):
275 """Sanitizes a string so it could be used as part of a filename.
276 If restricted is set, use a stricter subset of allowed characters.
277 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
278 """
279 def replace_insane(char):
280 if char == '?' or ord(char) < 32 or ord(char) == 127:
281 return ''
282 elif char == '"':
283 return '' if restricted else '\''
284 elif char == ':':
285 return '_-' if restricted else ' -'
286 elif char in '\\/|*<>':
287 return '_'
288 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
289 return '_'
290 if restricted and ord(char) > 127:
291 return '_'
292 return char
293
294 # Handle timestamps
295 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
296 result = ''.join(map(replace_insane, s))
297 if not is_id:
298 while '__' in result:
299 result = result.replace('__', '_')
300 result = result.strip('_')
301 # Common case of "Foreign band name - English song title"
302 if restricted and result.startswith('-_'):
303 result = result[2:]
304 if result.startswith('-'):
305 result = '_' + result[len('-'):]
306 result = result.lstrip('.')
307 if not result:
308 result = '_'
309 return result
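# Illustrative usage (examples added for clarity; input strings are made up):
#     sanitize_filename('foo/bar: baz?')                   # -> 'foo_bar - baz'
#     sanitize_filename('foo/bar: baz?', restricted=True)  # -> 'foo_bar_-_baz'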
310
311
312 def sanitize_path(s):
313 """Sanitizes and normalizes path on Windows"""
314 if sys.platform != 'win32':
315 return s
316 drive_or_unc, _ = os.path.splitdrive(s)
317 if sys.version_info < (2, 7) and not drive_or_unc:
318 drive_or_unc, _ = os.path.splitunc(s)
319 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
320 if drive_or_unc:
321 norm_path.pop(0)
322 sanitized_path = [
323 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
324 for path_part in norm_path]
325 if drive_or_unc:
326 sanitized_path.insert(0, drive_or_unc + os.path.sep)
327 return os.path.join(*sanitized_path)
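# Illustrative behaviour on Windows (example added for clarity; on other
# platforms the path is returned unchanged):
#     sanitize_path('C:\\foo\\bar?.mp4')  # -> 'C:\\foo\\bar#.mp4'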
328
329
330 def sanitize_url_path_consecutive_slashes(url):
331 """Collapses consecutive slashes in URLs' path"""
332 parsed_url = list(compat_urlparse.urlparse(url))
333 parsed_url[2] = re.sub(r'/{2,}', '/', parsed_url[2])
334 return compat_urlparse.urlunparse(parsed_url)
335
336
337 def orderedSet(iterable):
338 """ Remove all duplicates from the input iterable """
339 res = []
340 for el in iterable:
341 if el not in res:
342 res.append(el)
343 return res
344
345
346 def _htmlentity_transform(entity):
347 """Transforms an HTML entity to a character."""
348 # Known non-numeric HTML entity
349 if entity in compat_html_entities.name2codepoint:
350 return compat_chr(compat_html_entities.name2codepoint[entity])
351
352 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
353 if mobj is not None:
354 numstr = mobj.group(1)
355 if numstr.startswith('x'):
356 base = 16
357 numstr = '0%s' % numstr
358 else:
359 base = 10
360 return compat_chr(int(numstr, base))
361
362 # Unknown entity in name, return its literal representation
363 return ('&%s;' % entity)
364
365
366 def unescapeHTML(s):
367 if s is None:
368 return None
369 assert type(s) == compat_str
370
371 return re.sub(
372 r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
373
374
375 def get_subprocess_encoding():
376 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
377 # For subprocess calls, encode with locale encoding
378 # Refer to http://stackoverflow.com/a/9951851/35070
379 encoding = preferredencoding()
380 else:
381 encoding = sys.getfilesystemencoding()
382 if encoding is None:
383 encoding = 'utf-8'
384 return encoding
385
386
387 def encodeFilename(s, for_subprocess=False):
388 """
389 @param s The name of the file
390 """
391
392 assert type(s) == compat_str
393
394 # Python 3 has a Unicode API
395 if sys.version_info >= (3, 0):
396 return s
397
398 # Pass '' directly to use Unicode APIs on Windows 2000 and up
399 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
400 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
401 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
402 return s
403
404 return s.encode(get_subprocess_encoding(), 'ignore')
405
406
407 def decodeFilename(b, for_subprocess=False):
408
409 if sys.version_info >= (3, 0):
410 return b
411
412 if not isinstance(b, bytes):
413 return b
414
415 return b.decode(get_subprocess_encoding(), 'ignore')
416
417
418 def encodeArgument(s):
419 if not isinstance(s, compat_str):
420 # Legacy code that uses byte strings
421 # Uncomment the following line after fixing all post processors
422 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
423 s = s.decode('ascii')
424 return encodeFilename(s, True)
425
426
427 def decodeArgument(b):
428 return decodeFilename(b, True)
429
430
431 def decodeOption(optval):
432 if optval is None:
433 return optval
434 if isinstance(optval, bytes):
435 optval = optval.decode(preferredencoding())
436
437 assert isinstance(optval, compat_str)
438 return optval
439
440
441 def formatSeconds(secs):
442 if secs > 3600:
443 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
444 elif secs > 60:
445 return '%d:%02d' % (secs // 60, secs % 60)
446 else:
447 return '%d' % secs
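# Illustrative usage (examples added for clarity):
#     formatSeconds(3661)  # -> '1:01:01'
#     formatSeconds(75)    # -> '1:15'
#     formatSeconds(42)    # -> '42'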
448
449
450 def make_HTTPS_handler(params, **kwargs):
451 opts_no_check_certificate = params.get('nocheckcertificate', False)
452 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
453 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
454 if opts_no_check_certificate:
455 context.check_hostname = False
456 context.verify_mode = ssl.CERT_NONE
457 try:
458 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
459 except TypeError:
460 # Python 2.7.8
461 # (create_default_context present but HTTPSHandler has no context=)
462 pass
463
464 if sys.version_info < (3, 2):
465 return YoutubeDLHTTPSHandler(params, **kwargs)
466 else: # Python < 3.4
467 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
468 context.verify_mode = (ssl.CERT_NONE
469 if opts_no_check_certificate
470 else ssl.CERT_REQUIRED)
471 context.set_default_verify_paths()
472 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
473
474
475 def bug_reports_message():
476 if ytdl_is_updateable():
477 update_cmd = 'type youtube-dl -U to update'
478 else:
479 update_cmd = 'see https://yt-dl.org/update on how to update'
480 msg = '; please report this issue on https://yt-dl.org/bug .'
481 msg += ' Make sure you are using the latest version; %s.' % update_cmd
482 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
483 return msg
484
485
486 class ExtractorError(Exception):
487 """Error during info extraction."""
488
489 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
490 """ tb, if given, is the original traceback (so that it can be printed out).
491 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
492 """
493
494 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
495 expected = True
496 if video_id is not None:
497 msg = video_id + ': ' + msg
498 if cause:
499 msg += ' (caused by %r)' % cause
500 if not expected:
501 msg += bug_reports_message()
502 super(ExtractorError, self).__init__(msg)
503
504 self.traceback = tb
505 self.exc_info = sys.exc_info() # preserve original exception
506 self.cause = cause
507 self.video_id = video_id
508
509 def format_traceback(self):
510 if self.traceback is None:
511 return None
512 return ''.join(traceback.format_tb(self.traceback))
513
514
515 class UnsupportedError(ExtractorError):
516 def __init__(self, url):
517 super(UnsupportedError, self).__init__(
518 'Unsupported URL: %s' % url, expected=True)
519 self.url = url
520
521
522 class RegexNotFoundError(ExtractorError):
523 """Error when a regex didn't match"""
524 pass
525
526
527 class DownloadError(Exception):
528 """Download Error exception.
529
530 This exception may be thrown by FileDownloader objects if they are not
531 configured to continue on errors. They will contain the appropriate
532 error message.
533 """
534
535 def __init__(self, msg, exc_info=None):
536 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
537 super(DownloadError, self).__init__(msg)
538 self.exc_info = exc_info
539
540
541 class SameFileError(Exception):
542 """Same File exception.
543
544 This exception will be thrown by FileDownloader objects if they detect
545 multiple files would have to be downloaded to the same file on disk.
546 """
547 pass
548
549
550 class PostProcessingError(Exception):
551 """Post Processing exception.
552
553 This exception may be raised by PostProcessor's .run() method to
554 indicate an error in the postprocessing task.
555 """
556
557 def __init__(self, msg):
558 self.msg = msg
559
560
561 class MaxDownloadsReached(Exception):
562 """ --max-downloads limit has been reached. """
563 pass
564
565
566 class UnavailableVideoError(Exception):
567 """Unavailable Format exception.
568
569 This exception will be thrown when a video is requested
570 in a format that is not available for that video.
571 """
572 pass
573
574
575 class ContentTooShortError(Exception):
576 """Content Too Short exception.
577
578 This exception may be raised by FileDownloader objects when a file they
579 download is too small for what the server announced first, indicating
580 the connection was probably interrupted.
581 """
582 # Both in bytes
583 downloaded = None
584 expected = None
585
586 def __init__(self, downloaded, expected):
587 self.downloaded = downloaded
588 self.expected = expected
589
590
591 def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
592 hc = http_class(*args, **kwargs)
593 source_address = ydl_handler._params.get('source_address')
594 if source_address is not None:
595 sa = (source_address, 0)
596 if hasattr(hc, 'source_address'): # Python 2.7+
597 hc.source_address = sa
598 else: # Python 2.6
599 def _hc_connect(self, *args, **kwargs):
600 sock = compat_socket_create_connection(
601 (self.host, self.port), self.timeout, sa)
602 if is_https:
603 self.sock = ssl.wrap_socket(
604 sock, self.key_file, self.cert_file,
605 ssl_version=ssl.PROTOCOL_TLSv1)
606 else:
607 self.sock = sock
608 hc.connect = functools.partial(_hc_connect, hc)
609
610 return hc
611
612
613 class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
614 """Handler for HTTP requests and responses.
615
616 This class, when installed with an OpenerDirector, automatically adds
617 the standard headers to every HTTP request and handles gzipped and
618 deflated responses from web servers. If compression is to be avoided in
619 a particular request, the original request in the program code only has
620 to include the HTTP header "Youtubedl-No-Compression", which will be
621 removed before making the real request.
622
623 Part of this code was copied from:
624
625 http://techknack.net/python-urllib2-handlers/
626
627 Andrew Rowls, the author of that code, agreed to release it to the
628 public domain.
629 """
630
631 def __init__(self, params, *args, **kwargs):
632 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
633 self._params = params
634
635 def http_open(self, req):
636 return self.do_open(functools.partial(
637 _create_http_connection, self, compat_http_client.HTTPConnection, False),
638 req)
639
640 @staticmethod
641 def deflate(data):
642 try:
643 return zlib.decompress(data, -zlib.MAX_WBITS)
644 except zlib.error:
645 return zlib.decompress(data)
646
647 @staticmethod
648 def addinfourl_wrapper(stream, headers, url, code):
649 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
650 return compat_urllib_request.addinfourl(stream, headers, url, code)
651 ret = compat_urllib_request.addinfourl(stream, headers, url)
652 ret.code = code
653 return ret
654
655 def http_request(self, req):
656 for h, v in std_headers.items():
657 # Capitalization is needed because of Python bug 2275: http://bugs.python.org/issue2275
658 # (urllib capitalizes the dict keys because of this bug)
659 if h.capitalize() not in req.headers:
660 req.add_header(h, v)
661 if 'Youtubedl-no-compression' in req.headers:
662 if 'Accept-encoding' in req.headers:
663 del req.headers['Accept-encoding']
664 del req.headers['Youtubedl-no-compression']
665
666 if sys.version_info < (2, 7) and '#' in req.get_full_url():
667 # Python 2.6 is brain-dead when it comes to fragments
668 req._Request__original = req._Request__original.partition('#')[0]
669 req._Request__r_type = req._Request__r_type.partition('#')[0]
670
671 return req
672
673 def http_response(self, req, resp):
674 old_resp = resp
675 # gzip
676 if resp.headers.get('Content-encoding', '') == 'gzip':
677 content = resp.read()
678 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
679 try:
680 uncompressed = io.BytesIO(gz.read())
681 except IOError as original_ioerror:
682 # There may be junk at the end of the file
683 # See http://stackoverflow.com/q/4928560/35070 for details
684 for i in range(1, 1024):
685 try:
686 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
687 uncompressed = io.BytesIO(gz.read())
688 except IOError:
689 continue
690 break
691 else:
692 raise original_ioerror
693 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
694 resp.msg = old_resp.msg
695 # deflate
696 if resp.headers.get('Content-encoding', '') == 'deflate':
697 gz = io.BytesIO(self.deflate(resp.read()))
698 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
699 resp.msg = old_resp.msg
700 return resp
701
702 https_request = http_request
703 https_response = http_response
704
705
706 class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
707 def __init__(self, params, https_conn_class=None, *args, **kwargs):
708 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
709 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
710 self._params = params
711
712 def https_open(self, req):
713 kwargs = {}
714 if hasattr(self, '_context'): # python > 2.6
715 kwargs['context'] = self._context
716 if hasattr(self, '_check_hostname'): # python 3.x
717 kwargs['check_hostname'] = self._check_hostname
718 return self.do_open(functools.partial(
719 _create_http_connection, self, self._https_conn_class, True),
720 req, **kwargs)
721
722
723 def parse_iso8601(date_str, delimiter='T', timezone=None):
724 """ Return a UNIX timestamp from the given date """
725
726 if date_str is None:
727 return None
728
729 if timezone is None:
730 m = re.search(
731 r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
732 date_str)
733 if not m:
734 timezone = datetime.timedelta()
735 else:
736 date_str = date_str[:-len(m.group(0))]
737 if not m.group('sign'):
738 timezone = datetime.timedelta()
739 else:
740 sign = 1 if m.group('sign') == '+' else -1
741 timezone = datetime.timedelta(
742 hours=sign * int(m.group('hours')),
743 minutes=sign * int(m.group('minutes')))
744 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
745 dt = datetime.datetime.strptime(date_str, date_format) - timezone
746 return calendar.timegm(dt.timetuple())
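# Illustrative usage (example added for clarity; the date is made up):
#     parse_iso8601('2015-03-25T12:00:00+01:00')  # -> 1427281200
# i.e. 2015-03-25 11:00:00 UTC expressed as a UNIX timestamp.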
747
748
749 def unified_strdate(date_str, day_first=True):
750 """Return a string with the date in the format YYYYMMDD"""
751
752 if date_str is None:
753 return None
754 upload_date = None
755 # Replace commas
756 date_str = date_str.replace(',', ' ')
757 # %z (UTC offset) is only supported in python>=3.2
758 if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
759 date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
760 # Remove AM/PM + timezone
761 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
762
763 format_expressions = [
764 '%d %B %Y',
765 '%d %b %Y',
766 '%B %d %Y',
767 '%b %d %Y',
768 '%b %dst %Y %I:%M%p',
769 '%b %dnd %Y %I:%M%p',
770 '%b %dth %Y %I:%M%p',
771 '%Y %m %d',
772 '%Y-%m-%d',
773 '%Y/%m/%d',
774 '%Y/%m/%d %H:%M:%S',
775 '%Y-%m-%d %H:%M:%S',
776 '%Y-%m-%d %H:%M:%S.%f',
777 '%d.%m.%Y %H:%M',
778 '%d.%m.%Y %H.%M',
779 '%Y-%m-%dT%H:%M:%SZ',
780 '%Y-%m-%dT%H:%M:%S.%fZ',
781 '%Y-%m-%dT%H:%M:%S.%f0Z',
782 '%Y-%m-%dT%H:%M:%S',
783 '%Y-%m-%dT%H:%M:%S.%f',
784 '%Y-%m-%dT%H:%M',
785 ]
786 if day_first:
787 format_expressions.extend([
788 '%d-%m-%Y',
789 '%d.%m.%Y',
790 '%d/%m/%Y',
791 '%d/%m/%y',
792 '%d/%m/%Y %H:%M:%S',
793 ])
794 else:
795 format_expressions.extend([
796 '%m-%d-%Y',
797 '%m.%d.%Y',
798 '%m/%d/%Y',
799 '%m/%d/%y',
800 '%m/%d/%Y %H:%M:%S',
801 ])
802 for expression in format_expressions:
803 try:
804 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
805 except ValueError:
806 pass
807 if upload_date is None:
808 timetuple = email.utils.parsedate_tz(date_str)
809 if timetuple:
810 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
811 return upload_date
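# Illustrative usage (examples added for clarity; dates are made up):
#     unified_strdate('December 21, 2014')            # -> '20141221'
#     unified_strdate('11/12/2014')                   # -> '20141211' (day first)
#     unified_strdate('11/12/2014', day_first=False)  # -> '20141112'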
812
813
814 def determine_ext(url, default_ext='unknown_video'):
815 if url is None:
816 return default_ext
817 guess = url.partition('?')[0].rpartition('.')[2]
818 if re.match(r'^[A-Za-z0-9]+$', guess):
819 return guess
820 else:
821 return default_ext
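# Illustrative usage (examples added for clarity; URLs are made up):
#     determine_ext('http://example.com/video.mp4?start=10')  # -> 'mp4'
#     determine_ext('http://example.com/watch?v=abc123')      # -> 'unknown_video'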
822
823
824 def subtitles_filename(filename, sub_lang, sub_format):
825 return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
826
827
828 def date_from_str(date_str):
829 """
830 Return a datetime object from a string in the format YYYYMMDD or
831 (now|today)[+-][0-9](day|week|month|year)(s)?"""
832 today = datetime.date.today()
833 if date_str in ('now', 'today'):
834 return today
835 if date_str == 'yesterday':
836 return today - datetime.timedelta(days=1)
837 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
838 if match is not None:
839 sign = match.group('sign')
840 time = int(match.group('time'))
841 if sign == '-':
842 time = -time
843 unit = match.group('unit')
844 # A bad approximation?
845 if unit == 'month':
846 unit = 'day'
847 time *= 30
848 elif unit == 'year':
849 unit = 'day'
850 time *= 365
851 unit += 's'
852 delta = datetime.timedelta(**{unit: time})
853 return today + delta
854 return datetime.datetime.strptime(date_str, "%Y%m%d").date()
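# Illustrative usage (examples added for clarity):
#     date_from_str('20150101')   # -> datetime.date(2015, 1, 1)
#     date_from_str('now-1week')  # -> today's date minus seven days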
855
856
857 def hyphenate_date(date_str):
858 """
859 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
860 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
861 if match is not None:
862 return '-'.join(match.groups())
863 else:
864 return date_str
865
866
867 class DateRange(object):
868 """Represents a time interval between two dates"""
869
870 def __init__(self, start=None, end=None):
871 """start and end must be strings in the format accepted by date"""
872 if start is not None:
873 self.start = date_from_str(start)
874 else:
875 self.start = datetime.datetime.min.date()
876 if end is not None:
877 self.end = date_from_str(end)
878 else:
879 self.end = datetime.datetime.max.date()
880 if self.start > self.end:
881 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
882
883 @classmethod
884 def day(cls, day):
885 """Returns a range that only contains the given day"""
886 return cls(day, day)
887
888 def __contains__(self, date):
889 """Check if the date is in the range"""
890 if not isinstance(date, datetime.date):
891 date = date_from_str(date)
892 return self.start <= date <= self.end
893
894 def __str__(self):
895 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
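# Illustrative usage (example added for clarity; dates are made up):
#     '20150115' in DateRange('20150101', '20150131')  # -> True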
896
897
898 def platform_name():
899 """ Returns the platform name as a compat_str """
900 res = platform.platform()
901 if isinstance(res, bytes):
902 res = res.decode(preferredencoding())
903
904 assert isinstance(res, compat_str)
905 return res
906
907
908 def _windows_write_string(s, out):
909 """ Returns True if the string was written using special methods,
910 False if it has yet to be written out."""
911 # Adapted from http://stackoverflow.com/a/3259271/35070
912
913 import ctypes
914 import ctypes.wintypes
915
916 WIN_OUTPUT_IDS = {
917 1: -11,
918 2: -12,
919 }
920
921 try:
922 fileno = out.fileno()
923 except AttributeError:
924 # If the output stream doesn't have a fileno, it's virtual
925 return False
926 except io.UnsupportedOperation:
927 # Some strange Windows pseudo files?
928 return False
929 if fileno not in WIN_OUTPUT_IDS:
930 return False
931
932 GetStdHandle = ctypes.WINFUNCTYPE(
933 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
934 (b"GetStdHandle", ctypes.windll.kernel32))
935 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
936
937 WriteConsoleW = ctypes.WINFUNCTYPE(
938 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
939 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
940 ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
941 written = ctypes.wintypes.DWORD(0)
942
943 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
944 FILE_TYPE_CHAR = 0x0002
945 FILE_TYPE_REMOTE = 0x8000
946 GetConsoleMode = ctypes.WINFUNCTYPE(
947 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
948 ctypes.POINTER(ctypes.wintypes.DWORD))(
949 (b"GetConsoleMode", ctypes.windll.kernel32))
950 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
951
952 def not_a_console(handle):
953 if handle == INVALID_HANDLE_VALUE or handle is None:
954 return True
955 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
956 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
957
958 if not_a_console(h):
959 return False
960
961 def next_nonbmp_pos(s):
962 try:
963 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
964 except StopIteration:
965 return len(s)
966
967 while s:
968 count = min(next_nonbmp_pos(s), 1024)
969
970 ret = WriteConsoleW(
971 h, s, count if count else 2, ctypes.byref(written), None)
972 if ret == 0:
973 raise OSError('Failed to write string')
974 if not count: # We just wrote a non-BMP character
975 assert written.value == 2
976 s = s[1:]
977 else:
978 assert written.value > 0
979 s = s[written.value:]
980 return True
981
982
983 def write_string(s, out=None, encoding=None):
984 if out is None:
985 out = sys.stderr
986 assert type(s) == compat_str
987
988 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
989 if _windows_write_string(s, out):
990 return
991
992 if ('b' in getattr(out, 'mode', '') or
993 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
994 byt = s.encode(encoding or preferredencoding(), 'ignore')
995 out.write(byt)
996 elif hasattr(out, 'buffer'):
997 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
998 byt = s.encode(enc, 'ignore')
999 out.buffer.write(byt)
1000 else:
1001 out.write(s)
1002 out.flush()
1003
1004
1005 def bytes_to_intlist(bs):
1006 if not bs:
1007 return []
1008 if isinstance(bs[0], int): # Python 3
1009 return list(bs)
1010 else:
1011 return [ord(c) for c in bs]
1012
1013
1014 def intlist_to_bytes(xs):
1015 if not xs:
1016 return b''
1017 return struct_pack('%dB' % len(xs), *xs)
1018
1019
1020 # Cross-platform file locking
1021 if sys.platform == 'win32':
1022 import ctypes.wintypes
1023 import msvcrt
1024
1025 class OVERLAPPED(ctypes.Structure):
1026 _fields_ = [
1027 ('Internal', ctypes.wintypes.LPVOID),
1028 ('InternalHigh', ctypes.wintypes.LPVOID),
1029 ('Offset', ctypes.wintypes.DWORD),
1030 ('OffsetHigh', ctypes.wintypes.DWORD),
1031 ('hEvent', ctypes.wintypes.HANDLE),
1032 ]
1033
1034 kernel32 = ctypes.windll.kernel32
1035 LockFileEx = kernel32.LockFileEx
1036 LockFileEx.argtypes = [
1037 ctypes.wintypes.HANDLE, # hFile
1038 ctypes.wintypes.DWORD, # dwFlags
1039 ctypes.wintypes.DWORD, # dwReserved
1040 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1041 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1042 ctypes.POINTER(OVERLAPPED) # Overlapped
1043 ]
1044 LockFileEx.restype = ctypes.wintypes.BOOL
1045 UnlockFileEx = kernel32.UnlockFileEx
1046 UnlockFileEx.argtypes = [
1047 ctypes.wintypes.HANDLE, # hFile
1048 ctypes.wintypes.DWORD, # dwReserved
1049 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1050 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1051 ctypes.POINTER(OVERLAPPED) # Overlapped
1052 ]
1053 UnlockFileEx.restype = ctypes.wintypes.BOOL
1054 whole_low = 0xffffffff
1055 whole_high = 0x7fffffff
1056
1057 def _lock_file(f, exclusive):
1058 overlapped = OVERLAPPED()
1059 overlapped.Offset = 0
1060 overlapped.OffsetHigh = 0
1061 overlapped.hEvent = 0
1062 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1063 handle = msvcrt.get_osfhandle(f.fileno())
1064 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1065 whole_low, whole_high, f._lock_file_overlapped_p):
1066 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1067
1068 def _unlock_file(f):
1069 assert f._lock_file_overlapped_p
1070 handle = msvcrt.get_osfhandle(f.fileno())
1071 if not UnlockFileEx(handle, 0,
1072 whole_low, whole_high, f._lock_file_overlapped_p):
1073 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1074
1075 else:
1076 import fcntl
1077
1078 def _lock_file(f, exclusive):
1079 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
1080
1081 def _unlock_file(f):
1082 fcntl.flock(f, fcntl.LOCK_UN)
1083
1084
1085 class locked_file(object):
1086 def __init__(self, filename, mode, encoding=None):
1087 assert mode in ['r', 'a', 'w']
1088 self.f = io.open(filename, mode, encoding=encoding)
1089 self.mode = mode
1090
1091 def __enter__(self):
1092 exclusive = self.mode != 'r'
1093 try:
1094 _lock_file(self.f, exclusive)
1095 except IOError:
1096 self.f.close()
1097 raise
1098 return self
1099
1100 def __exit__(self, etype, value, traceback):
1101 try:
1102 _unlock_file(self.f)
1103 finally:
1104 self.f.close()
1105
1106 def __iter__(self):
1107 return iter(self.f)
1108
1109 def write(self, *args):
1110 return self.f.write(*args)
1111
1112 def read(self, *args):
1113 return self.f.read(*args)
1114
1115
1116 def get_filesystem_encoding():
1117 encoding = sys.getfilesystemencoding()
1118 return encoding if encoding is not None else 'utf-8'
1119
1120
1121 def shell_quote(args):
1122 quoted_args = []
1123 encoding = get_filesystem_encoding()
1124 for a in args:
1125 if isinstance(a, bytes):
1126 # We may get a filename encoded with 'encodeFilename'
1127 a = a.decode(encoding)
1128 quoted_args.append(pipes.quote(a))
1129 return ' '.join(quoted_args)
1130
1131
1132 def smuggle_url(url, data):
1133 """ Pass additional data in a URL for internal use. """
1134
1135 sdata = compat_urllib_parse.urlencode(
1136 {'__youtubedl_smuggle': json.dumps(data)})
1137 return url + '#' + sdata
1138
1139
1140 def unsmuggle_url(smug_url, default=None):
1141 if '#__youtubedl_smuggle' not in smug_url:
1142 return smug_url, default
1143 url, _, sdata = smug_url.rpartition('#')
1144 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
1145 data = json.loads(jsond)
1146 return url, data
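# Illustrative round trip (example added for clarity; the payload dict is
# arbitrary made-up data):
#     url = smuggle_url('http://example.com/video', {'force': True})
#     unsmuggle_url(url)  # -> ('http://example.com/video', {'force': True})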
1147
1148
1149 def format_bytes(bytes):
1150 if bytes is None:
1151 return 'N/A'
1152 if type(bytes) is str:
1153 bytes = float(bytes)
1154 if bytes == 0.0:
1155 exponent = 0
1156 else:
1157 exponent = int(math.log(bytes, 1024.0))
1158 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
1159 converted = float(bytes) / float(1024 ** exponent)
1160 return '%.2f%s' % (converted, suffix)
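# Illustrative usage (examples added for clarity):
#     format_bytes(1536)  # -> '1.50KiB'
#     format_bytes(None)  # -> 'N/A'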
1161
1162
1163 def parse_filesize(s):
1164 if s is None:
1165 return None
1166
1167 # The lower-case forms are of course incorrect and unofficial,
1168 # but we support those too
1169 _UNIT_TABLE = {
1170 'B': 1,
1171 'b': 1,
1172 'KiB': 1024,
1173 'KB': 1000,
1174 'kB': 1024,
1175 'Kb': 1000,
1176 'MiB': 1024 ** 2,
1177 'MB': 1000 ** 2,
1178 'mB': 1024 ** 2,
1179 'Mb': 1000 ** 2,
1180 'GiB': 1024 ** 3,
1181 'GB': 1000 ** 3,
1182 'gB': 1024 ** 3,
1183 'Gb': 1000 ** 3,
1184 'TiB': 1024 ** 4,
1185 'TB': 1000 ** 4,
1186 'tB': 1024 ** 4,
1187 'Tb': 1000 ** 4,
1188 'PiB': 1024 ** 5,
1189 'PB': 1000 ** 5,
1190 'pB': 1024 ** 5,
1191 'Pb': 1000 ** 5,
1192 'EiB': 1024 ** 6,
1193 'EB': 1000 ** 6,
1194 'eB': 1024 ** 6,
1195 'Eb': 1000 ** 6,
1196 'ZiB': 1024 ** 7,
1197 'ZB': 1000 ** 7,
1198 'zB': 1024 ** 7,
1199 'Zb': 1000 ** 7,
1200 'YiB': 1024 ** 8,
1201 'YB': 1000 ** 8,
1202 'yB': 1024 ** 8,
1203 'Yb': 1000 ** 8,
1204 }
1205
1206 units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
1207 m = re.match(
1208 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
1209 if not m:
1210 return None
1211
1212 num_str = m.group('num').replace(',', '.')
1213 mult = _UNIT_TABLE[m.group('unit')]
1214 return int(float(num_str) * mult)
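# Illustrative usage (examples added for clarity):
#     parse_filesize('5 MiB')   # -> 5242880
#     parse_filesize('500 KB')  # -> 500000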
1215
1216
1217 def month_by_name(name):
1218 """ Return the number of a month by (locale-independently) English name """
1219
1220 try:
1221 return ENGLISH_MONTH_NAMES.index(name) + 1
1222 except ValueError:
1223 return None
1224
1225
1226 def month_by_abbreviation(abbrev):
1227 """ Return the number of a month by (locale-independently) English
1228 abbreviations """
1229
1230 try:
1231 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
1232 except ValueError:
1233 return None
1234
1235
1236 def fix_xml_ampersands(xml_str):
1237 """Replace all the '&' by '&amp;' in XML"""
1238 return re.sub(
1239 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
1240 '&amp;',
1241 xml_str)
1242
1243
1244 def setproctitle(title):
1245 assert isinstance(title, compat_str)
1246 try:
1247 libc = ctypes.cdll.LoadLibrary("libc.so.6")
1248 except OSError:
1249 return
1250 title_bytes = title.encode('utf-8')
1251 buf = ctypes.create_string_buffer(len(title_bytes))
1252 buf.value = title_bytes
1253 try:
1254 libc.prctl(15, buf, 0, 0, 0)
1255 except AttributeError:
1256 return # Strange libc, just skip this
1257
1258
1259 def remove_start(s, start):
1260 if s.startswith(start):
1261 return s[len(start):]
1262 return s
1263
1264
1265 def remove_end(s, end):
1266 if s.endswith(end):
1267 return s[:-len(end)]
1268 return s
1269
1270
1271 def url_basename(url):
1272 path = compat_urlparse.urlparse(url).path
1273 return path.strip('/').split('/')[-1]
1274
1275
1276 class HEADRequest(compat_urllib_request.Request):
1277 def get_method(self):
1278 return "HEAD"
1279
1280
1281 def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
1282 if get_attr:
1283 if v is not None:
1284 v = getattr(v, get_attr, None)
1285 if v == '':
1286 v = None
1287 return default if v is None else (int(v) * invscale // scale)
1288
1289
1290 def str_or_none(v, default=None):
1291 return default if v is None else compat_str(v)
1292
1293
1294 def str_to_int(int_str):
1295 """ A more relaxed version of int_or_none """
1296 if int_str is None:
1297 return None
1298 int_str = re.sub(r'[,\.\+]', '', int_str)
1299 return int(int_str)
1300
1301
1302 def float_or_none(v, scale=1, invscale=1, default=None):
1303 return default if v is None else (float(v) * invscale / scale)
1304
1305
1306 def parse_duration(s):
1307 if not isinstance(s, compat_basestring):
1308 return None
1309
1310 s = s.strip()
1311
1312 m = re.match(
1313 r'''(?ix)(?:P?T)?
1314 (?:
1315 (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
1316 (?P<only_hours>[0-9.]+)\s*(?:hours?)|
1317
1318 \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
1319 (?:
1320 (?:
1321 (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
1322 (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
1323 )?
1324 (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
1325 )?
1326 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
1327 )$''', s)
1328 if not m:
1329 return None
1330 res = 0
1331 if m.group('only_mins'):
1332 return float_or_none(m.group('only_mins'), invscale=60)
1333 if m.group('only_hours'):
1334 return float_or_none(m.group('only_hours'), invscale=60 * 60)
1335 if m.group('secs'):
1336 res += int(m.group('secs'))
1337 if m.group('mins_reversed'):
1338 res += int(m.group('mins_reversed')) * 60
1339 if m.group('mins'):
1340 res += int(m.group('mins')) * 60
1341 if m.group('hours'):
1342 res += int(m.group('hours')) * 60 * 60
1343 if m.group('hours_reversed'):
1344 res += int(m.group('hours_reversed')) * 60 * 60
1345 if m.group('days'):
1346 res += int(m.group('days')) * 24 * 60 * 60
1347 if m.group('ms'):
1348 res += float(m.group('ms'))
1349 return res
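# Illustrative usage (examples added for clarity):
#     parse_duration('1:30:45')  # -> 5445
#     parse_duration('3 min')    # -> 180.0
#     parse_duration('PT2H30M')  # -> 9000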
1350
1351
1352 def prepend_extension(filename, ext, expected_real_ext=None):
1353 name, real_ext = os.path.splitext(filename)
1354 return (
1355 '{0}.{1}{2}'.format(name, ext, real_ext)
1356 if not expected_real_ext or real_ext[1:] == expected_real_ext
1357 else '{0}.{1}'.format(filename, ext))
1358
1359
1360 def replace_extension(filename, ext, expected_real_ext=None):
1361 name, real_ext = os.path.splitext(filename)
1362 return '{0}.{1}'.format(
1363 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1364 ext)
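# Illustrative usage (examples added for clarity; file names are made up):
#     prepend_extension('video.mp4', 'temp')  # -> 'video.temp.mp4'
#     replace_extension('video.mp4', 'mkv')   # -> 'video.mkv'
#     replace_extension('video.unknown_video', 'mp4', expected_real_ext='webm')
#     # -> 'video.unknown_video.mp4' (only replaced when the real ext matches)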
1365
1366
1367 def check_executable(exe, args=[]):
1368 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1369 args can be a list of arguments for a short output (like -version) """
1370 try:
1371 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1372 except OSError:
1373 return False
1374 return exe
1375
1376
1377 def get_exe_version(exe, args=['--version'],
1378 version_re=None, unrecognized='present'):
1379 """ Returns the version of the specified executable,
1380 or False if the executable is not present """
1381 try:
1382 out, _ = subprocess.Popen(
1383 [exe] + args,
1384 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1385 except OSError:
1386 return False
1387 if isinstance(out, bytes): # Python 2.x
1388 out = out.decode('ascii', 'ignore')
1389 return detect_exe_version(out, version_re, unrecognized)
1390
1391
1392 def detect_exe_version(output, version_re=None, unrecognized='present'):
1393 assert isinstance(output, compat_str)
1394 if version_re is None:
1395 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1396 m = re.search(version_re, output)
1397 if m:
1398 return m.group(1)
1399 else:
1400 return unrecognized
1401
1402
1403 class PagedList(object):
1404 def __len__(self):
1405 # This is only useful for tests
1406 return len(self.getslice())
1407
1408
1409 class OnDemandPagedList(PagedList):
1410 def __init__(self, pagefunc, pagesize):
1411 self._pagefunc = pagefunc
1412 self._pagesize = pagesize
1413
1414 def getslice(self, start=0, end=None):
1415 res = []
1416 for pagenum in itertools.count(start // self._pagesize):
1417 firstid = pagenum * self._pagesize
1418 nextfirstid = pagenum * self._pagesize + self._pagesize
1419 if start >= nextfirstid:
1420 continue
1421
1422 page_results = list(self._pagefunc(pagenum))
1423
1424 startv = (
1425 start % self._pagesize
1426 if firstid <= start < nextfirstid
1427 else 0)
1428
1429 endv = (
1430 ((end - 1) % self._pagesize) + 1
1431 if (end is not None and firstid <= end <= nextfirstid)
1432 else None)
1433
1434 if startv != 0 or endv is not None:
1435 page_results = page_results[startv:endv]
1436 res.extend(page_results)
1437
1438 # A little optimization - if the current page is not "full", i.e. does
1439 # not contain page_size videos, then we can assume that this page
1440 # is the last one - there are no more ids on further pages -
1441 # so there is no need to query again.
1442 if len(page_results) + startv < self._pagesize:
1443 break
1444
1445 # If we got the whole page, but the next page is not interesting,
1446 # break out early as well
1447 if end == nextfirstid:
1448 break
1449 return res
1450
1451
1452 class InAdvancePagedList(PagedList):
1453 def __init__(self, pagefunc, pagecount, pagesize):
1454 self._pagefunc = pagefunc
1455 self._pagecount = pagecount
1456 self._pagesize = pagesize
1457
1458 def getslice(self, start=0, end=None):
1459 res = []
1460 start_page = start // self._pagesize
1461 end_page = (
1462 self._pagecount if end is None else (end // self._pagesize + 1))
1463 skip_elems = start - start_page * self._pagesize
1464 only_more = None if end is None else end - start
1465 for pagenum in range(start_page, end_page):
1466 page = list(self._pagefunc(pagenum))
1467 if skip_elems:
1468 page = page[skip_elems:]
1469 skip_elems = None
1470 if only_more is not None:
1471 if len(page) < only_more:
1472 only_more -= len(page)
1473 else:
1474 page = page[:only_more]
1475 res.extend(page)
1476 break
1477 res.extend(page)
1478 return res
1479
1480
1481 def uppercase_escape(s):
1482 unicode_escape = codecs.getdecoder('unicode_escape')
1483 return re.sub(
1484 r'\\U[0-9a-fA-F]{8}',
1485 lambda m: unicode_escape(m.group(0))[0],
1486 s)
1487
1488
1489 def escape_rfc3986(s):
1490 """Escape non-ASCII characters as suggested by RFC 3986"""
1491 if sys.version_info < (3, 0) and isinstance(s, compat_str):
1492 s = s.encode('utf-8')
1493 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1494
1495
1496 def escape_url(url):
1497 """Escape URL as suggested by RFC 3986"""
1498 url_parsed = compat_urllib_parse_urlparse(url)
1499 return url_parsed._replace(
1500 path=escape_rfc3986(url_parsed.path),
1501 params=escape_rfc3986(url_parsed.params),
1502 query=escape_rfc3986(url_parsed.query),
1503 fragment=escape_rfc3986(url_parsed.fragment)
1504 ).geturl()
1505
1506 try:
1507 struct.pack('!I', 0)
1508 except TypeError:
1509 # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
1510 def struct_pack(spec, *args):
1511 if isinstance(spec, compat_str):
1512 spec = spec.encode('ascii')
1513 return struct.pack(spec, *args)
1514
1515 def struct_unpack(spec, *args):
1516 if isinstance(spec, compat_str):
1517 spec = spec.encode('ascii')
1518 return struct.unpack(spec, *args)
1519 else:
1520 struct_pack = struct.pack
1521 struct_unpack = struct.unpack
1522
1523
1524 def read_batch_urls(batch_fd):
1525 def fixup(url):
1526 if not isinstance(url, compat_str):
1527 url = url.decode('utf-8', 'replace')
1528 BOM_UTF8 = '\xef\xbb\xbf'
1529 if url.startswith(BOM_UTF8):
1530 url = url[len(BOM_UTF8):]
1531 url = url.strip()
1532 if url.startswith(('#', ';', ']')):
1533 return False
1534 return url
1535
1536 with contextlib.closing(batch_fd) as fd:
1537 return [url for url in map(fixup, fd) if url]
1538
1539
1540 def urlencode_postdata(*args, **kargs):
1541 return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
1542
1543
1544 try:
1545 etree_iter = xml.etree.ElementTree.Element.iter
1546 except AttributeError: # Python <=2.6
1547 etree_iter = lambda n: n.findall('.//*')
1548
1549
1550 def parse_xml(s):
1551 class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
1552 def doctype(self, name, pubid, system):
1553 pass # Ignore doctypes
1554
1555 parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
1556 kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
1557 tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
1558 # Fix up XML parser in Python 2.x
1559 if sys.version_info < (3, 0):
1560 for n in etree_iter(tree):
1561 if n.text is not None:
1562 if not isinstance(n.text, compat_str):
1563 n.text = n.text.decode('utf-8')
1564 return tree
1565
1566
1567 US_RATINGS = {
1568 'G': 0,
1569 'PG': 10,
1570 'PG-13': 13,
1571 'R': 16,
1572 'NC': 18,
1573 }
1574
1575
1576 def parse_age_limit(s):
1577 if s is None:
1578 return None
1579 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
1580 return int(m.group('age')) if m else US_RATINGS.get(s, None)
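# Illustrative usage (examples added for clarity):
#     parse_age_limit('18+')    # -> 18
#     parse_age_limit('PG-13')  # -> 13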
1581
1582
1583 def strip_jsonp(code):
1584 return re.sub(
1585 r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1586
1587
1588 def js_to_json(code):
1589 def fix_kv(m):
1590 v = m.group(0)
1591 if v in ('true', 'false', 'null'):
1592 return v
1593 if v.startswith('"'):
1594 return v
1595 if v.startswith("'"):
1596 v = v[1:-1]
1597 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1598 '\\\\': '\\\\',
1599 "\\'": "'",
1600 '"': '\\"',
1601 }[m.group(0)], v)
1602 return '"%s"' % v
1603
1604 res = re.sub(r'''(?x)
1605 "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
1606 '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
1607 [a-zA-Z_][.a-zA-Z_0-9]*
1608 ''', fix_kv, code)
1609 res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
1610 return res
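# Illustrative usage (example added for clarity; the input is made up):
#     js_to_json("{foo: 'bar', baz: [1, 2,]}")
#     # -> '{"foo": "bar", "baz": [1, 2]}'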
1611
1612
1613 def qualities(quality_ids):
1614 """ Get a numeric quality value out of a list of possible values """
1615 def q(qid):
1616 try:
1617 return quality_ids.index(qid)
1618 except ValueError:
1619 return -1
1620 return q
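# Illustrative usage (example added for clarity; the format ids are made up):
#     q = qualities(['240p', '360p', '720p'])
#     q('360p')  # -> 1
#     q('4k')    # -> -1 (unknown qualities sort lowest)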
1621
1622
1623 DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1624
1625
1626 def limit_length(s, length):
1627 """ Add ellipses to overly long strings """
1628 if s is None:
1629 return None
1630 ELLIPSES = '...'
1631 if len(s) > length:
1632 return s[:length - len(ELLIPSES)] + ELLIPSES
1633 return s
1634
1635
1636 def version_tuple(v):
1637 return tuple(int(e) for e in re.split(r'[-.]', v))
1638
1639
1640 def is_outdated_version(version, limit, assume_new=True):
1641 if not version:
1642 return not assume_new
1643 try:
1644 return version_tuple(version) < version_tuple(limit)
1645 except ValueError:
1646 return not assume_new
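# Illustrative usage (examples added for clarity; version strings are made up):
#     version_tuple('2015.04.17')                      # -> (2015, 4, 17)
#     is_outdated_version('2015.01.01', '2015.04.17')  # -> True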
1647
1648
1649 def ytdl_is_updateable():
1650 """ Returns if youtube-dl can be updated with -U """
1651 from zipimport import zipimporter
1652
1653 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
1654
1655
1656 def args_to_str(args):
1657 # Get a short string representation for a subprocess command
1658 return ' '.join(shlex_quote(a) for a in args)
1659
1660
1661 def mimetype2ext(mt):
1662 _, _, res = mt.rpartition('/')
1663
1664 return {
1665 'x-ms-wmv': 'wmv',
1666 'x-mp4-fragmented': 'mp4',
1667 }.get(res, res)
1668
1669
1670 def urlhandle_detect_ext(url_handle):
1671 try:
1672 url_handle.headers
1673 getheader = lambda h: url_handle.headers[h]
1674 except AttributeError: # Python < 3
1675 getheader = url_handle.info().getheader
1676
1677 cd = getheader('Content-Disposition')
1678 if cd:
1679 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1680 if m:
1681 e = determine_ext(m.group('filename'), default_ext=None)
1682 if e:
1683 return e
1684
1685 return mimetype2ext(getheader('Content-Type'))
1686
1687
1688 def age_restricted(content_limit, age_limit):
1689 """ Returns True iff the content should be blocked """
1690
1691 if age_limit is None: # No limit set
1692 return False
1693 if content_limit is None:
1694 return False # Content available for everyone
1695 return age_limit < content_limit
1696
1697
1698 def is_html(first_bytes):
1699 """ Detect whether a file contains HTML by examining its first bytes. """
1700
1701 BOMS = [
1702 (b'\xef\xbb\xbf', 'utf-8'),
1703 (b'\x00\x00\xfe\xff', 'utf-32-be'),
1704 (b'\xff\xfe\x00\x00', 'utf-32-le'),
1705 (b'\xff\xfe', 'utf-16-le'),
1706 (b'\xfe\xff', 'utf-16-be'),
1707 ]
1708 for bom, enc in BOMS:
1709 if first_bytes.startswith(bom):
1710 s = first_bytes[len(bom):].decode(enc, 'replace')
1711 break
1712 else:
1713 s = first_bytes.decode('utf-8', 'replace')
1714
1715 return re.match(r'^\s*<', s)
1716
1717
1718 def determine_protocol(info_dict):
1719 protocol = info_dict.get('protocol')
1720 if protocol is not None:
1721 return protocol
1722
1723 url = info_dict['url']
1724 if url.startswith('rtmp'):
1725 return 'rtmp'
1726 elif url.startswith('mms'):
1727 return 'mms'
1728 elif url.startswith('rtsp'):
1729 return 'rtsp'
1730
1731 ext = determine_ext(url)
1732 if ext == 'm3u8':
1733 return 'm3u8'
1734 elif ext == 'f4m':
1735 return 'f4m'
1736
1737 return compat_urllib_parse_urlparse(url).scheme
1738
1739
1740 def render_table(header_row, data):
1741 """ Render a list of rows, each as a list of values """
1742 table = [header_row] + data
1743 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
1744 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
1745 return '\n'.join(format_str % tuple(row) for row in table)
1746
1747
1748 def _match_one(filter_part, dct):
1749 COMPARISON_OPERATORS = {
1750 '<': operator.lt,
1751 '<=': operator.le,
1752 '>': operator.gt,
1753 '>=': operator.ge,
1754 '=': operator.eq,
1755 '!=': operator.ne,
1756 }
1757 operator_rex = re.compile(r'''(?x)\s*
1758 (?P<key>[a-z_]+)
1759 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
1760 (?:
1761 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
1762 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
1763 )
1764 \s*$
1765 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
1766 m = operator_rex.search(filter_part)
1767 if m:
1768 op = COMPARISON_OPERATORS[m.group('op')]
1769 if m.group('strval') is not None:
1770 if m.group('op') not in ('=', '!='):
1771 raise ValueError(
1772 'Operator %s does not support string values!' % m.group('op'))
1773 comparison_value = m.group('strval')
1774 else:
1775 try:
1776 comparison_value = int(m.group('intval'))
1777 except ValueError:
1778 comparison_value = parse_filesize(m.group('intval'))
1779 if comparison_value is None:
1780 comparison_value = parse_filesize(m.group('intval') + 'B')
1781 if comparison_value is None:
1782 raise ValueError(
1783 'Invalid integer value %r in filter part %r' % (
1784 m.group('intval'), filter_part))
1785 actual_value = dct.get(m.group('key'))
1786 if actual_value is None:
1787 return m.group('none_inclusive')
1788 return op(actual_value, comparison_value)
1789
1790 UNARY_OPERATORS = {
1791 '': lambda v: v is not None,
1792 '!': lambda v: v is None,
1793 }
1794 operator_rex = re.compile(r'''(?x)\s*
1795 (?P<op>%s)\s*(?P<key>[a-z_]+)
1796 \s*$
1797 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
1798 m = operator_rex.search(filter_part)
1799 if m:
1800 op = UNARY_OPERATORS[m.group('op')]
1801 actual_value = dct.get(m.group('key'))
1802 return op(actual_value)
1803
1804 raise ValueError('Invalid filter part %r' % filter_part)
1805
1806
1807 def match_str(filter_str, dct):
1808 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
1809
1810 return all(
1811 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
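# Illustrative usage (example added for clarity; the field values are made up):
#     match_str('duration > 600 & like_count >? 100',
#               {'duration': 700, 'like_count': None})  # -> True
# ('>?' makes a comparison pass when the field is missing or None)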
1812
1813
1814 def match_filter_func(filter_str):
1815 def _match_func(info_dict):
1816 if match_str(filter_str, info_dict):
1817 return None
1818 else:
1819 video_title = info_dict.get('title', info_dict.get('id', 'video'))
1820 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
1821 return _match_func
1822
1823
1824 def parse_dfxp_time_expr(time_expr):
1825 if not time_expr:
1826 return 0.0
1827
1828 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
1829 if mobj:
1830 return float(mobj.group('time_offset'))
1831
1832 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
1833 if mobj:
1834 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
1835
1836
1837 def format_srt_time(seconds):
1838 (mins, secs) = divmod(seconds, 60)
1839 (hours, mins) = divmod(mins, 60)
1840 millisecs = (secs - int(secs)) * 1000
1841 secs = int(secs)
1842 return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millisecs)
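# Illustrative usage of the two helpers above (examples added for clarity):
#     parse_dfxp_time_expr('00:01:30.5')  # -> 90.5
#     format_srt_time(90.5)               # -> '00:01:30,500'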
1843
1844
1845 def dfxp2srt(dfxp_data):
1846 _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})
1847
1848 def parse_node(node):
1849 str_or_empty = functools.partial(str_or_none, default='')
1850
1851 out = str_or_empty(node.text)
1852
1853 for child in node:
1854 if child.tag == _x('ttml:br'):
1855 out += '\n' + str_or_empty(child.tail)
1856 elif child.tag == _x('ttml:span'):
1857 out += str_or_empty(parse_node(child))
1858 else:
1859 out += str_or_empty(xml.etree.ElementTree.tostring(child))
1860
1861 return out
1862
1863 dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
1864 out = []
1865 paras = dfxp.findall(_x('.//ttml:p'))
1866
1867 for para, index in zip(paras, itertools.count(1)):
1868 out.append('%d\n%s --> %s\n%s\n\n' % (
1869 index,
1870 format_srt_time(parse_dfxp_time_expr(para.attrib.get('begin'))),
1871 format_srt_time(parse_dfxp_time_expr(para.attrib.get('end'))),
1872 parse_node(para)))
1873
1874 return ''.join(out)
1875
1876
1877 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
1878 def __init__(self, proxies=None):
1879 # Set default handlers
1880 for type in ('http', 'https'):
1881 setattr(self, '%s_open' % type,
1882 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
1883 meth(r, proxy, type))
1884 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
1885
1886 def proxy_open(self, req, proxy, type):
1887 req_proxy = req.headers.get('Ytdl-request-proxy')
1888 if req_proxy is not None:
1889 proxy = req_proxy
1890 del req.headers['Ytdl-request-proxy']
1891
1892 if proxy == '__noproxy__':
1893 return None # No Proxy
1894 return compat_urllib_request.ProxyHandler.proxy_open(
1895 self, req, proxy, type)