]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
[eporner] Fix duration (Closes #4188)
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import calendar
7 import codecs
8 import contextlib
9 import ctypes
10 import datetime
11 import email.utils
12 import errno
13 import gzip
14 import itertools
15 import io
16 import json
17 import locale
18 import math
19 import os
20 import pipes
21 import platform
22 import re
23 import ssl
24 import socket
25 import struct
26 import subprocess
27 import sys
28 import tempfile
29 import traceback
30 import xml.etree.ElementTree
31 import zlib
32
33 from .compat import (
34 compat_chr,
35 compat_getenv,
36 compat_html_entities,
37 compat_parse_qs,
38 compat_str,
39 compat_urllib_error,
40 compat_urllib_parse,
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
43 compat_urlparse,
44 )
45
46
# The type of compiled regular expression objects.
# This is not clearly defined otherwise, so derive it from a compiled pattern.
compiled_regex_type = type(re.compile(''))

# Default HTTP headers. YoutubeDLHandler.http_request adds these to outgoing
# requests.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
57
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: when the locale
    lookup fails, or the reported codec cannot encode a test string,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Verify that the reported codec exists and is usable
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (locale errors, unknown codecs, ...), but
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        pref = 'UTF-8'

    return pref
71
72
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically

    The JSON is first written to a temporary file created next to fn, which
    is then renamed to fn. If anything fails, the temporary file is removed
    (best effort) and the original error is re-raised.
    """

    if sys.version_info < (3, 0):
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(f).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**args)

    try:
        with tf:
            json.dump(obj, tf)
        os.rename(tf.name, fn)
    except:
        # Clean up the temporary file on any failure, then propagate it
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
117
118
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        matching = [
            el for el in node.findall(xpath)
            if el.attrib.get(key) == val]
        return matching[0] if matching else None
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # Only plain attribute names and simple values may be interpolated
        # into the XPath expression
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        return node.find(u"%s[@%s='%s']" % (xpath, key, val))
137
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
def xpath_with_ns(path, ns_map):
    """Expand every 'prefix:tag' step of path to '{uri}tag' using ns_map.

    Steps without a prefix are kept as they are.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
150
151
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath below node.

    If nothing matches, None is returned, unless fatal is set, in which case
    an ExtractorError mentioning name (or the xpath itself) is raised.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None:
        return found.text
    if not fatal:
        return None
    if name is None:
        name = xpath
    raise ExtractorError('Could not find XML element %s' % name)
164
165
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the passed HTML document"""
    content = get_element_by_attribute("id", id, html)
    return content
169
170
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document

    Returns None if no such tag is found; HTML entities in the content are
    unescaped.
    """
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    found = re.search(pattern, html)
    if found is None:
        return None

    content = found.group('content')
    # Content starting with a quote character loses its first and last char
    if content[:1] in ('"', "'"):
        content = content[1:-1]

    return unescapeHTML(content)
192
193
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Line breaks (<br>) and paragraph boundaries (</p><p>) become newlines,
    all other tags are removed and HTML entities are unescaped. None is
    passed through unchanged, so optional fields can be cleaned directly.
    """
    if html is None:
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
205
206
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    The special filename '-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be fixed by renaming
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        # (os.path.join takes the parts as separate arguments)
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)
        ])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
240
241
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None if the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
249
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    cleaned = u''.join(replace_insane(char) for char in s)
    if is_id:
        # IDs are kept as close to the original as possible
        return cleaned

    # Collapse runs of underscores and trim them from both ends
    while '__' in cleaned:
        cleaned = cleaned.replace('__', '_')
    cleaned = cleaned.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and cleaned.startswith('-_'):
        cleaned = cleaned[2:]
    return cleaned or '_'
281
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list holding the first occurrence of each element, in the
    original order. Elements are compared with ==.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
289
290
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities and decimal
    (#NN) or hexadecimal (#xHH) character references are converted;
    anything else is returned in its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hexadecimal references may contain the digits a-f; the whole entity
    # has to be a number so that malformed references are left untouched
    mobj = re.match(r'#([xX][0-9a-fA-F]+|[0-9]+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            base = 16
            numstr = u'0%s' % numstr
        else:
            base = 10
        return compat_chr(int(numstr, base))

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
309
310
def unescapeHTML(s):
    """Replace the HTML entities ('&...;') in s; None is passed through."""
    if s is None:
        return None
    assert type(s) == compat_str

    def replace_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', replace_entity, s)
318
319
def encodeFilename(s, for_subprocess=False):
    """
    Return the form of the file name s that can be passed to file APIs
    (or, with for_subprocess set, to subprocess calls).

    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        # Pass u'' directly to use Unicode APIs on Windows 2000 and up
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
346
347
def encodeArgument(s):
    """Encode the command line argument s for use in a subprocess call."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)

    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
355
356
def decodeOption(optval):
    """Return the option value optval as text; None is passed through.

    Byte strings are decoded with the preferred encoding.
    """
    if optval is None:
        return None

    decoded = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval
    assert isinstance(decoded, compat_str)
    return decoded
365
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The shortest form that can hold the value is used, so exactly one
    minute is '1:00' and exactly one hour is '1:00:00'.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
373
374
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Create the HTTPS handler suitable for the running Python version.

    opts_no_check_certificate disables certificate verification on the
    Python versions where an SSL context is used (>= 3.2). Additional
    keyword arguments are passed on to the handler class.
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Try TLSv1 first, fall back to SSLv23 negotiation
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
        # SERVER_AUTH is the purpose for client-side sockets that
        # authenticate a server; a CLIENT_AUTH context does not verify the
        # server certificate at all.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
        if opts_no_check_certificate:
            # check_hostname has to be disabled before CERT_NONE can be set
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
414
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is prepended to it.
        """

        # Network problems and unavailable videos being handled right now
        # are never bugs in youtube-dl
        active_exc_type = sys.exc_info()[0]
        if active_exc_type in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += u' (caused by %r)' % cause
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none."""
        if self.traceback is not None:
            return u''.join(traceback.format_tb(self.traceback))
        return None
441
442
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
446
447
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super(DownloadError, self).__init__(msg)
459
460
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
468
469
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to signal an
    error in the postprocessing task. The message is available as .msg.
    """

    def __init__(self, msg):
        self.msg = msg
478
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
482
483
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
491
492
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded file
    is smaller than what the server announced first, indicating that the
    connection was probably interrupted.
    """
    # Number of bytes received and number of bytes announced
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
507
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, either raw or wrapped in a zlib container."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying code, also where the constructor does not accept it."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to req and apply the Youtubedl-* pseudo headers."""
        for h, v in std_headers.items():
            # The request stores its header names capitalized
            # ('User-agent'), so the lookup has to use the same form;
            # otherwise headers set by the caller would be overwritten.
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate encoded body replaced by its decompressed form."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # Dropping trailing bytes did not help either
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic is processed in exactly the same way
    https_request = http_request
    https_response = http_response
593
594
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date

    date_str has the form YYYY-MM-DD<delimiter>HH:MM:SS, optionally followed
    by fractional seconds and 'Z' or a +HH:MM / -HHMM offset. None is
    passed through.
    """

    if date_str is None:
        return None

    tz_match = re.search(
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)

    offset = datetime.timedelta()
    if tz_match:
        # Cut off the fractional seconds and the timezone designator
        date_str = date_str[:-len(tz_match.group(0))]
        tz_sign = tz_match.group('sign')
        if tz_sign:
            factor = 1 if tz_sign == '+' else -1
            offset = datetime.timedelta(
                hours=factor * int(tz_match.group('hours')),
                minutes=factor * int(tz_match.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    local_dt = datetime.datetime.strptime(date_str, date_format)
    return calendar.timegm((local_dt - offset).timetuple())
618
619
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    A number of common date formats are tried; RFC 2822 style dates are
    used as a fallback. None is returned if date_str is None or cannot be
    parsed.
    """

    if date_str is None:
        return None

    upload_date = None
    #Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        # English ordinal suffixes (1st, 2nd, 3rd, 4th)
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %drd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%Y/%m/%d %H:%M:%S',
        '%d/%m/%Y %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Fall back to the RFC 2822 parser
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
667
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url.

    The text after the last '.' in front of the query string is used if it
    is alphanumeric; otherwise (or if url is None) default_ext is returned.
    """
    if url is None:
        return default_ext
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
676
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return u'.'.join((stem, sub_lang, sub_format))
679
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' is not a valid escape in a normal string literal
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        #A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta expects plural keyword names (days, weeks)
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
705
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that do not consist of exactly eight digits are returned as is."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
714
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A missing start or end leaves the range open on that side."""
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if isinstance(date, datetime.date):
            as_date = date
        else:
            as_date = date_from_str(date)
        return self.start <= as_date <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
740
741
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Decode byte strings with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
750
751
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the WriteConsoleW function of kernel32, which is
    only used if out is backed by file descriptor 1 or 2 and the
    corresponding handle looks like a console."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Maps file descriptors to the identifiers passed to GetStdHandle
    # (presumably STD_OUTPUT_HANDLE and STD_ERROR_HANDLE - verify against
    # the Win32 documentation)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        ("GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)(("WriteConsoleW", ctypes.windll.kernel32))
    # Receives the count reported by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(("GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        ("GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True if the handle is invalid, is not of character type (ignoring
        # the remote flag), or GetConsoleMode fails on it
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call, stopping in front of the
        # next non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # A count of 0 means s starts with a non-BMP character, which is
        # written on its own with a length of 2
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after the part that was actually written
            s = s[written.value:]
    return True
822
823
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    Byte oriented streams receive s encoded with encoding, falling back to
    the encoding of the stream or the preferred encoding; characters that
    cannot be encoded are dropped.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        target_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
844
845
def bytes_to_intlist(bs):
    """Return the byte values of the byte string bs as a list of ints."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Indexing yields one-character strings here
        return [ord(c) for c in bs]
    return list(bs)  # Python 3
853
854
def intlist_to_bytes(xs):
    """Pack the list of byte values xs into a byte string."""
    if xs:
        return struct_pack('%dB' % len(xs), *xs)
    return b''
859
860
# Cross-platform file locking
# _lock_file(f, exclusive) and _unlock_file(f) are implemented with
# LockFileEx/UnlockFileEx on Windows and with fcntl.flock everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # ctypes layout of the structure passed as the last argument of
        # LockFileEx and UnlockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high part of the byte count passed to (Un)LockFileEx;
    # presumably chosen to cover the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f with LockFileEx; raises OSError on failure."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Stored on the file object because _unlock_file passes the same
        # pointer to UnlockFileEx
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 is presumably LOCKFILE_EXCLUSIVE_LOCK - verify against the
        # Win32 documentation
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Unlock a file locked with _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Lock the open file f with flock (LOCK_EX if exclusive, else LOCK_SH)."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock lock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
924
925
class locked_file(object):
    """File wrapper that holds a lock on the file while used as a context manager.

    The file is opened on construction. Entering the context locks it
    (exclusively unless the mode is 'r'); leaving the context unlocks and
    closes it.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leave the file open if it could not be locked
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
955
956
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
960
961
def shell_quote(args):
    """Return the arguments args quoted for a shell and joined by spaces."""
    encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return u' '.join(pipes.quote(as_text(arg)) for arg in args)
971
972
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but include the latest evaluated element
    (the first element so that Not pred(e)) """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
980
981
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    The data is JSON encoded and appended to url as a fragment. """

    payload = {u'__youtubedl_smuggle': json.dumps(data)}
    return url + u'#' + compat_urllib_parse.urlencode(payload)
988
989
def unsmuggle_url(smug_url, default=None):
    """Split a URL created by smuggle_url into (url, data).

    URLs without smuggled data are returned as (smug_url, default).
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition(u'#')
    encoded = compat_parse_qs(sdata)[u'__youtubedl_smuggle'][0]
    return url, json.loads(encoded)
997
998
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix, e.g. '1.00KiB'.

    None is rendered as 'N/A'; strings are converted to float first.
    """
    if bytes is None:
        return u'N/A'
    if type(bytes) is str:
        bytes = float(bytes)

    # The power of 1024 selects the unit
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    units = [u'B', u'KiB', u'MiB', u'GiB', u'TiB', u'PiB', u'EiB', u'ZiB', u'YiB']
    suffix = units[exponent]
    return u'%.2f%s' % (float(bytes) / float(1024 ** exponent), suffix)
1011
1012
def get_term_width():
    """Return the width of the terminal in columns, or None if unknown.

    The COLUMNS environment variable is used if set; otherwise the output
    of 'stty size' is consulted.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        return int(out.split()[1])
    except Exception:
        # Best effort: stty may be missing or print something unexpected.
        # KeyboardInterrupt and SystemExit are not swallowed.
        pass
    return None
1027
1028
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name

    Returns None for anything that is not a full English month name. """

    ENGLISH_NAMES = [
        u'January', u'February', u'March', u'April', u'May', u'June',
        u'July', u'August', u'September', u'October', u'November', u'December']
    if name in ENGLISH_NAMES:
        return ENGLISH_NAMES.index(name) + 1
    return None
1039
1040
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start one of the predefined XML entities or a
    numeric character reference are left alone."""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, u'&amp;', xml_str)
1047
1048
def setproctitle(title):
    """Set the name of the current process to title via prctl of libc.so.6.

    Does nothing if libc.so.6 cannot be loaded or offers no prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one item larger than
    # the title, so that prctl receives a NUL terminated string
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        # 15 is PR_SET_NAME in linux/prctl.h
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1062
1063
def remove_start(s, start):
    """ Return s without the leading start, or s unchanged if it does not
    begin with start """
    return s[len(start):] if s.startswith(start) else s
1068
1069
def remove_end(s, end):
    """ Return s without the trailing end, or s unchanged if it does not
    finish with end """
    # An empty suffix must be skipped explicitly: s[:-0] equals s[:0],
    # which would throw the whole string away
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1074
1075
def url_basename(url):
    """ Return the last '/'-separated component of the path of url,
    ignoring leading and trailing slashes """
    parsed = compat_urlparse.urlparse(url)
    segments = parsed.path.strip(u'/').split(u'/')
    return segments[-1]
1079
1080
class HEADRequest(compat_urllib_request.Request):
    """ Request whose HTTP method is reported as HEAD """

    def get_method(self):
        """ Return the HTTP method name used for this request """
        return 'HEAD'
1084
1085
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to an int, returning default when that is not possible.

    v        -- value to convert; when get_attr is given, the attribute of
                that name is read from v first (a missing attribute counts
                as None)
    scale    -- the result is floor-divided by this value
    invscale -- the result is multiplied by this value before scaling
    default  -- returned when v is None, empty or cannot be parsed
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    try:
        number = int(v)
    except (ValueError, TypeError, OverflowError):
        # Garbage input should yield the default instead of an exception
        return default
    return number * invscale // scale
1093
1094
def str_or_none(v, default=None):
    """ Return compat_str(v), or default when v is None """
    if v is None:
        return default
    return compat_str(v)
1097
1098
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    are dropped from the string before it is converted """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', u'', int_str)
    return int(digits)
1105
1106
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to a float, returning default when that is not possible.

    The converted value is multiplied by invscale and divided by scale.
    default is returned when v is None, empty or cannot be parsed, which
    mirrors int_or_none.
    """
    if v is None or v == '':
        return default
    try:
        number = float(v)
    except (ValueError, TypeError):
        # Garbage input should yield the default instead of an exception
        return default
    return number * invscale / scale
1109
1110
def parse_duration(s):
    """ Parse a duration string such as '1:02:03', '2m 30s' or '90' into
    a number of seconds.

    Returns None when s is None or is not recognized.  The result is an
    int unless the seconds carry a fractional part, in which case it is a
    float.
    """
    if s is None:
        return None

    match = re.match(
        r'''(?ix)T?
        (?:
            (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
            (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
        )?
        (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$''', s.strip())
    if not match:
        return None

    total = int(match.group('secs'))
    # Add the optional larger units, each weighted by its length in seconds
    for unit, factor in (('mins', 60), ('hours', 60 * 60)):
        digits = match.group(unit)
        if digits:
            total += int(digits) * factor
    fraction = match.group('ms')
    if fraction:
        total += float(fraction)
    return total
1134
1135
def prepend_extension(filename, ext):
    """ Insert ext in front of the existing extension of filename, e.g.
    ('a.mp4', 'temp') gives 'a.temp.mp4' """
    stem, current_ext = os.path.splitext(filename)
    return u'{0}.{1}{2}'.format(stem, ext, current_ext)
1139
1140
def check_executable(exe, args=None):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns exe when the process could be started and False when starting
    it raised OSError. """
    # None instead of a shared mutable list as the default value
    command = [exe] + list(args or [])
    try:
        subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
    except OSError:
        return False
    return exe
1149
1150
def get_exe_version(exe, args=None,
                    version_re=r'version\s+([-0-9._a-zA-Z]+)',
                    unrecognized=u'present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    exe          -- name or path of the program to run
    args         -- arguments passed to it, ['--version'] by default
    version_re   -- regular expression searched in the first output line;
                    its first group is the version
    unrecognized -- returned when the program ran but no version was found
    """
    # The hyphen comes first in the character class of version_re so that
    # it is a literal; written as "_-a" it would denote a range and
    # versions like "1.2.3-4" would be cut at the hyphen.
    if args is None:
        args = ['--version']
    try:
        out, err = subprocess.Popen(
            [exe] + list(args),
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
    m = re.search(version_re, firstline)
    if m:
        return m.group(1)
    return unrecognized
1168
1169
class PagedList(object):
    """ Base class for paged lists; getslice() is expected to be provided
    by subclasses """

    def __len__(self):
        # This is only useful for tests
        entries = self.getslice()
        return len(entries)
1174
1175
class OnDemandPagedList(PagedList):
    """ Paged list that fetches pages one by one until a page comes back
    short or the requested end is reached """

    def __init__(self, pagefunc, pagesize):
        # pagefunc is called with a 0-based page number and must return an
        # iterable with the entries of that page
        self._pagefunc = pagefunc
        # Number of entries on a full page
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with indices start (inclusive) to end
        (exclusive) as a list; end=None means up to the last entry """
        res = []
        # Begin with the page that contains the entry at index `start`
        for pagenum in itertools.count(start // self._pagesize):
            # Indices of the first entry of this page and of the next page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of `start` inside this page (0 for every later page)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset of `end` inside this page, or None when the slice
            # does not end on this page
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1217
1218
class InAdvancePagedList(PagedList):
    """ Paged list for which the total number of pages is known up front """

    def __init__(self, pagefunc, pagecount, pagesize):
        # pagefunc is called with a 0-based page number and must return an
        # iterable with the entries of that page
        self._pagefunc = pagefunc
        # Total number of pages available
        self._pagecount = pagecount
        # Number of entries on a full page
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with indices start (inclusive) to end
        (exclusive) as a list; end=None means up to the last page """
        res = []
        start_page = start // self._pagesize
        # Never request pages past the known page count, even when `end`
        # points beyond the last entry
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        # Entries to drop from the front of the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Entries still wanted; None means no upper limit
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the slice
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1246
1247
def uppercase_escape(s):
    """ Replace every \\UXXXXXXXX escape sequence (8 hex digits) in s by
    the character it denotes """
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1254
1255
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 a unicode string is turned into UTF-8 bytes before quoting
    is_py2_unicode = sys.version_info < (3, 0) and isinstance(s, unicode)
    if is_py2_unicode:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1261
1262
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # Scheme and netloc are kept as they are; the other parts are escaped
    escaped = dict(
        (field, escape_rfc3986(getattr(parts, field)))
        for field in ('path', 'params', 'query', 'fragment'))
    return parts._replace(**escaped).geturl()
1272
# Probe whether struct accepts a text format string and pick the
# struct_pack/struct_unpack implementations accordingly
try:
    struct.pack(u'!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(fmt, *args)

    def struct_unpack(spec, *args):
        fmt = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(fmt, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1289
1290
def read_batch_urls(batch_fd):
    """ Read URLs from the file-like object batch_fd, one per line.

    Lines may be text or UTF-8 encoded bytes.  A leading byte order mark
    and surrounding whitespace are removed; empty lines and lines starting
    with '#', ';' or ']' are skipped.  batch_fd is closed afterwards.
    Returns the list of remaining URLs.
    """
    # A UTF-8 BOM shows up as U+FEFF in correctly decoded text, or as the
    # three characters below when its bytes were decoded as Latin-1
    BOMS = (u'\ufeff', u'\xef\xbb\xbf')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1305
1306
def urlencode_postdata(*args, **kargs):
    """ URL-encode the arguments with compat_urllib_parse.urlencode and
    return the result as ASCII bytes """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1309
1310
# etree_iter(node) iterates over the elements below node; Element.iter is
# used where it exists
if hasattr(xml.etree.ElementTree.Element, 'iter'):
    etree_iter = xml.etree.ElementTree.Element.iter
else:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1315
1316
def parse_xml(s):
    """ Parse the XML document in the text string s and return its root
    element; doctype declarations are ignored """
    class DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=DoctypeIgnoringBuilder())
    data = s.encode('utf-8')
    if sys.version_info >= (2, 7):
        tree = xml.etree.ElementTree.XML(data, parser=parser)
    else:
        tree = xml.etree.ElementTree.XML(data)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            text = node.text
            if text is not None and not isinstance(text, compat_str):
                node.text = text.decode('utf-8')
    return tree
1332
1333
# Age limits for US rating labels; parse_age_limit() looks its argument up
# here when it is not a plain number
US_RATINGS = {
    'G': 0, 'PG': 10, 'PG-13': 13,
    'R': 16, 'NC': 18,
}
1341
1342
def parse_age_limit(s):
    """ Return the age limit denoted by s as an int.

    s may be a one- or two-digit number with an optional trailing '+', or
    a key of US_RATINGS.  None is returned for None and for anything else.
    """
    if s is None:
        return None
    match = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if match:
        return int(match.group('age'))
    return US_RATINGS.get(s, None)
1348
1349
def strip_jsonp(code):
    """ Remove a JSONP wrapper 'callback(...);' (optionally followed by
    // comments) from code and return what was between the parentheses;
    code is returned unchanged when it has no such wrapper """
    jsonp_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_re, r'\1', code)
1353
1354
def js_to_json(code):
    """ Convert a JavaScript object literal to JSON text.

    Single-quoted strings are re-quoted with double quotes, bare
    identifiers other than true/false/null are quoted, and a comma
    directly before a closing ']' is removed.
    """
    # How escapes inside a single-quoted string look once it is
    # double-quoted
    requote = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        # Literals and double-quoted strings are valid JSON already
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"", lambda esc: requote[esc.group(0)], token[1:-1])
        return '"%s"' % token

    quoted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*\])', lambda m: m.group(1), quoted)
1378
1379
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The returned function maps a quality id to its position in
    quality_ids, or to -1 when the id is not listed. """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1388
1389
# Template with %-style named fields (title, id, ext); presumably the
# default for the output filename template - confirm against its users
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1391
1392
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than length are cut so that, with '...' appended, they
    are length characters long; None is passed through. """
    ELLIPSES = '...'
    if s is None or len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
1401
1402
def version_tuple(v):
    """ Split the dotted version string v into a list of ints """
    return list(map(int, v.split('.')))
1405
1406
def is_outdated_version(version, limit, assume_new=True):
    """ Return whether version is older than limit.

    Both are dotted version strings.  When version is empty or one of the
    two cannot be parsed, the answer is `not assume_new`.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return fallback
    return current < minimum