]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
[arte] Clean up format sorting mess
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import calendar
7 import codecs
8 import contextlib
9 import ctypes
10 import datetime
11 import email.utils
12 import errno
13 import gzip
14 import itertools
15 import io
16 import json
17 import locale
18 import math
19 import os
20 import pipes
21 import platform
22 import re
23 import ssl
24 import socket
25 import struct
26 import subprocess
27 import sys
28 import tempfile
29 import traceback
30 import xml.etree.ElementTree
31 import zlib
32
33 from .compat import (
34 compat_chr,
35 compat_getenv,
36 compat_html_entities,
37 compat_parse_qs,
38 compat_str,
39 compat_urllib_error,
40 compat_urllib_parse,
41 compat_urllib_parse_urlparse,
42 compat_urllib_request,
43 compat_urlparse,
44 )
45
46
# The type of compiled pattern objects has no clearly defined public name,
# so take it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Default HTTP request headers (YoutubeDLHandler.http_request adds the ones
# a request does not already carry).
std_headers = {
    # Browser identification
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0 (Chrome)',
    # Content negotiation
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
57
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: when the locale
    lookup fails, or reports a codec that cannot be used to encode text,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable name raises here
        'TEST'.encode(pref)
    except Exception:
        # Intentionally broad (any locale/codec failure means "fall back"),
        # but not a bare except, so KeyboardInterrupt and SystemExit
        # are no longer swallowed.
        pref = 'UTF-8'

    return pref
71
72
def write_json_file(obj, fn):
    """ Serialize obj as JSON into the file fn, atomically if possible

    The data is first written to a temporary file created next to fn, which
    is then renamed over fn. If anything fails, the temporary file is
    removed and the error is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        fs_encoding = get_filesystem_encoding()

        # os.path.basename/dirname return bytes objects here, but
        # NamedTemporaryFile fails on non-ASCII names unless it is
        # given unicode objects, so decode the results.
        def path_basename(f):
            return os.path.basename(f).decode(fs_encoding)

        def path_dirname(f):
            return os.path.dirname(f).decode(fs_encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    tmp_kwargs = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # json.dump expects a bytestream in Python 2.x,
    # and writes to a character stream in Python 3.x
    if sys.version_info < (3, 0):
        tmp_kwargs['mode'] = 'wb'
    else:
        tmp_kwargs['mode'] = 'w'
        tmp_kwargs['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**tmp_kwargs)

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # An existing target must be removed first on Windows, else
            # os.rename raises WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except:
        # Clean up the temporary file on any failure, then propagate
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
125
126
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # Only plain attribute names/values are accepted, since they are
        # interpolated into the XPath expression below.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first xpath match whose attribute key equals val """
        # Python 2.6 quirk: with a unicode xpath, .//node does not match
        # nodes that are direct children of . -- so use a byte string.
        if isinstance(xpath, unicode):
            xpath = xpath.encode('ascii')

        candidates = (
            el for el in node.findall(xpath) if el.attrib.get(key) == val)
        return next(candidates, None)
145
# The xml.etree.ElementTree.Element methods on python2.6 don't support
# the namespace parameter, so prefixes are expanded by hand here.
def xpath_with_ns(path, ns_map):
    """Expand every 'prefix:tag' step of path into '{namespace}tag'.

    ns_map maps prefixes to namespace URIs. Steps without a colon are kept
    as they are.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([expand(step) for step in path.split('/')])
158
159
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element below node matching xpath.

    If nothing matches, None is returned, unless fatal is set, in which
    case an ExtractorError naming the element (name, or the xpath itself
    when name is None) is raised.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None:
        return found.text
    if not fatal:
        return None
    label = name if name is not None else xpath
    raise ExtractorError('Could not find XML element %s' % label)
172
173
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the given HTML document"""
    # Thin wrapper: an ID lookup is an attribute lookup on "id"
    return get_element_by_attribute(attribute='id', value=id, html=html)
177
178
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag carrying attribute=value in the passed HTML document

    Returns None if no such tag is found; otherwise the tag content with
    HTML entities unescaped.
    """

    # Verbose pattern: tag name, any attributes, the wanted attribute,
    # any more attributes, then the content up to the matching close tag.
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    found = re.search(pattern, html)
    if found is None:
        return None

    content = found.group('content')
    # Content that starts with a quote loses its first and last character
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
200
201
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Line breaks in the markup are flattened to spaces, <br> tags and
    paragraph boundaries become newlines, all remaining tags are removed
    and HTML entities are unescaped. The result is stripped of surrounding
    whitespace.

    Returns None when html is None, so that optional fields can be passed
    straight through without a guard at every call site.
    """
    if html is None:
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
213
214
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly (replacing characters that are forbidden on
    win32 with '#') and opens the modified name instead. If that fails as
    well, or the name cannot be changed, the exception is raised, like the
    standard open() function would.

    The special filename '-' stands for standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                # Avoid newline translation when writing data to stdout
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem cannot be fixed by renaming
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # os.path.join takes the components as separate arguments, so the
        # sanitized parts have to be unpacked.
        alt_filename = os.path.join(*[
            re.sub('[/<>:"\\|\\\\?\\*]', '#', path_part)
            for path_part in os.path.split(filename)
        ])
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller.
            # Open the sanitized name -- retrying the original one would
            # only fail again.
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
248
249
def timeconvert(timestr):
    """Convert an RFC 2822 time string into a system timestamp

    Returns None if the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
257
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.

    restricted -- use a stricter subset of allowed characters
    is_id      -- the string is an ID rather than an arbitrary title; it
                  is kept as intact as possible (no underscore collapsing
                  or stripping)
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            # Shell metacharacters, whitespace and non-ASCII
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    result = ''.join([replace_insane(char) for char in s])
    if is_id:
        return result

    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result or '_'
289
def orderedSet(iterable):
    """ Return a list of the elements of iterable without duplicates, in first-seen order """
    unique = []
    for item in iterable:
        # List membership (==) is used, so unhashable elements work too
        if item in unique:
            continue
        unique.append(item)
    return unique
297
298
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities and numeric
    references (decimal '#123' or hexadecimal '#x7b') are converted; for
    anything else, including numeric references outside the valid code
    point range, the literal '&entity;' is returned.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hexadecimal references need the full 0-9a-f digit set; matching only
    # [0-9] would cut '#x2F' down to '#x2' and yield the wrong character.
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Not a representable code point: fall through to the literal
            pass

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
317
318
def unescapeHTML(s):
    """Replace the HTML entities ('&...;') in s by their characters.

    None is passed through; any other input must be a compat_str.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def replace_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', replace_entity, s)
326
327
def encodeFilename(s, for_subprocess=False):
    """
    Encode a filename the way the underlying platform API expects it.

    @param s The name of the file
    @param for_subprocess Whether the result is passed to a subprocess
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Windows 2000 and up offer Unicode APIs. (Detecting Windows NT 4 is
    # tricky because 'major >= 4' would match the Windows 9x series as
    # well. Besides, NT 4 is obsolete.)
    unicode_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if unicode_windows and not for_subprocess:
        # Pass the string through directly to use the Unicode APIs
        return s

    if unicode_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return s.encode(encoding, 'ignore')
354
355
def encodeArgument(s):
    """Encode a command line argument for a subprocess call (see encodeFilename)."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    #assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
363
364
def decodeOption(optval):
    """Return the option value optval as a compat_str.

    None is passed through; bytes are decoded with the preferred encoding.
    """
    if optval is None:
        return optval

    decoded = optval
    if isinstance(decoded, bytes):
        decoded = decoded.decode(preferredencoding())

    assert isinstance(decoded, compat_str)
    return decoded
373
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The shortest form that fits is used: durations of an hour or more get
    the hour field, durations of a minute or more get the minute field.
    """
    # Inclusive bounds: exactly 3600 is '1:00:00' (not '60:00') and
    # exactly 60 is '1:00' (not '60').
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
381
382
def make_HTTPS_handler(opts_no_check_certificate, **kwargs):
    """Create the HTTPS handler suited to the running Python version.

    opts_no_check_certificate -- if true, server certificates are not
                                 verified
    **kwargs                  -- passed on to the handler constructor

    Returns an HTTPSHandler instance (a custom subclass on Python < 3.2).
    """
    if sys.version_info < (3, 2):
        import httplib

        class HTTPSConnectionV3(httplib.HTTPSConnection):
            def __init__(self, *args, **kwargs):
                httplib.HTTPSConnection.__init__(self, *args, **kwargs)

            def connect(self):
                sock = socket.create_connection((self.host, self.port), self.timeout)
                if getattr(self, '_tunnel_host', False):
                    self.sock = sock
                    self._tunnel()
                # Try TLSv1 first, fall back to SSLv23 if the handshake fails
                try:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_TLSv1)
                except ssl.SSLError:
                    self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23)

        class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler):
            def https_open(self, req):
                return self.do_open(HTTPSConnectionV3, req)
        return HTTPSHandlerV3(**kwargs)
    elif hasattr(ssl, 'create_default_context'):  # Python >= 3.4
        # We are the client authenticating a *server*, which is what
        # Purpose.SERVER_AUTH selects. Purpose.CLIENT_AUTH builds a
        # server-side context that does not verify certificates.
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        # NOTE(review): deliberately re-enables SSLv3, which is insecure;
        # kept for compatibility with old servers.
        context.options &= ~ssl.OP_NO_SSLv3  # Allow older, not-as-secure SSLv3
        if opts_no_check_certificate:
            # check_hostname must be disabled before verify_mode can be
            # set to CERT_NONE
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        try:
            context.load_default_certs()
        except AttributeError:
            pass  # Python < 3.4
        return compat_urllib_request.HTTPSHandler(context=context, **kwargs)
422
class ExtractorError(Exception):
    """Error during info extraction."""
    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is prepended to it.
        """

        # Errors raised while handling a network or availability problem
        # are always considered expected.
        active_exc_type = sys.exc_info()[0]
        expected_exc_types = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if active_exc_type in expected_exc_types:
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg = msg + '; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none."""
        tb = self.traceback
        return None if tb is None else ''.join(traceback.format_tb(tb))
449
450
class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression did not match"""
454
455
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
467
468
class SameFileError(Exception):
    """Same File exception.

    Thrown by FileDownloader objects when they detect that multiple files
    would have to be downloaded to the same file on disk.
    """
476
477
class PostProcessingError(Exception):
    """Post Processing exception.

    A PostProcessor's .run() method may raise this exception to signal an
    error in the postprocessing task.
    """
    def __init__(self, msg):
        # The message is kept on the instance for callers reading .msg
        self.msg = msg
486
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
490
491
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
499
500
class ContentTooShortError(Exception):
    """Content Too Short exception.

    FileDownloader objects may raise this exception when a downloaded file
    is smaller than what the server announced first, which indicates that
    the connection was probably interrupted.
    """
    # Sizes, both in bytes (class-level defaults)
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
515
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request. Likewise, the header
    "Youtubedl-User-Agent" replaces the User-Agent of that request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first, then a zlib-wrapped one."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response, also where the constructor takes no code argument."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the standard headers to req and process the Youtubedl-* control headers."""
        for h, v in std_headers.items():
            # urllib stores header names capitalized ('User-agent'), see
            # http://bugs.python.org/issue2275 -- so the lookup has to use
            # the capitalized form too. Otherwise headers set explicitly
            # on the request would always be overwritten by the defaults.
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            if 'User-agent' in req.headers:
                del req.headers['User-agent']
            req.headers['User-agent'] = req.headers['Youtubedl-user-agent']
            del req.headers['Youtubedl-user-agent']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate Content-encoding decompressed."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic is processed the same way
    https_request = http_request
    https_response = http_response
601
602
def parse_iso8601(date_str, delimiter='T'):
    """ Return a UNIX timestamp from the given date

    delimiter is the separator between the date and the time part.
    Returns None if date_str is None.
    """

    if date_str is None:
        return None

    # Trailing fractional seconds and timezone designator ('Z' or an offset)
    tz_match = re.search(
        r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)

    offset = datetime.timedelta()
    if tz_match:
        # Cut the matched suffix off, so that strptime can parse the rest
        date_str = date_str[:-len(tz_match.group(0))]
        sign_str = tz_match.group('sign')
        if sign_str:
            sign = 1 if sign_str == '+' else -1
            offset = datetime.timedelta(
                hours=sign * int(tz_match.group('hours')),
                minutes=sign * int(tz_match.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    utc_dt = datetime.datetime.strptime(date_str, date_format) - offset
    return calendar.timegm(utc_dt.timetuple())
626
627
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    Returns None if date_str is None or matches none of the known formats.
    """

    if date_str is None:
        return None

    # Replace commas
    cleaned = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    cleaned = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', cleaned)

    known_formats = (
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%Y/%m/%d %H:%M:%S',
        '%d/%m/%Y %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    )

    upload_date = None
    # Every format is tried; the last one that parses wins
    for fmt in known_formats:
        try:
            upload_date = datetime.datetime.strptime(cleaned, fmt).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(cleaned)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
675
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url; return default_ext if there is no plausible one."""
    if url is None:
        return default_ext
    # Text after the last dot, ignoring the query string
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if not re.match(r'^[A-Za-z0-9]+$', candidate):
        return default_ext
    return candidate
684
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join([stem, sub_lang, sub_format])
687
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError if date_str has neither format."""
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' in a plain string literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # timedelta has no months or years, approximate them with days
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta keyword arguments are plural ('days', 'weeks')
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
713
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format.
    Strings in any other format are returned unchanged."""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
722
class DateRange(object):
    """Represents a time interval between two dates"""
    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str

        A missing bound defaults to the earliest / latest representable date.
        Raises ValueError if start lies after end.
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date (a datetime.date, or a string accepted by date_from_str) is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
748
749
def platform_name():
    """ Returns the platform name (as reported by platform.platform()) as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Decode byte strings with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
758
759
def _windows_write_string(s, out):
    """ Write s to the Windows console behind out, if there is one.

    Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument passed to GetStdHandle
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    wintypes = ctypes.wintypes
    win_kernel32 = ctypes.windll.kernel32

    GetStdHandle = ctypes.WINFUNCTYPE(
        wintypes.HANDLE, wintypes.DWORD)(
        ("GetStdHandle", win_kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        wintypes.BOOL, wintypes.HANDLE, wintypes.LPWSTR,
        wintypes.DWORD, ctypes.POINTER(wintypes.DWORD),
        wintypes.LPVOID)(("WriteConsoleW", win_kernel32))
    written = wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(wintypes.DWORD, wintypes.DWORD)(("GetFileType", win_kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        wintypes.BOOL, wintypes.HANDLE,
        ctypes.POINTER(wintypes.DWORD))(
        ("GetConsoleMode", win_kernel32))
    INVALID_HANDLE_VALUE = wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        # Not a character device, or GetConsoleMode fails on it
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR
                or GetConsoleMode(handle, ctypes.byref(wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(text):
        # Index of the first character outside the BMP, or len(text)
        for pos, char in enumerate(text):
            if ord(char) > 0xffff:
                return pos
        return len(text)

    # Write in chunks of at most 1024 characters, none of which contains a
    # non-BMP character; each non-BMP character is written on its own.
    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
830
831
def write_string(s, out=None, encoding=None):
    """Write the compat_str s to the stream out (sys.stderr by default) and flush it.

    encoding, if given, is used for byte-oriented streams.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # On Windows, try the console API first (unless an encoding is forced)
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    # Python 2 lies about mode of sys.stderr, so always write bytes there
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream with a binary layer: encode ourselves
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
852
853
def bytes_to_intlist(bs):
    """Convert a byte string (or a sequence of ints) to a list of ints."""
    if not bs:
        return []
    # Indexing bytes yields ints on Python 3, one-character strings on Python 2
    return list(bs) if isinstance(bs[0], int) else [ord(c) for c in bs]
861
862
def intlist_to_bytes(xs):
    """Convert a list of ints to a byte string, one unsigned byte per element."""
    if xs:
        byte_format = '%dB' % len(xs)
        return struct_pack(byte_format, *xs)
    return b''
867
868
# Cross-platform file locking: _lock_file(f, exclusive) and _unlock_file(f)
# are implemented with LockFileEx/UnlockFileEx on Windows, and with
# fcntl.flock everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32

    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL

    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL

    # Low and high part of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the file object f; raises OSError on failure."""
        ov = OVERLAPPED()
        ov.Offset = 0
        ov.OffsetHigh = 0
        ov.hEvent = 0
        # The pointer is stored on the file object for _unlock_file
        f._lock_file_overlapped_p = ctypes.pointer(ov)
        os_handle = msvcrt.get_osfhandle(f.fileno())
        lock_flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(os_handle, lock_flags, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Unlock a file object locked by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        os_handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(os_handle, 0,
                                whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared flock on the file object f."""
        operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.flock(f, operation)

    def _unlock_file(f):
        """Release the flock held on the file object f."""
        fcntl.flock(f, fcntl.LOCK_UN)
932
933
class locked_file(object):
    """File opened with io.open that is locked while used as a context manager.

    Entering the context locks the file (shared for mode 'r', exclusive
    for 'a' and 'w'); leaving it unlocks and closes the file.
    """
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Readers share the lock, writers need it exclusively
        want_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, want_exclusive)
        except IOError:
            # Do not leak the open file if locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        """Delegate to the write method of the underlying file."""
        return self.f.write(*args)

    def read(self, *args):
        """Delegate to the read method of the underlying file."""
        return self.f.read(*args)
963
964
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
968
969
def shell_quote(args):
    """Return the arguments in args shell-quoted and joined by spaces."""
    encoding = get_filesystem_encoding()

    def as_text(arg):
        # We may get a filename encoded with 'encodeFilename'
        return arg.decode(encoding) if isinstance(arg, bytes) else arg

    return ' '.join([pipes.quote(as_text(arg)) for arg in args])
979
980
def takewhile_inclusive(pred, seq):
    """ Like itertools.takewhile, but the element that ends the run
    (the first element e so that not pred(e)) is yielded as well """
    for item in seq:
        yield item
        if pred(item):
            continue
        break
988
989
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use.

    data is JSON-encoded and appended to url as a fragment, from which
    unsmuggle_url can extract it again.
    """

    payload = {'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return url + '#' + sdata
996
997
def unsmuggle_url(smug_url, default=None):
    """Split a URL built by smuggle_url into (url, data).

    For URLs that carry no smuggled data, (smug_url, default) is returned.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    # The smuggled data is everything after the last '#'
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
1005
1006
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix, e.g. '1.50KiB'.

    Returns 'N/A' for None. Strings are converted with float() first.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # Power of 1024 to scale by (the logarithm is undefined for 0)
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    units = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    suffix = units[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1019
1020
def get_term_width():
    """Return the width of the terminal in columns, or None if unknown.

    The COLUMNS environment variable takes precedence; otherwise the
    output of 'stty size' is consulted.
    """
    columns = compat_getenv('COLUMNS', None)
    if columns:
        return int(columns)

    try:
        sp = subprocess.Popen(
            ['stty', 'size'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = sp.communicate()
        # 'stty size' prints "<rows> <columns>"
        return int(out.split()[1])
    except Exception:
        # Best effort: a missing stty binary or unparseable output simply
        # means the width is unknown. Not a bare except, so that
        # KeyboardInterrupt and SystemExit still propagate.
        pass
    return None
1035
1036
def month_by_name(name):
    """ Return the number (1-12) of a month by its (locale-independent) English name, None for unknown names """

    ENGLISH_NAMES = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December']
    if name in ENGLISH_NAMES:
        return ENGLISH_NAMES.index(name) + 1
    return None
1047
1048
def fix_xml_ampersands(xml_str):
    """Replace every '&' in XML that does not start an entity by '&amp;'"""
    # Ampersands followed by a predefined entity name or by a numeric
    # reference of up to four digits are left alone
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_ampersand, '&amp;', xml_str)
1055
1056
def setproctitle(title):
    """ Set the name of the current process to title (best effort).

    title must be a text string. The name is set through prctl() of
    libc.so.6; if that library cannot be loaded or does not expose prctl,
    the function silently does nothing.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one item larger than
    # the data, so the string handed to prctl is always NUL-terminated.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
1070
1071
def remove_start(s, start):
    """ Return s without the prefix start; s is returned unchanged when it
    does not begin with start """
    return s[len(start):] if s.startswith(start) else s
1076
1077
def remove_end(s, end):
    """ Return s without the suffix end; s is returned unchanged when it
    does not finish with end (or when end is empty) """
    # An empty suffix must be special-cased: s[:-0] is s[:0], i.e. ''
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1082
1083
def url_basename(url):
    """ Return the last segment of the path component of url, ignoring
    leading and trailing slashes """
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
1087
1088
class HEADRequest(compat_urllib_request.Request):
    """ Request whose get_method() always reports HEAD """

    def get_method(self):
        return 'HEAD'
1092
1093
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """ Convert v to an int, or return default if there is nothing to convert.

    If get_attr is given, the attribute of that name is read from v first
    (a missing attribute counts as None). None and the empty string yield
    default; any other value is converted with int(), multiplied by invscale
    and floor-divided by scale. """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1101
1102
def str_or_none(v, default=None):
    """ Convert v to a text string, or return default when v is None """
    if v is None:
        return default
    return compat_str(v)
1105
1106
def str_to_int(int_str):
    """ A more relaxed version of int_or_none: commas, dots and plus signs
    are removed from int_str before the conversion; None yields None """
    if int_str is None:
        return None
    digits_only = re.sub(r'[,\.\+]', '', int_str)
    return int(digits_only)
1113
1114
def float_or_none(v, scale=1, invscale=1, default=None):
    """ Convert v to a float, multiplied by invscale and divided by scale;
    return default when v is None """
    if v is None:
        return default
    return float(v) * invscale / scale
1117
1118
def parse_duration(s):
    """ Parse a duration string and return its length in seconds.

    Accepted are colon separated values ('1:02:03') as well as values with
    unit names ('3 min 9 sec', 'T30M38S'), optionally with a fractional
    seconds part. The result is an int, or a float if a fraction was given.
    None is returned for None and for strings that cannot be parsed. """
    if s is None:
        return None

    m = re.match(
        r'''(?ix)T?
        (?:
            (?:(?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*)?
            (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
        )?
        (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?$''',
        s.strip())
    if not m:
        return None

    parts = m.groupdict()
    res = int(parts['secs'])
    for unit, seconds_per_unit in (('mins', 60), ('hours', 60 * 60)):
        if parts[unit]:
            res += int(parts[unit]) * seconds_per_unit
    if parts['ms']:
        # The group includes the leading dot, so this adds the fraction
        res += float(parts['ms'])
    return res
1142
1143
def prepend_extension(filename, ext):
    """ Insert ext in front of the real extension of filename,
    e.g. ('video.mp4', 'temp') -> 'video.temp.mp4' """
    split_name = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(split_name[0], ext, split_name[1])
1147
1148
def check_executable(exe, args=[]):
    """ Try to run the binary exe (looked up in PATH) and return its name,
    or False if it could not be started.
    args is a list of extra arguments that keep the output short (like
    -version); it is only read, never modified """
    command = [exe] + args
    try:
        proc = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    else:
        return exe
1157
1158
def get_exe_version(exe, args=['--version'],
                    version_re=r'version\s+([-0-9._a-zA-Z]+)',
                    unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present.

    exe is run with args (a list that is only read, never modified) and
    version_re is searched in the first line of its combined stdout/stderr
    output. The first group of the match is returned; if nothing matches,
    unrecognized is returned instead.

    Note that the hyphen comes first in the character class of the default
    version_re: placed between '_' and 'a' it would form a range that does
    not contain '-', cutting versions such as '1.2-beta3' down to '1.2'.
    """
    try:
        out, err = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    firstline = out.partition(b'\n')[0].decode('ascii', 'ignore')
    m = re.search(version_re, firstline)
    if m:
        return m.group(1)
    else:
        return unrecognized
1176
1177
class PagedList(object):
    """ Common base of the paged lists; getslice() is expected to be
    provided by the subclass """

    def __len__(self):
        # This is only useful for tests: the complete slice is fetched
        # just to count its entries
        entries = self.getslice()
        return len(entries)
1182
1183
class OnDemandPagedList(PagedList):
    """ Paged list whose number of pages is not known beforehand.

    Pages are fetched one after another by calling pagefunc(pagenum) until
    the requested slice is complete or a page with fewer than pagesize
    entries is encountered. """

    def __init__(self, pagefunc, pagesize):
        # pagefunc: callable taking a 0-based page number and returning an
        # iterable with the entries of that page
        self._pagefunc = pagefunc
        # pagesize: number of entries a full page holds
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with indices start (inclusive) to end
        (exclusive) as a list; end=None reads until the last page """
        res = []
        # Begin with the page that contains the entry number start
        for pagenum in itertools.count(start // self._pagesize):
            # Indices of the first entry of this page and of the next page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of start within this page (0 unless start lies in it)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted entry if end lies within this
            # page, None if the page is wanted up to its last entry
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1225
1226
class InAdvancePagedList(PagedList):
    """ Paged list whose total number of pages is known beforehand.

    pagefunc(pagenum) is called with a 0-based page number and returns an
    iterable with the entries of that page; pagecount is the number of
    pages and pagesize the number of entries per page. """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """ Return the entries with indices start (inclusive) to end
        (exclusive) as a list; end=None reads up to the last page """
        res = []
        start_page = start // self._pagesize
        # Never ask for pages past the known page count, even if end lies
        # beyond the last entry
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        # Entries to drop at the beginning of the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Entries still wanted (None: no upper limit)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the slice
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1254
1255
def uppercase_escape(s):
    """ Replace each backslash-U escape (followed by eight hex digits) in s
    by the character it denotes """
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns a (text, consumed length) pair
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
1262
1263
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # Only Python 2 text strings are encoded up front; the version check
    # comes first so that `unicode` is never looked up on Python 3
    is_py2_text = sys.version_info < (3, 0) and isinstance(s, unicode)
    if is_py2_text:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1269
1270
def escape_url(url):
    """Escape URL as suggested by RFC 3986.

    Path, params, query and fragment are escaped with escape_rfc3986; the
    remaining components are kept as parsed."""
    parts = compat_urllib_parse_urlparse(url)
    escaped_path = escape_rfc3986(parts.path)
    escaped_params = escape_rfc3986(parts.params)
    escaped_query = escape_rfc3986(parts.query)
    escaped_fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=escaped_path,
        params=escaped_params,
        query=escaped_query,
        fragment=escaped_fragment)
    return escaped.geturl()
1280
# struct_pack / struct_unpack: struct.pack / struct.unpack that also accept
# a text format specification on every supported Python version
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def _bytes_spec(spec):
        # Text specs are ASCII-encoded, anything else is passed through
        return spec.encode('ascii') if isinstance(spec, compat_str) else spec

    def struct_pack(spec, *args):
        return struct.pack(_bytes_spec(spec), *args)

    def struct_unpack(spec, *args):
        return struct.unpack(_bytes_spec(spec), *args)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
1297
1298
def read_batch_urls(batch_fd):
    """ Read the URLs of a batch file, one per line, and return them as a list.

    batch_fd is an iterable of lines (bytes or text) with a close() method;
    it is closed before returning. Byte lines are decoded as UTF-8. A
    leading byte order mark and surrounding whitespace are removed; empty
    lines and lines starting with '#', ';' or ']' are skipped.
    """
    # A UTF-8 BOM shows up as U+FEFF once the data has been decoded as
    # UTF-8, or as three separate characters if the bytes were decoded with
    # a single-byte codec. Handle both.
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1313
1314
def urlencode_postdata(*args, **kargs):
    """ urlencode the given arguments and return the result as ASCII bytes """
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1317
1318
# etree_iter(n): iterate over the elements of the tree below n
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1323
1324
def parse_xml(s):
    """ Parse the XML document in the text string s and return its root
    element; doctype declarations are ignored """
    class _DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=_DoctypeIgnoringBuilder())
    # The custom parser is only handed over on Python 2.7 and newer
    if sys.version_info >= (2, 7):
        kwargs = {'parser': parser}
    else:
        kwargs = {}
    root = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(root):
            if node.text is None:
                continue
            if not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return root
1340
1341
# Age limits for US rating labels; parse_age_limit falls back to this table
# for strings that are not plain numbers
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1349
1350
def parse_age_limit(s):
    """ Turn an age limit string into an int.

    Accepted are one or two digits with an optional trailing '+' ('18+')
    and the labels listed in US_RATINGS; anything else, like None, yields
    None. """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s, None)
1356
1357
def strip_jsonp(code):
    """ Strip a JSONP wrapper `callback(...);` (optionally followed by //
    comments) from code and return the wrapped payload; code that does not
    look like JSONP is returned unchanged """
    wrapper = re.match(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', code)
    if wrapper is None:
        return code
    # `$` may match just before a final newline; keep whatever follows the
    # match, as a substitution would
    return wrapper.group(1) + code[wrapper.end():]
1361
1362
def js_to_json(code):
    """ Convert a JavaScript object or array literal to JSON text.

    Single-quoted strings are turned into double-quoted ones, bare
    identifiers other than true, false and null are quoted, and a trailing
    comma directly before a closing ']' or '}' is removed. Everything else
    is passed through unchanged. """
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            return v
        if v.startswith("'"):
            # Drop the single quotes and redo the escaping for a
            # double-quoted string
            v = v[1:-1]
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        return '"%s"' % v

    # Matches double-quoted strings, single-quoted strings and identifiers
    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\")?)*"|
        '(?:[^'\\]*(?:\\\\|\\')?)*'|
        [a-zA-Z_][a-zA-Z_0-9]*
        ''', fix_kv, code)
    # JSON allows no trailing commas, neither in arrays nor in objects
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
    return res
1386
1387
def qualities(quality_ids):
    """ Build a function mapping a quality id to its position in
    quality_ids; ids that are not listed map to -1 """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1396
1397
# Default output template with the %-style named fields title, id and ext
# (presumably used for output file names - confirm against the callers)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1399
1400
def limit_length(s, length):
    """ Shorten s to at most length characters, marking the cut with
    ellipses; shorter strings and None are returned as they are """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    return s[:length - len(ELLIPSES)] + ELLIPSES
1409
1410
def version_tuple(v):
    """ Split the dotted version string v into a list of ints;
    non-numeric components raise ValueError """
    return list(map(int, v.split('.')))
1413
1414
def is_outdated_version(version, limit, assume_new=True):
    """ Tell whether version is older than limit.

    If version is empty or one of the two cannot be parsed by
    version_tuple, the answer is `not assume_new`. """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        current, minimum = version_tuple(version), version_tuple(limit)
    except ValueError:
        return fallback
    return current < minimum