]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
[vimeo] Fix password protected videos again (#5082)
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import calendar
7 import codecs
8 import contextlib
9 import ctypes
10 import datetime
11 import email.utils
12 import errno
13 import functools
14 import gzip
15 import itertools
16 import io
17 import json
18 import locale
19 import math
20 import operator
21 import os
22 import pipes
23 import platform
24 import re
25 import ssl
26 import socket
27 import struct
28 import subprocess
29 import sys
30 import tempfile
31 import traceback
32 import xml.etree.ElementTree
33 import zlib
34
35 from .compat import (
36 compat_basestring,
37 compat_chr,
38 compat_html_entities,
39 compat_http_client,
40 compat_kwargs,
41 compat_parse_qs,
42 compat_socket_create_connection,
43 compat_str,
44 compat_urllib_error,
45 compat_urllib_parse,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
48 compat_urlparse,
49 shlex_quote,
50 )
51
52
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default HTTP headers; YoutubeDLHandler.http_request adds any of these that
# the caller did not already set on the request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}

# English month names, indexed by (month number - 1)
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
68
69
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        enc = locale.getpreferredencoding()
        # Make sure the reported encoding actually works before trusting it
        'TEST'.encode(enc)
    except Exception:
        enc = 'UTF-8'
    return enc
83
84
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible.

    The JSON is first written to a temporary file in the same directory
    and then renamed over fn, so readers never observe a half-written
    file.  On failure the temporary file is removed and the exception
    re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        # (fix: operate on the lambda argument f instead of silently
        # closing over fn, which made the parameter dead code)
        path_basename = lambda f: os.path.basename(f).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temporary file, then propagate
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
137
138
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the XPath expression, so they
        # are restricted to safe character sets first
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + "[@%s='%s']" % (key, val)
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] (manual scan for Python 2.6) """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        # 2.6 ElementTree lacks attribute predicates; scan matches by hand
        for f in node.findall(xpath):
            if f.attrib.get(key) == val:
                return f
        return None
157
158 # On python2.6 the xml.etree.ElementTree.Element methods don't support
159 # the namespace parameter
160
161
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath using the prefix->URI ns_map.

    E.g. xpath_with_ns('ns:a/b', {'ns': 'http://x'}) -> '{http://x}a/b'.
    """
    expanded = []
    for step in path.split('/'):
        pieces = step.split(':')
        if len(pieces) == 1:
            # No namespace prefix on this step
            expanded.append(pieces[0])
        else:
            prefix, tag = pieces
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
172
173
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath, or None.

    With fatal=True, a missing element (or one without text) raises
    ExtractorError, labelled with name (falling back to the xpath itself).
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None and found.text is not None:
        return found.text
    if not fatal:
        return None
    raise ExtractorError(
        'Could not find XML element %s' % (xpath if name is None else name))
186
187
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin convenience wrapper over the generic attribute lookup
    return get_element_by_attribute('id', id, html)
191
192
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # Match an opening tag that carries attribute=value (any quoting style),
    # allowing other attributes before/after, and capture everything up to
    # the matching closing tag (\1 backreference on the tag name).
    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s+%s=['"]?%s['"]?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if not m:
        return None
    res = m.group('content')

    # Strip one level of surrounding quotes, if present
    if res.startswith('"') or res.startswith("'"):
        res = res[1:-1]

    # Resolve HTML entities before returning
    return unescapeHTML(res)
214
215
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />: literal newlines are layout noise, <br> and
    # paragraph breaks are the real line breaks
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
231
232
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            # '-' means standard output
            if sys.platform == 'win32':
                import msvcrt
                # Switch stdout to binary mode so byte output is not mangled
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission error will not be fixed by renaming; re-raise as-is
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
263
264
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Not a parseable RFC 2822 date
        return None
    return email.utils.mktime_tz(parsed)
272
273
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Control characters and '?' are always dropped
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return char

    # Handle timestamps: turn 12:34:56 into 12_34_56 before the general pass
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(c) for c in s)
    if not is_id:
        # Collapse runs of underscores and trim them from the ends
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[1:]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
310
311
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # No-op on non-Windows platforms
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        # Old Pythons: splitdrive does not handle UNC paths
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace characters forbidden in Windows path components (and a
    # trailing dot) with '#', leaving '.'/'..' components intact
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
328
329
def sanitize_url_path_consecutive_slashes(url):
    """Collapses consecutive slashes in URLs' path"""
    parts = list(compat_urlparse.urlparse(url))
    # Index 2 is the path component; scheme/netloc/query are untouched
    parts[2] = re.sub(r'/{2,}', '/', parts[2])
    return compat_urlparse.urlunparse(parts)
335
336
def orderedSet(iterable):
    """Return the iterable's elements as a list with duplicates removed,
    keeping the first occurrence of each.

    Uses a linear membership scan on purpose so unhashable elements work.
    """
    unique = []
    for item in iterable:
        if item not in unique:
            unique.append(item)
    return unique
344
345
346 def _htmlentity_transform(entity):
347 """Transforms an HTML entity to a character."""
348 # Known non-numeric HTML entity
349 if entity in compat_html_entities.name2codepoint:
350 return compat_chr(compat_html_entities.name2codepoint[entity])
351
352 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
353 if mobj is not None:
354 numstr = mobj.group(1)
355 if numstr.startswith('x'):
356 base = 16
357 numstr = '0%s' % numstr
358 else:
359 base = 10
360 return compat_chr(int(numstr, base))
361
362 # Unknown entity in name, return its literal representation
363 return ('&%s;' % entity)
364
365
def unescapeHTML(s):
    """Resolve all HTML entities in s; None passes through unchanged."""
    if s is None:
        return None
    assert type(s) == compat_str

    def _replace(m):
        return _htmlentity_transform(m.group(1))

    return re.sub(r'&([^;]+);', _replace, s)
373
374
def get_subprocess_encoding():
    """Return the encoding used to talk to subprocesses on this platform."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    encoding = sys.getfilesystemencoding()
    # Fall back to UTF-8 when the platform reports no encoding
    return encoding if encoding is not None else 'utf-8'
385
386
def encodeFilename(s, for_subprocess=False):
    """Encode a filename for the OS (or a subprocess command line).

    @param s The name of the file
    """
    assert type(s) == compat_str

    if sys.version_info >= (3, 0):
        # Python 3 has a Unicode API
        return s

    # Windows NT 5+ (2000 and up) takes unicode filenames directly, except
    # when the value is destined for a subprocess command line.
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5 and not for_subprocess:
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
405
406
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename: decode byte filenames on Python 2;
    everything passes through unchanged on Python 3."""
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
416
417
def encodeArgument(s):
    """Encode a command-line argument for subprocess use."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
425
426
def decodeArgument(b):
    """Decode a command-line argument (inverse of encodeArgument)."""
    return decodeFilename(b, True)
429
430
def decodeOption(optval):
    """Return an option value as unicode text; None passes through."""
    if optval is None:
        return None
    if isinstance(optval, bytes):
        # Byte-string options come from the locale-encoded command line
        optval = optval.decode(preferredencoding())
    assert isinstance(optval, compat_str)
    return optval
439
440
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    Boundary fix: exactly 3600 seconds now renders as '1:00:00'
    (previously '60:00') and exactly 60 seconds as '1:00' (previously '60'),
    because the original comparisons were strict (>).
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
448
449
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler whose TLS context matches the running
    Python version and the 'nocheckcertificate' option in params."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # Disable both hostname and certificate verification
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # No usable SSLContext support in HTTPSHandler on these versions
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        # Build a TLS context by hand with system default CA paths
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
473
474
def bug_reports_message():
    """Build the standard bug-report footer appended to unexpected errors."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    parts = [
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ]
    return ''.join(parts)
484
485
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network-level failures are always "expected" (not youtube-dl bugs);
        # sys.exc_info() must be inspected here, at construction time
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors carry the bug-report footer
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback rendered as a string, or None."""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
513
514
class UnsupportedError(ExtractorError):
    """Raised when no extractor recognizes the given URL."""

    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url
520
521
class RegexNotFoundError(ExtractorError):
    """Raised when an expected regular expression did not match."""
    pass
525
526
class DownloadError(Exception):
    """Download Error exception.

    Raised by FileDownloader objects when they are not configured to
    continue on errors; carries the appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
539
540
class SameFileError(Exception):
    """Same File exception.

    Raised by FileDownloader objects when multiple files would have to be
    downloaded to the same file on disk.
    """
    pass
548
549
class PostProcessingError(Exception):
    """Post Processing exception.

    Raised by a PostProcessor's .run() method to signal an error in the
    postprocessing task; the message is stored on .msg.
    """

    def __init__(self, msg):
        self.msg = msg
559
560
class MaxDownloadsReached(Exception):
    """Raised once the --max-downloads limit has been reached."""
    pass
564
565
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Raised when a video is requested in a format that is not available
    for that video.
    """
    pass
573
574
class ContentTooShortError(Exception):
    """Content Too Short exception.

    Raised by FileDownloader objects when a downloaded file is smaller
    than the size the server announced, which usually means the
    connection was interrupted.
    """
    # Both counts are in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded = downloaded
        self.expected = expected
589
590
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate http_class, honouring the 'source_address' option.

    On Python 2.7+ the connection object supports source_address natively;
    on 2.6 the connect() method is monkey-patched to bind the socket itself.
    """
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)  # port 0: let the OS choose
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    # Wrap the bound socket in TLS by hand
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
611
612
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        # Route connection creation through _create_http_connection so the
        # 'source_address' option is honoured
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress a 'deflate' body (raw stream first, zlib-wrapped fallback)."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl carrying the HTTP status code on both old
        and new urllib versions."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # Fill in any std_headers the caller did not set explicitly
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            # Marker header: strip Accept-encoding and the marker itself
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry decompression, truncating up to 1023 trailing bytes
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS requests/responses go through the same hooks
    https_request = http_request
    https_response = http_response
704
705
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS counterpart of YoutubeDLHandler: forwards whatever SSL
    context / hostname-check settings the running Python supports."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        # _context/_check_hostname are only set by the base-class
        # constructor on Pythons that support them
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
            req, **kwargs)
721
722
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    if timezone is None:
        timezone = datetime.timedelta()
        # A trailing 'Z' or +HH:MM / -HHMM offset, optionally preceded by
        # fractional seconds; strip it and convert to a timedelta
        mobj = re.search(
            r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if mobj:
            date_str = date_str[:-len(mobj.group(0))]
            if mobj.group('sign'):
                sign = 1 if mobj.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(mobj.group('hours')),
                    minutes=sign * int(mobj.group('minutes')))
    fmt = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    dt = datetime.datetime.strptime(date_str, fmt) - timezone
    return calendar.timegm(dt.timetuple())
747
748
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Candidate formats; every one is tried and a later successful parse
    # overrides an earlier one, so the list order is significant
    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    # Ambiguous all-numeric dates are interpreted according to day_first
    if day_first:
        format_expressions.extend([
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822 parsing
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
812
813
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL, falling back to default_ext."""
    if url is None:
        return default_ext
    # Whatever follows the last '.' before any query string
    candidate = url.partition('?')[0].rpartition('.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
822
823
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle file name: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '%s.%s.%s' % (base, sub_lang, sub_format)
826
827
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        # Plain absolute date
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # Rough approximations: a month is 30 days, a year 365
    if unit == 'month':
        unit, amount = 'day', amount * 30
    elif unit == 'year':
        unit, amount = 'day', amount * 365
    # timedelta wants plural keyword arguments
    return today + datetime.timedelta(**{unit + 's': amount})
855
856
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format;
    anything else passes through unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    return '-'.join(match.groups()) if match else date_str
865
866
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Unbounded ends default to the min/max representable dates
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
896
897
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Python 2 may return a byte string; decode with the locale encoding
        name = name.decode(preferredencoding())
    assert isinstance(name, compat_str)
    return name
906
907
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Map C file descriptors to the matching GetStdHandle IDs
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # WriteConsoleW only works on a real console handle; reject
        # invalid, redirected or remote handles
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 chars at a time, stopping before a non-BMP char
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
981
982
def write_string(s, out=None, encoding=None):
    """Write the unicode string s to out (default sys.stderr), choosing a
    method that survives the platform's console-encoding quirks."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        # Prefer the native wide-char console API on Windows
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        # Binary stream: encode explicitly
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: bypass the
        # stream's own (possibly limited) encoding
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
1003
1004
def bytes_to_intlist(bs):
    """Convert a byte string to a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3: indexing bytes yields ints
        return list(bs)
    # Python 2: indexing yields 1-char strings
    return [ord(c) for c in bs]
1012
1013
def intlist_to_bytes(xs):
    """Pack a list of integer byte values back into a byte string
    (inverse of bytes_to_intlist)."""
    if not xs:
        return b''
    return struct_pack('%dB' % len(xs), *xs)
1018
1019
# Cross-platform file locking: _lock_file/_unlock_file via Win32
# (Un)LockFileEx on Windows, fcntl.flock elsewhere
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Mirrors the Win32 OVERLAPPED structure required by (Un)LockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: byte count split into low/high DWORD halves
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)
1083
1084
class locked_file(object):
    """File wrapper holding an OS-level lock while used as a context
    manager (shared lock for reads, exclusive for writes/appends)."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Readers may share the lock; writers and appenders need exclusivity.
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1114
1115
def get_filesystem_encoding():
    """Return the filesystem encoding, falling back to UTF-8 when unknown."""
    enc = sys.getfilesystemencoding()
    if enc is None:
        return 'utf-8'
    return enc
1119
1120
def shell_quote(args):
    """Join command-line arguments into one display string, quoting each."""
    fs_encoding = get_filesystem_encoding()
    quoted = []
    for arg in args:
        # We may get a filename encoded with 'encodeFilename'
        if isinstance(arg, bytes):
            arg = arg.decode(fs_encoding)
        quoted.append(pipes.quote(arg))
    return ' '.join(quoted)
1130
1131
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return url + '#' + sdata
1138
1139
def unsmuggle_url(smug_url, default=None):
    """Extract data embedded by smuggle_url(); returns (url, data)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    smuggled = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(smuggled)
1147
1148
def format_bytes(bytes):
    """Format a byte count as a human-readable string (e.g. '1.00MiB').

    Accepts None (rendered as 'N/A'), numeric strings, ints and floats.
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    SUFFIXES = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        # Clamp the exponent: fractional byte counts would otherwise give a
        # negative index, and values beyond YiB an IndexError.
        exponent = min(max(int(math.log(bytes, 1024.0)), 0), len(SUFFIXES) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, SUFFIXES[exponent])
1161
1162
def parse_filesize(s):
    """Parse a human-readable file size ('5.5MiB', '10 MB') into bytes."""
    if s is None:
        return None

    # Decimal (KB/MB/...) and binary (KiB/MiB/...) units; the lower-case
    # forms are of course incorrect and inofficial, but we support those too.
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for exp, letter in enumerate('KMGTPEZY', start=1):
        _UNIT_TABLE[letter + 'iB'] = 1024 ** exp
        _UNIT_TABLE[letter + 'B'] = 1000 ** exp
        _UNIT_TABLE[letter.lower() + 'B'] = 1024 ** exp
        _UNIT_TABLE[letter + 'b'] = 1000 ** exp

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if not m:
        return None

    # Accept ',' as a decimal separator as well.
    num = float(m.group('num').replace(',', '.'))
    return int(num * _UNIT_TABLE[m.group('unit')])
1215
1216
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1224
1225
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    try:
        return abbreviations.index(abbrev) + 1
    except ValueError:
        return None
1234
1235
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Leave existing entity and (hex) character references untouched.
    bare_amp = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_amp, '&amp;', xml_str)
1242
1243
def setproctitle(title):
    # Set the process name shown by ps/top via the Linux prctl syscall.
    # Silently does nothing when libc.so.6 cannot be loaded (non-glibc
    # platforms) or when the libc has no prctl.
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME (see prctl(2))
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1257
1258
def remove_start(s, start):
    """Return s with the prefix `start` removed, if present."""
    return s[len(start):] if s.startswith(start) else s
1263
1264
def remove_end(s, end):
    """Return s with the suffix `end` removed, if present.

    An empty `end` must be a no-op: the naive s[:-len(end)] would return
    '' because s[:-0] is s[:0].
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1269
1270
def url_basename(url):
    """Return the last path component of a URL ('http://x/a/b?q=1' -> 'b')."""
    parsed_path = compat_urlparse.urlparse(url).path
    components = parsed_path.strip('/').split('/')
    return components[-1]
1274
1275
class HEADRequest(compat_urllib_request.Request):
    # Request subclass that issues an HTTP HEAD instead of the default GET
    def get_method(self):
        return "HEAD"
1279
1280
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int scaled by invscale/scale; `default` for None/''.

    With get_attr set, the named attribute of v is converted instead.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    return int(v) * invscale // scale
1288
1289
def str_or_none(v, default=None):
    """Stringify v via compat_str, passing None through as `default`."""
    if v is None:
        return default
    return compat_str(v)
1292
1293
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and plus signs before converting.
    return int(re.sub(r'[,\.\+]', '', int_str))
1300
1301
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float scaled by invscale/scale; `default` when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
1304
1305
def parse_duration(s):
    """Parse a duration expression ('1:23:45', '3 min', 'PT2H10M', '5.3s',
    ...) into a number of seconds, or None when unrecognized."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None

    # Pure "N minutes" / "N hours" forms may be fractional.
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    # Otherwise sum up whichever integral components matched.
    duration = 0
    for group, multiplier in (
            ('secs', 1), ('mins_reversed', 60), ('mins', 60),
            ('hours', 60 * 60), ('hours_reversed', 60 * 60),
            ('days', 24 * 60 * 60)):
        if m.group(group):
            duration += int(m.group(group)) * multiplier
    if m.group('ms'):
        duration += float(m.group('ms'))
    return duration
1350
1351
def prepend_extension(filename, ext):
    """Insert `ext` before the real extension: ('a.mp4', 'temp') -> 'a.temp.mp4'."""
    base, real_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(base, ext, real_ext)
1355
1356
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        proc = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
1365
1366
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        proc = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out = proc.communicate()[0]
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1380
1381
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from program output, or `unrecognized`."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
1391
1392
class PagedList(object):
    # Abstract base; subclasses provide getslice(start, end)
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1397
1398
class OnDemandPagedList(PagedList):
    # Paged list that fetches pages lazily by calling pagefunc(pagenum),
    # which returns an iterable of at most pagesize entries.
    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        # Collect entries in [start, end) by querying pages on demand.
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset just past the last wanted entry, when end falls here
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1440
1441
class InAdvancePagedList(PagedList):
    """Paged list where the total number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        first_page = start // self._pagesize
        if end is None:
            last_page = self._pagecount
            remaining = None
        else:
            last_page = end // self._pagesize + 1
            remaining = end - start
        # Entries to drop from the first fetched page
        to_skip = start - first_page * self._pagesize
        for pagenum in range(first_page, last_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) < remaining:
                    remaining -= len(page)
                else:
                    res.extend(page[:remaining])
                    break
            res.extend(page)
        return res
1469
1470
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX escapes (8 hex digits) found in a string."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
1477
1478
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # Keep RFC 3986 reserved/unreserved punctuation intact.
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
1484
1485
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    escaped = parts._replace(
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment))
    return escaped.geturl()
1495
# Compatibility shims: some old interpreters reject text format strings.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Text format strings work; use the stdlib functions directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1512
1513
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, skipping comments and BOMs.

    Lines starting with '#', ';' or ']' are treated as comments; empty
    lines are dropped. The file object is closed afterwards.
    """
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM decodes to U+FEFF, not to the raw byte sequence, so
        # the original '\xef\xbb\xbf'-only check never matched decoded
        # input; strip both representations.
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
1528
1529
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
1532
1533
# Element.iter is unavailable on very old Pythons; emulate it with findall
try:
    etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError:  # Python <=2.6
    etree_iter = lambda n: n.findall('.//*')
1538
1539
def parse_xml(s):
    """Parse an XML string into an ElementTree element, ignoring any
    doctype and decoding byte text nodes on Python 2."""
    class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
    kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None and not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')
    return tree
1555
1556
# US content ratings mapped to the age limits used by parse_age_limit()
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1564
1565
def parse_age_limit(s):
    """Parse '18', '18+' or a US rating string ('PG-13') into an age, or None."""
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
1571
1572
def strip_jsonp(code):
    """Strip a JSONP wrapper: 'callback({...});' -> '{...}'."""
    wrapper_re = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(wrapper_re, r'\1', code)
1576
1577
def js_to_json(code):
    """Convert JavaScript-flavoured object notation into valid JSON."""
    def fix_kv(m):
        v = m.group(0)
        # Keywords and double-quoted strings are already valid JSON.
        if v in ('true', 'false', 'null') or v.startswith('"'):
            return v
        if v.startswith("'"):
            # Re-quote single-quoted strings, translating the escapes.
            v = v[1:-1]
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        # Bare identifiers (object keys) get double quotes too.
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas before closing brackets/braces.
    return re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
1601
1602
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality rank; unknown ids rank lowest.
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1611
1612
# Default output filename template: title, video id and extension
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1614
1615
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
1624
1625
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    return tuple(int(part) for part in re.split(r'[-.]', v))
1628
1629
def is_outdated_version(version, limit, assume_new=True):
    """Return True when `version` is strictly older than `limit`.

    Missing or unparsable versions yield `not assume_new`.
    """
    if not version:
        return not assume_new

    def _as_tuple(v):
        return tuple(int(e) for e in re.split(r'[-.]', v))

    try:
        return _as_tuple(version) < _as_tuple(limit)
    except ValueError:
        return not assume_new
1637
1638
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Updateable when running from a zip bundle or a frozen executable.
    if hasattr(sys, 'frozen'):
        return True
    return isinstance(globals().get('__loader__'), zipimporter)
1644
1645
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
1649
1650
def mimetype2ext(mt):
    """Map a MIME type to a file extension ('video/x-ms-wmv' -> 'wmv')."""
    subtype = mt.rpartition('/')[2]
    # Subtypes whose name is not itself the extension
    SPECIAL_CASES = {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
    }
    return SPECIAL_CASES.get(subtype, subtype)
1658
1659
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from a urllib response's headers."""
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    # Prefer an explicit filename from Content-Disposition ...
    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            ext = determine_ext(m.group('filename'), default_ext=None)
            if ext:
                return ext

    # ... otherwise fall back to the Content-Type.
    return mimetype2ext(getheader('Content-Type'))
1676
1677
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # No viewer limit set, or content available for everyone
        return False
    return age_limit < content_limit
1686
1687
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Longer BOMs first so UTF-32 is not mistaken for UTF-16.
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    decoded = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', decoded)
1706
1707
def determine_protocol(info_dict):
    """Work out the download protocol for an info dict."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    # Scheme-based protocols first ...
    for scheme in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(scheme):
            return scheme

    # ... then extension-based ones ...
    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    # ... and finally fall back to the URL scheme itself.
    return compat_urllib_parse_urlparse(url).scheme
1728
1729
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Pad every column but the last to its widest cell plus one space.
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*table)]
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    return '\n'.join(fmt % tuple(row) for row in table)
1736
1737
def _match_one(filter_part, dct):
    # Evaluate one filter expression (e.g. 'duration > 600', 'uploader = x'
    # or '!is_live') against the values in dct.
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    # key <op>[?] value — value is either a number (optionally with a size
    # suffix like 500KiB) or a bare alphanumeric string.
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            # Strings only support equality comparisons
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # Not a plain integer: try it as a file size, with and
                # without an implicit 'B' suffix (e.g. '500K' -> '500KB')
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # A '?' after the operator makes missing values pass the filter
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    # Unary form: 'key' (present) or '!key' (absent)
    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
1795
1796
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # All '&'-separated parts must match.
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
1802
1803
def match_filter_func(filter_str):
    """Build a match-filter callback: returns None to accept an entry, or a
    skip message when the filter does not match."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
1812
1813
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression ('12.3s' or 'HH:MM:SS.mmm') into
    seconds; empty input yields 0.0, unrecognized input None."""
    if not time_expr:
        return 0.0

    # Plain offset in seconds, optionally suffixed with 's'
    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    # Clock time: hours:minutes:seconds(.fraction)
    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if mobj:
        hours, minutes, seconds = mobj.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds)
1825
1826
def format_srt_time(seconds):
    """Format a float second count as an SRT timestamp 'HH:MM:SS,mmm'."""
    mins, secs = divmod(seconds, 60)
    hours, mins = divmod(mins, 60)
    millisecs = (secs - int(secs)) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, mins, int(secs), millisecs)
1833
1834
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup into SRT text."""
    _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})

    def parse_node(node):
        str_or_empty = functools.partial(str_or_none, default='')

        text = str_or_empty(node.text)
        for child in node:
            if child.tag == _x('ttml:br'):
                # Line breaks become newlines; keep any trailing text.
                text += '\n' + str_or_empty(child.tail)
            elif child.tag == _x('ttml:span'):
                text += str_or_empty(parse_node(child))
            else:
                # Unknown tags are kept as raw markup.
                text += str_or_empty(xml.etree.ElementTree.tostring(child))
        return text

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
    srt_chunks = []
    for index, para in enumerate(dfxp.findall(_x('.//ttml:p')), 1):
        srt_chunks.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            format_srt_time(parse_dfxp_time_expr(para.attrib.get('begin'))),
            format_srt_time(parse_dfxp_time_expr(para.attrib.get('end'))),
            parse_node(para)))
    return ''.join(srt_chunks)
1865
1866
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    # ProxyHandler that lets an individual request override the configured
    # proxy via the internal 'Ytdl-request-proxy' header.
    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    # Default arguments bind the current loop values of
                    # `type` and `self.proxy_open` into each lambda
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # The per-request header wins; strip it so it is never sent out
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)