]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
[utils] Clarify Python versions affected by buggy struct module
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import base64
7 import binascii
8 import calendar
9 import codecs
10 import contextlib
11 import ctypes
12 import datetime
13 import email.utils
14 import errno
15 import functools
16 import gzip
17 import itertools
18 import io
19 import json
20 import locale
21 import math
22 import operator
23 import os
24 import pipes
25 import platform
26 import re
27 import ssl
28 import socket
29 import struct
30 import subprocess
31 import sys
32 import tempfile
33 import traceback
34 import xml.etree.ElementTree
35 import zlib
36
37 from .compat import (
38 compat_HTMLParser,
39 compat_basestring,
40 compat_chr,
41 compat_etree_fromstring,
42 compat_html_entities,
43 compat_http_client,
44 compat_kwargs,
45 compat_parse_qs,
46 compat_socket_create_connection,
47 compat_str,
48 compat_urllib_error,
49 compat_urllib_parse,
50 compat_urllib_parse_urlparse,
51 compat_urllib_request,
52 compat_urlparse,
53 compat_xpath,
54 shlex_quote,
55 )
56
57
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default headers added to every outgoing HTTP request (see
# YoutubeDLHandler.http_request); the User-Agent mimics a desktop browser.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Sentinel distinguishing "no default argument supplied" from a default of None
NO_DEFAULT = object()

# English month names, used for locale-independent date handling
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Extensions accepted when guessing a media file extension from a URL
# (see determine_ext)
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')
91
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Probe the codec: an unknown/broken locale encoding raises here
        'TEST'.encode(encoding)
    except Exception:
        encoding = 'UTF-8'
    return encoding
105
106
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        # (bug fix: the lambdas previously ignored their argument and
        # closed over fn, which only worked by coincidence)
        path_basename = lambda f: os.path.basename(f).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best effort cleanup of the temporary file before re-raising
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
159
160
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # Only plain attribute names are safe to interpolate into the expression
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    # Python 2.6 ElementTree does not support attribute predicates in
    # find(); emulate them by scanning all matches manually.
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter
179
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into '{namespace-uri}tag'
    steps using the given prefix -> URI mapping."""
    expanded = []
    for component in path.split('/'):
        parts = component.split(':')
        if len(parts) == 1:
            expanded.append(parts[0])
        else:
            prefix, tag = parts
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
190
191
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching xpath, which may be a single
    expression or an iterable of alternatives tried in order.

    Returns the element; otherwise `default` when supplied, None when not
    fatal, or raises ExtractorError when fatal.
    """
    def _find_xpath(xp):
        return node.find(compat_xpath(xp))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        # Bug fix: initialize n so that an empty sequence of alternatives
        # falls through to the not-found handling instead of raising
        # UnboundLocalError below.
        n = None
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
213
214
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element, but return the matched element's text."""
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is not None:
        return n.text
    # Element found but it has no text content
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % name)
    return None
228
229
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Find the element matching xpath carrying attribute key and return
    that attribute's value."""
    n = find_xpath_attr(node, xpath, key)
    if n is not None:
        return n.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        name = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % name)
    return None
241
242
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper over get_element_by_attribute for the common id= case
    return get_element_by_attribute('id', id, html)
246
247
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s+%s=['"]?%s['"]?
        (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)
    if not m:
        return None

    content = m.group('content')
    # Strip a single level of surrounding quotes, if any
    if content.startswith(('"', "'")):
        content = content[1:-1]
    return unescapeHTML(content)
269
270
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        # Filled in by handle_starttag; empty until feed() sees a start tag
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # Keep the attributes of the (last seen) start tag
        self.attrs = dict(attrs)
279
280
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    # Drive the parser over the element's markup; per the examples above
    # the parser decodes entities and lowercases attribute names.
    parser.feed(html_element)
    parser.close()
    return parser.attrs
301
302
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Convenience for sanitizing descriptions etc.
    if html is None:
        return html

    # Newline vs <br />
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Strip html tags
    text = re.sub('<.*?>', '', text)
    # Replace html entities
    return unescapeHTML(text).strip()
318
319
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            # '-' means standard output; switch Windows stdout to binary
            # mode so media data is not mangled by newline translation
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission error will not be fixed by renaming; re-raise
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
350
351
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
359
360
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return char

    # Handle timestamps: turn e.g. 12:34:56 into 12_34_56 before the
    # per-character pass, so ':' is not treated as a drive separator there
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(c) for c in s)
    if not is_id:
        # Collapse runs of underscores and trim leading/trailing junk
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
397
398
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # No-op on non-Windows platforms
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        # Python < 2.7 splitdrive does not handle UNC paths
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace characters Windows forbids in path components (plus a
    # trailing dot or space, which Windows also rejects) with '#'
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
415
416
# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitized_Request(url, *args, **kwargs):
    # Drop-in replacement for compat_urllib_request.Request that fixes up
    # scheme-relative '//host/path' URLs before constructing the request
    return compat_urllib_request.Request(
        'http:%s' % url if url.startswith('//') else url, *args, **kwargs)
422
423
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # A list (not a set) is used for membership tests so that unhashable
    # elements keep working; order of first occurrence is preserved.
    seen = []
    for item in iterable:
        if item not in seen:
            seen.append(item)
    return seen
431
432
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Numeric entity: decimal (#123) or hexadecimal (#x7B)
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base, numstr = 16, '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
455
456
def unescapeHTML(s):
    """Replace HTML entities in s with the characters they denote."""
    if s is None:
        return None
    assert type(s) == compat_str

    def _replace(m):
        return _htmlentity_transform(m.group(1))

    return re.sub(r'&([^;]+);', _replace, s)
464
465
def get_subprocess_encoding():
    """Return the encoding used for subprocess argument passing."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding
476
477
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """
    assert type(s) == compat_str

    # Text is returned unchanged when a Unicode API is available:
    # - Python 3 has a Unicode API
    # - Windows 2000 and up accept '' directly (detecting NT 4 is tricky
    #   because 'major >= 4' would match Windows 9x; NT 4 is obsolete)
    # - Jython assumes filenames are Unicode strings though reported as
    #   Python 2.x compatible
    wants_text = (
        sys.version_info >= (3, 0)
        or (not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)
        or sys.platform.startswith('java'))
    if wants_text:
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
500
501
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename: turn a byte filename back into text."""
    # On Python 3, or when the value is already text, nothing to do
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
511
512
def encodeArgument(s):
    # Encode a command-line argument for subprocess use; delegates to
    # encodeFilename with for_subprocess=True
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
520
521
def decodeArgument(b):
    # Decode a subprocess argument back to text (see decodeFilename)
    return decodeFilename(b, True)
524
525
def decodeOption(optval):
    """Decode a command-line option value to text; None passes through."""
    if optval is None:
        return None
    decoded = (
        optval.decode(preferredencoding())
        if isinstance(optval, bytes) else optval)
    assert isinstance(decoded, compat_str)
    return decoded
534
535
def formatSeconds(secs):
    """Format a duration in seconds as H:MM:SS, M:SS or plain seconds.

    Uses >= on the unit boundaries so that exactly one hour renders as
    '1:00:00' (previously '60:00') and exactly one minute as '1:00'
    (previously '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
543
544
def make_HTTPS_handler(params, **kwargs):
    # Build a YoutubeDLHTTPSHandler with an SSL context appropriate for the
    # running Python version, honoring the 'nocheckcertificate' option
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname must be disabled before verify_mode can be
            # set to CERT_NONE
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # No usable SSLContext support; rely on default handler behavior
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
568
569
def bug_reports_message():
    """Build the standard bug-report footer appended to error messages."""
    update_cmd = (
        'type youtube-dl -U to update' if ytdl_is_updateable()
        else 'see https://yt-dl.org/update on how to update')
    return (
        '; please report this issue on https://yt-dl.org/bug .'
        ' Make sure you are using the latest version; %s.'
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        % update_cmd)
579
580
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # Network-level failures are always treated as expected
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True

        parts = []
        if video_id is not None:
            parts.append(video_id + ': ')
        parts.append(msg)
        if cause:
            parts.append(' (caused by %r)' % cause)
        if not expected:
            # Unexpected errors get the bug-report footer
            parts.append(bug_reports_message())
        super(ExtractorError, self).__init__(''.join(parts))

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
608
609
class UnsupportedError(ExtractorError):
    """Raised when no extractor supports the given URL."""
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        # Keep the offending URL for callers that want to inspect it
        self.url = url
615
616
class RegexNotFoundError(ExtractorError):
    """Error when a required regular expression did not match."""
    pass
620
621
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        # Original (exc_type, exc_value, traceback) tuple, or None
        self.exc_info = exc_info
634
635
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
643
644
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Pass msg to Exception so str(exc) and logging show the message
        # (previously Exception.__init__ was never called, so str() of the
        # error was empty); self.msg is kept for existing callers.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
654
655
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
659
660
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
668
669
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Give the exception a human-readable message (previously str()
        # of this error was empty); attributes are kept for callers.
        super(ContentTooShortError, self).__init__(
            'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
682
683
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Create an HTTP(S) connection, applying the handler's source_address
    # option and Python-version specific workarounds.
    #
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address support; monkey-patch connect() to bind the
            # socket to the requested local address manually
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
709
710
def handle_youtubedl_headers(headers):
    """Strip the internal 'Youtubedl-no-compression' marker header.

    When the marker is present, also drop any Accept-Encoding header so
    the request goes out uncompressed; otherwise return headers unchanged.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    filtered = dict(
        (key, value) for key, value in headers.items()
        if key.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
719
720
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        # ydl options dict; consulted by _create_http_connection
        self._params = params

    def http_open(self, req):
        # Route connection creation through _create_http_connection so the
        # source_address option and Python 2 workarounds apply
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        # Handle both raw deflate streams and zlib-wrapped ones
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        # Newer Pythons accept the response code in the constructor;
        # older ones need it assigned afterwards
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
            new_req = req_type(
                url_escaped, data=req.data, headers=req.headers,
                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
            new_req.timeout = req.timeout
            req = new_req

        # Add the default headers unless the caller already set them
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry decompression with up to 1023 trailing bytes removed
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
844
845
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that routes connection creation through
    _create_http_connection and forwards any configured SSL context."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        # Only pass context/check_hostname where the base handler set them
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
            req, **kwargs)
861
862
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor mirroring HTTPCookieProcessor; the commented-out
    code below documents a known Python 2 Set-Cookie encoding issue."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
885
886
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """
    if date_str is None:
        return None

    # Drop fractional seconds; %S does not accept them
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        # Default to UTC, then look for an explicit offset suffix
        timezone = datetime.timedelta()
        m = re.search(
            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if m:
            date_str = date_str[:-len(m.group(0))]
            if m.group('sign'):
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))
    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        # Unparseable date: fall through and return None implicitly
        pass
916
917
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""
    if date_str is None:
        return None

    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Ambiguous numeric dates are interpreted per day_first
    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M',
        '%b %dnd %Y %I:%M',
        '%b %dth %Y %I:%M',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ] + ([
        '%d-%m-%Y',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%d/%m/%Y %H:%M:%S',
    ] if day_first else [
        '%m-%d-%Y',
        '%m.%d.%Y',
        '%m/%d/%Y',
        '%m/%d/%y',
        '%m/%d/%Y %H:%M:%S',
    ])
    for fmt in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, fmt).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822-style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
982
983
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from a URL, falling back to default_ext."""
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    stripped = guess.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
995
996
def subtitles_filename(filename, sub_lang, sub_format):
    """Build the subtitle filename: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
999
1000
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        # Plain YYYYMMDD date
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # Crude approximations for months and years
    if unit == 'month':
        unit, amount = 'day', amount * 30
    elif unit == 'year':
        unit, amount = 'day', amount * 365
    return today + datetime.timedelta(**{unit + 's': amount})
1028
1029
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    mobj = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if mobj is None:
        # Not a plain YYYYMMDD string: leave it untouched
        return date_str
    return '-'.join(mobj.groups())
1038
1039
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Unbounded ends default to the representable date extremes
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1069
1070
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # On Python 2 this may come back as a byte string
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
1079
1080
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # C file descriptor -> Win32 standard handle ID
    # (1 = stdout -> STD_OUTPUT_HANDLE (-11), 2 = stderr -> STD_ERROR_HANDLE (-12))
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # WriteConsoleW only works on a real console handle; redirected
        # output (files, pipes) must fall back to the normal write path
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual
        # Plane, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write up to 1024 BMP characters per call; a leading non-BMP
        # character is written alone as two UTF-16 code units (count=2)
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1154
1155
def write_string(s, out=None, encoding=None):
    """Write the text string s to out (default sys.stderr), handling
    Windows-console and Python 2/3 encoding quirks."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        # Try the Win32 console API so non-ANSI characters survive
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        # Python 3 text stream: encode ourselves and write to the
        # underlying binary buffer so we control the encoding
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
1176
1177
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3: indexing bytes yields ints
        return list(bs)
    # Python 2: indexing a str yields 1-character strings
    return [ord(c) for c in bs]
1185
1186
def intlist_to_bytes(xs):
    """Convert a list of integer byte values back into a byte string."""
    if xs:
        return struct_pack('%dB' % len(xs), *xs)
    return b''
1191
1192
# Cross-platform file locking: _lock_file/_unlock_file are defined with
# a platform-appropriate implementation (Win32 LockFileEx, POSIX fcntl,
# or a stub raising IOError where neither is available).
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # Win32 OVERLAPPED structure, carries the lock offset for
    # LockFileEx/UnlockFileEx
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the largest possible byte range so the whole file is covered
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object for _unlock_file
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 = LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, is missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
1266
1267
class locked_file(object):
    """Context manager wrapping io.open() with an OS-level file lock
    (shared for plain reads, exclusive for writing/appending)."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Only 'r' takes a shared lock; 'a' and 'w' lock exclusively
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            # Locking failed: don't leak the open file handle
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1297
1298
def get_filesystem_encoding():
    """Return the filesystem encoding, defaulting to UTF-8 when unknown."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1302
1303
def shell_quote(args):
    """Return a single shell-escaped string built from the argument list."""
    encoding = get_filesystem_encoding()

    def as_text(a):
        # We may get a filename encoded with 'encodeFilename'
        return a.decode(encoding) if isinstance(a, bytes) else a

    return ' '.join(pipes.quote(as_text(a)) for a in args)
1313
1314
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The payload travels JSON-encoded inside the URL fragment
    smuggled = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + smuggled
1321
1322
def unsmuggle_url(smug_url, default=None):
    """Extract data previously smuggled into a URL by smuggle_url()."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    payload = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1330
1331
def format_bytes(bytes):
    """Format a byte count as a human-readable string, e.g. '1.00MiB'."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    # Pick the largest binary unit that keeps the mantissa >= 1
    exponent = 0 if bytes == 0.0 else int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
1344
1345
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' using unit_table multipliers; None on no match."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if m is None:
        return None
    # A comma is accepted as the decimal separator as well
    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
1355
1356
def parse_filesize(s):
    """Parse a human-readable file size like '5.6 MiB' or '300kB' into an
    integer number of bytes; return None when s is None or unparsable."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
1401
1402
def parse_count(s):
    """Parse a view/like count such as '1.2M' or '1,000' into an int;
    return None when s is None or unparsable."""
    if s is None:
        return None

    s = s.strip()

    # Plain numbers, possibly with thousands separators
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
1422
1423
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1431
1432
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [s[:3] for s in ENGLISH_MONTH_NAMES]
    if abbrev in abbreviations:
        return abbreviations.index(abbrev) + 1
    return None
1441
1442
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Leave already-valid entity and character references untouched
    bare_amp = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.sub(bare_amp, '&amp;', xml_str)
1449
1450
def setproctitle(title):
    """Set the Linux process name via libc prctl(PR_SET_NAME); best-effort
    no-op on platforms where that is unavailable."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        # No glibc (e.g. non-Linux): silently give up
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 == PR_SET_NAME
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1470
1471
def remove_start(s, start):
    """Strip the prefix `start` from `s` when present."""
    return s[len(start):] if s.startswith(start) else s
1476
1477
def remove_end(s, end):
    """Strip the suffix `end` from `s` when present."""
    return s[:-len(end)] if s.endswith(end) else s
1482
1483
def remove_quotes(s):
    """Strip one pair of matching surrounding quotes (single or double)."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1491
1492
def url_basename(url):
    """Return the last path component of a URL (query/fragment excluded)."""
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').rpartition('/')[2]
1496
1497
class HEADRequest(compat_urllib_request.Request):
    """A urllib Request that issues an HTTP HEAD instead of GET."""
    def get_method(self):
        return 'HEAD'
1501
1502
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce v to int, scaled by invscale/scale; return default on failure.

    When get_attr is given, the named attribute of v is converted instead.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '' or v is None:
        return default
    try:
        return int(v) * invscale // scale
    except ValueError:
        return default
1515
1516
def str_or_none(v, default=None):
    """Coerce v to compat_str; return default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1519
1520
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and stray '+' signs before converting
    return int(re.sub(r'[,\.\+]', '', int_str))
1527
1528
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce v to float, scaled by invscale/scale; default on failure."""
    if v is None:
        return default
    try:
        result = float(v) * invscale / scale
    except ValueError:
        return default
    return result
1536
1537
def parse_duration(s):
    """Parse a duration string such as '1:23:45', '90 min', '3h20m5s' or an
    ISO-8601-like 'PT1H2M3S' into seconds; return None when unparsable."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None
    res = 0
    # Minutes-only and hours-only forms (possibly fractional) return directly
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    # Otherwise accumulate seconds from every matched component
    if m.group('secs'):
        res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
    if m.group('mins'):
        res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
    if m.group('days'):
        res += int(m.group('days')) * 24 * 60 * 60
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
1582
1583
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the real extension (name.ext.real); if
    expected_real_ext is given and does not match, append instead."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        # Unexpected real extension: keep the filename intact and append
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1590
1591
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the extension with ext; if expected_real_ext is given and
    does not match the real one, append instead of replacing."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        name = filename
    return '{0}.{1}'.format(name, ext)
1597
1598
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    # NOTE: args is never mutated, so the mutable default is harmless here
    try:
        proc = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
1607
1608
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        # stderr is merged into stdout: many tools print their version there
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1622
1623
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from command output using version_re,
    returning `unrecognized` when nothing matches."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
1633
1634
class PagedList(object):
    """Base class for lazily paged result lists; subclasses must
    implement getslice(start, end)."""

    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1639
1640
class OnDemandPagedList(PagedList):
    """PagedList that fetches pages on demand via pagefunc(pagenum),
    optionally caching already-fetched pages."""

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        # Collect items [start:end] by walking pages starting from the
        # one that contains `start`
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset inside this page where the requested slice begins
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset inside this page where the requested slice ends
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1691
1692
class InAdvancePagedList(PagedList):
    """PagedList where the total page count is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Leading items of the first page that fall before `start`
        skip_elems = start - start_page * self._pagesize
        # Remaining number of wanted items (None = unbounded)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the requested slice
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1720
1721
def uppercase_escape(s):
    """Decode literal \\UXXXXXXXX (8-hex-digit) escapes embedded in s."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
1728
1729
def lowercase_escape(s):
    """Decode literal \\uXXXX (4-hex-digit) escapes embedded in s."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
1736
1737
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, quote() needs a byte string
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # The safe-characters list keeps all RFC 3986 reserved characters intact
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1743
1744
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parsed = compat_urllib_parse_urlparse(url)
    # The host is IDNA-encoded; every other component is percent-escaped
    return parsed._replace(
        netloc=parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(parsed.path),
        params=escape_rfc3986(parsed.params),
        query=escape_rfc3986(parsed.query),
        fragment=escape_rfc3986(parsed.fragment)
    ).geturl()
1755
# With unicode_literals in effect, format strings like '!I' are unicode;
# probe whether this interpreter's struct module accepts that.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument
    # See https://bugs.python.org/issue19099
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    # Unicode format strings work: use struct directly
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1773
1774
def read_batch_urls(batch_fd):
    """Read URLs from a batch file object, skipping a UTF-8 BOM, blank
    lines and comment lines; the file object is closed afterwards."""
    def sanitize(line):
        if not isinstance(line, compat_str):
            line = line.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if line.startswith(BOM_UTF8):
            line = line[len(BOM_UTF8):]
        line = line.strip()
        # '#', ';' and ']' introduce comment lines
        return False if line.startswith(('#', ';', ']')) else line

    with contextlib.closing(batch_fd) as fd:
        return [url for url in (sanitize(line) for line in fd) if url]
1789
1790
def urlencode_postdata(*args, **kargs):
    """URL-encode POST data and return it as ASCII bytes, as required for
    urllib request bodies."""
    return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
1793
1794
def update_url_query(url, query):
    """Return url with the `query` dict merged into its query string."""
    parsed = compat_urlparse.urlparse(url)
    params = compat_parse_qs(parsed.query)
    params.update(query)
    encoded = compat_urllib_parse.urlencode(encode_dict(params), True)
    return compat_urlparse.urlunparse(parsed._replace(query=encoded))
1802
1803
def encode_dict(d, encoding='utf-8'):
    """Encode every string key and value of d using `encoding`."""
    def encode(v):
        if isinstance(v, compat_basestring):
            return v.encode(encoding)
        return v
    return dict((encode(k), encode(v)) for k, v in d.items())
1808
1809
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up the first usable key among key_or_keys in d.

    None values are always skipped; with skip_false_values, all falsy
    values are skipped too. Returns `default` when nothing matches.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key in d and d[key] is not None:
            if not skip_false_values or d[key]:
                return d[key]
    return default
1818
1819
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Convert `string` to compat_str, decoding byte strings with `encoding`.

    NOTE(review): the default `encoding` is evaluated once at import time,
    not per call — confirm this is intended before changing it.
    """
    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
1822
1823
# US content rating strings mapped to the corresponding age limit
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1831
1832
def parse_age_limit(s):
    """Parse an age limit such as '18', '18+' or a rating from US_RATINGS.

    Returns the age as an int, or None when s is None or unrecognized.
    Integer inputs are passed through (previously they crashed re.match
    with a TypeError).
    """
    if s is None:
        return None
    if isinstance(s, int):
        # Tolerate callers that already supply a numeric age limit
        return s
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s)
1838
1839
def strip_jsonp(code):
    """Remove a JSONP callback wrapper, leaving only the JSON payload."""
    JSONP_RE = r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(JSONP_RE, r'\1', code)
1843
1844
def js_to_json(code):
    """Convert JavaScript object notation into valid JSON.

    Handles single-quoted strings, escaped quotes and bare identifiers
    used as keys/values; true/false/null pass through unchanged.
    """
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            # Inside double-quoted strings, \' is not valid JSON
            v = re.sub(r"\\'", "'", v[1:-1])
        elif v.startswith("'"):
            # Re-quote single-quoted strings with double quotes,
            # re-escaping as needed
            v = v[1:-1]
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas before a closing bracket/brace
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
    return res
1868
1869
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in the list is the quality rank; unknown ids rank lowest
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1878
1879
# Default output filename template
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1881
1882
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
1891
1892
def version_tuple(v):
    """Split a version string ('1.2.3' or '2016-03') into a tuple of ints."""
    return tuple(int(part) for part in re.split(r'[-.]', v))
1895
1896
def is_outdated_version(version, limit, assume_new=True):
    """Return True when version < limit; on missing/unparsable input,
    return the opposite of assume_new."""
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
1904
1905
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Only zip bundles and frozen executables are self-updateable
    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
1911
1912
def args_to_str(args):
    """Return a shell-quoted, space-joined representation of args."""
    # Get a short string representation for a subprocess command
    return ' '.join(shlex_quote(a) for a in args)
1916
1917
def error_to_compat_str(err):
    """Stringify an exception, decoding byte messages on Python 2."""
    msg = str(err)
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    if sys.version_info[0] < 3:
        msg = msg.decode(preferredencoding())
    return msg
1925
1926
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    Returns None when mt is None (e.g. a missing Content-Type header),
    instead of raising AttributeError on mt.rpartition.
    """
    if mt is None:
        return None

    # Full-type overrides come first
    ext = {
        'audio/mp4': 'm4a',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')

    # Otherwise map the subtype; unknown subtypes pass through unchanged
    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
    }.get(res, res)
1947
1948
def urlhandle_detect_ext(url_handle):
    """Guess the file extension of a response: first from the
    Content-Disposition filename, then from the Content-Type."""
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
1965
1966
def encode_data_uri(data, mime_type):
    """Encode raw bytes as a base64 'data:' URI with the given MIME type."""
    b64 = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, b64)
1969
1970
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # No viewer limit set, or content available for everyone
        return False
    return age_limit < content_limit
1979
1980
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    # Byte-order marks, longest first so UTF-32 is not mistaken for UTF-16
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    text = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            text = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if text is None:
        text = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', text)
1999
2000
def determine_protocol(info_dict):
    """Determine the download protocol for an info dict, preferring an
    explicit 'protocol' entry, then URL scheme/extension heuristics."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
2021
2022
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    # Left-pad every column (except the last) to its widest cell plus one
    table = [header_row] + data
    max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
    format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
    return '\n'.join(format_str % tuple(row) for row in table)
2029
2030
def _match_one(filter_part, dct):
    """Evaluate a single filter clause against dct.

    Supports comparisons ('duration > 60', 'uploader = foo', with an
    optional '?' making missing fields match) and unary presence tests
    ('is_live', '!is_live'). Raises ValueError on invalid clauses.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # Not a plain integer: try a file-size suffix ('500KiB'),
                # then again with an implied trailing 'B'
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # A trailing '?' after the operator makes a missing field match
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
2088
2089
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&' joins clauses conjunctively; each clause is handled by _match_one
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2095
2096
def match_filter_func(filter_str):
    """Build a match-filter callable: returns None when the video passes,
    otherwise a human-readable skip reason."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2105
2106
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds; None if unparsable."""
    if not time_expr:
        return

    # Plain offset in seconds, optionally suffixed with 's'
    m = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if m:
        return float(m.group('time_offset'))

    # Clock format H:MM:SS(.mmm) — a ':' before the fraction also counts
    m = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if m:
        hours, minutes, secs = m.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(secs.replace(':', '.'))
2118
2119
def srt_subtitles_timecode(seconds):
    """Format a number of seconds as an SRT timecode: HH:MM:SS,mmm."""
    hrs = seconds / 3600
    mins = (seconds % 3600) / 60
    secs = seconds % 60
    msecs = (seconds % 1) * 1000
    # %d truncates the float components to integers
    return '%02d:%02d:%02d,%03d' % (hrs, mins, secs, msecs)
2122
2123
def dfxp2srt(dfxp_data):
    """Convert DFXP/TTML subtitle markup (a text string) into SRT format."""
    # Support both the final TTML namespace and the older 2006 draft one
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    })

    class TTMLPElementParser(object):
        # XMLParser target that flattens a <p> element into plain text,
        # turning <br/> tags into newlines
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Derive the end from an explicit duration when 'end' is absent
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2176
2177
def cli_option(params, command_option, param):
    """Return [command_option, value] when `param` is set in `params`
    (even to a falsy value), otherwise an empty list."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2181
2182
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Map a boolean entry of `params` to its command-line form.

    Returns e.g. ['--option', 'true'] or, when `separator` is given,
    ['--option=true'].  A missing or None value yields [] (the option
    is simply not emitted) instead of tripping the bool assertion, so
    callers need not pre-check the key.
    """
    param = params.get(param)
    if param is None:
        # Option not configured at all: emit nothing.
        return []
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]
2189
2190
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Return [command_option] when params[param] equals
    `expected_value`, otherwise an empty list."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2194
2195
def cli_configuration_args(params, param, default=[]):
    """Return the list of extra command-line args stored under `param`.

    Falls back to `default` when the key is absent or None; a present
    value must already be a list.
    """
    extra_args = params.get(param)
    if extra_args is not None:
        assert isinstance(extra_args, list)
        return extra_args
    return default
2202
2203
class ISO639Utils(object):
    """Two-way mapping between ISO 639-1 (two-letter) and ISO 639-2/T
    (three-letter) language codes, backed by a static table."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are looked up, so region
        # variants such as 'en-US' resolve to their base language.
        # Returns None for unknown codes.
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Linear reverse scan of the table; falls through and returns
        # None implicitly when `code` has no two-letter equivalent.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
2404
2405
class ISO3166Utils(object):
    """Lookup of full English country names from ISO 3166-1 alpha-2
    country codes, backed by a static table."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Case-insensitive lookup; returns None for unknown codes.
        return cls._country_map.get(code.upper())
2664
2665
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler variant that lets each request override the proxy
    via a 'Ytdl-request-proxy' header; the special value '__noproxy__'
    disables proxying for that request."""

    def __init__(self, proxies=None):
        # Set default handlers
        # Install http_open/https_open handlers that always route
        # through proxy_open, so the per-request header is consulted
        # even when no handler-level proxy is configured.  The keyword
        # defaults (proxy=..., type=type, meth=...) deliberately bind
        # the current loop values early — a plain closure would see
        # only the final value of `type`.
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy (from the Ytdl-request-proxy header)
        # takes precedence over the handler-level one; the internal
        # header is stripped before the request goes out.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
2685
2686
def ohdave_rsa_encrypt(data, exponent, modulus):
    """Encrypt `data` with OHDave's RSA scheme.

    See http://www.ohdave.com/rsa/

    data: the plaintext, a bytes-like object (single block only)
    exponent, modulus: the RSA public parameters e and N (integers)
    Returns the ciphertext as a lowercase hex string.
    """
    # OHDave's implementation interprets the input as a little-endian
    # integer, hence the byte reversal before the hex conversion.
    plaintext_int = int(binascii.hexlify(data[::-1]), 16)
    ciphertext_int = pow(plaintext_int, exponent, modulus)
    return '%x' % ciphertext_int
2702
2703
def encode_base_n(num, n, table=None):
    """Encode a non-negative integer `num` as a string in base `n`.

    table: optional custom digit alphabet; defaults to the first `n`
    characters of 0-9a-zA-Z.
    Raises ValueError if `n` exceeds the table length or `num` is
    negative (the unguarded loop would never terminate on negatives,
    since floor division converges to -1, not 0).
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num < 0:
        # Fail loudly instead of looping forever.
        raise ValueError('cannot encode negative number %d' % num)

    if num == 0:
        return table[0]

    digits = []
    while num:
        num, rem = divmod(num, n)
        digits.append(table[rem])
    # Digits were produced least-significant first
    return ''.join(reversed(digits))
2720
2721
def decode_packed_codes(code):
    """Undo the common JavaScript "p.a.c.k.e.r" obfuscation.

    Extracts the packed source, radix, symbol count and symbol list
    from `code`, then substitutes every word token of the packed
    source with its mapped symbol.
    """
    mobj = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    symbol_table = {}
    for idx in range(count - 1, -1, -1):
        key = encode_base_n(idx, base)
        # An empty symbol entry means the token stands for itself
        symbol_table[key] = symbols[idx] or key

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)