]> jfr.im git - yt-dlp.git/blob - youtube_dl/utils.py
[utils] Remove AM/PM from unified_strdate patterns
[yt-dlp.git] / youtube_dl / utils.py
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3
4 from __future__ import unicode_literals
5
6 import base64
7 import binascii
8 import calendar
9 import codecs
10 import contextlib
11 import ctypes
12 import datetime
13 import email.utils
14 import errno
15 import functools
16 import gzip
17 import itertools
18 import io
19 import json
20 import locale
21 import math
22 import operator
23 import os
24 import pipes
25 import platform
26 import re
27 import ssl
28 import socket
29 import struct
30 import subprocess
31 import sys
32 import tempfile
33 import traceback
34 import xml.etree.ElementTree
35 import zlib
36
37 from .compat import (
38 compat_basestring,
39 compat_chr,
40 compat_etree_fromstring,
41 compat_html_entities,
42 compat_http_client,
43 compat_kwargs,
44 compat_parse_qs,
45 compat_socket_create_connection,
46 compat_str,
47 compat_urllib_error,
48 compat_urllib_parse,
49 compat_urllib_parse_urlparse,
50 compat_urllib_request,
51 compat_urlparse,
52 shlex_quote,
53 )
54
55
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default headers added to every HTTP request (see YoutubeDLHandler.http_request)
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Sentinel for "no default was supplied", so that None can itself be a
# valid default value (see xpath_element and friends)
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# Extensions accepted by determine_ext() even when followed by a trailing '/'
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')
89
def preferredencoding():
    """Return an encoding name that is usable on this system.

    Probes locale.getpreferredencoding() by actually encoding a test
    string with it; any failure falls back to UTF-8.
    """
    try:
        encoding = locale.getpreferredencoding()
        'TEST'.encode(encoding)
    except Exception:
        encoding = 'UTF-8'
    return encoding
103
104
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    # Create the temporary file next to the target so the final os.rename
    # happens within one directory/filesystem
    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best-effort cleanup of the temporary file; re-raise the original error
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
157
158
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # Guard against injecting arbitrary expressions via key/val
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val:
            assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] (manual scan for Python 2.6) """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        for f in node.findall(xpath):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None
180
181 # On python2.6 the xml.etree.ElementTree.Element methods don't support
182 # the namespace parameter
183
184
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into '{uri}tag' form using ns_map."""
    expanded = []
    for step in path.split('/'):
        pieces = step.split(':')
        if len(pieces) == 1:
            expanded.append(pieces[0])
        else:
            prefix, tag = pieces
            expanded.append('{%s}%s' % (ns_map[prefix], tag))
    return '/'.join(expanded)
195
196
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching xpath (a string, or an iterable of
    candidate xpaths tried in order).

    Returns `default` when given and nothing matched; raises ExtractorError
    when `fatal` is set and no default was supplied; otherwise returns None.
    """
    def _find_xpath(xpath):
        if sys.version_info < (2, 7):  # Crazy 2.6
            xpath = xpath.encode('ascii')
        return node.find(xpath)

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
220
221
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element, but return the matched element's .text.

    A matched element with no text is treated like a miss: `default` is
    returned when given, ExtractorError raised when `fatal`, else None.
    """
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text
235
236
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Find xpath[@key] via find_xpath_attr and return the attribute value.

    Follows the same default/fatal semantics as xpath_element/xpath_text.
    """
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]
248
249
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # NOTE: `id` shadows the builtin, but renaming it would break keyword callers
    return get_element_by_attribute('id', id, html)
253
254
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # Match an opening tag carrying attribute=value (quoted or not), capture
    # everything up to the matching close tag via the \1 backreference
    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if not m:
        return None
    res = m.group('content')

    # Strip a surrounding quote pair, if any
    if res.startswith('"') or res.startswith("'"):
        res = res[1:-1]

    return unescapeHTML(res)
276
277
def clean_html(html):
    """Clean an HTML snippet into a readable string.

    None is passed through unchanged for convenience.
    """

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />; paragraph boundaries also become newlines
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()
293
294
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            # '-' means stdout; on Windows switch it to binary mode so the
            # written data is not newline-mangled
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission errors cannot be fixed by renaming; give up immediately
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
325
326
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp.

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
334
335
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def _sanitize_char(char):
        code = ord(char)
        # Control characters, DEL and '?' are always dropped
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return char

    # Handle timestamps: keep 12:34:56 readable by mapping ':' to '_'
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(_sanitize_char(c) for c in s)
    if not is_id:
        # Collapse runs of underscores and trim cruft from the edges
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
372
373
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows; no-op on other platforms."""
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        # splitunc() detects \\server\share prefixes on pre-2.7 Pythons
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    # Replace forbidden characters (and trailing dot/whitespace) in each
    # path component with '#', keeping '.'/'..' navigation parts intact
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
390
391
392 # Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
393 # unwanted failures due to missing protocol
def sanitized_Request(url, *args, **kwargs):
    """Build a Request, prepending 'http:' to protocol-less '//' URLs."""
    if url.startswith('//'):
        url = 'http:%s' % url
    return compat_urllib_request.Request(url, *args, **kwargs)
397
398
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # List membership (not a set) so that unhashable elements also work
    deduped = []
    for item in iterable:
        if item not in deduped:
            deduped.append(item)
    return deduped
406
407
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    Handles named entities, decimal (&#123;) and hexadecimal (&#x7b;)
    character references; unknown entities are returned literally.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
430
431
def unescapeHTML(s):
    """Replace all HTML entities in s with their characters (None passes through)."""
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)
439
440
def get_subprocess_encoding():
    """Return the encoding used to exchange data with subprocesses."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
451
452
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    @param for_subprocess Encode for passing to a subprocess (skips the
           Windows wide-API shortcut below)
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
471
472
def decodeFilename(b, for_subprocess=False):
    """Decode a byte filename back to text on Python 2; pass through on Python 3
    and for values that are not bytes."""
    if sys.version_info >= (3, 0):
        return b
    if not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
482
483
def encodeArgument(s):
    """Encode a subprocess argument like a filename (for_subprocess=True)."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)
491
492
def decodeArgument(b):
    """Inverse of encodeArgument (see decodeFilename with for_subprocess=True)."""
    return decodeFilename(b, True)
495
496
def decodeOption(optval):
    """Return a command-line option value as text.

    None passes through; bytes are decoded with the preferred encoding.
    """
    if optval is None:
        return None
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())
    assert isinstance(optval, compat_str)
    return optval
505
506
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    Uses >= at the unit boundaries so that exactly one hour renders as
    '1:00:00' (not '60:00') and exactly one minute as '1:00' (not '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
514
515
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler honouring the 'nocheckcertificate' option,
    picking the best SSL context the running Python supports."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
539
540
def bug_reports_message():
    """Return the standard bug-report footer appended to unexpected errors."""
    if ytdl_is_updateable():
        update_cmd = 'type  youtube-dl -U  to update'
    else:
        update_cmd = 'see  https://yt-dl.org/update  on how to update'
    parts = [
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ]
    return ''.join(parts)
550
551
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network-level failures are always "expected" (no bug-report footer)
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback as a string, or None if there is none."""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
579
580
class UnsupportedError(ExtractorError):
    """Raised when no extractor supports the given URL (always 'expected')."""
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url
586
587
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass
591
592
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
605
606
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass
614
615
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Initialise the base class too so str(exc) / logging show the
        # message instead of an empty string; .msg is kept for existing
        # callers that read it directly.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
625
626
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass
630
631
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass
639
640
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Populate the base class so str(exc) is informative instead of empty
        super(ContentTooShortError, self).__init__(
            'Downloaded %d bytes, expected %d bytes' % (downloaded, expected))
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
653
654
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Construct an HTTP(S) connection, applying the configured source_address.

    Used as the connection factory passed to urllib's do_open().
    """
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address support: replace connect() with one that
            # builds the socket bound to the requested address itself
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
680
681
def handle_youtubedl_headers(headers):
    """Strip youtube-dl's internal control headers before a real request.

    The pseudo-header 'Youtubedl-no-compression' causes any Accept-Encoding
    header to be dropped, and is itself removed as well.  When it is absent
    the original mapping is returned untouched.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    filtered = dict(
        (key, value) for key, value in headers.items()
        if key.lower() != 'accept-encoding')
    del filtered['Youtubedl-no-compression']
    return filtered
690
691
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req using _create_http_connection as the connection factory."""
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying raw deflate first, then zlib-wrapped."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response; when getcode is unavailable the
        status code is attached manually after construction."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Escape the URL, add std_headers and strip internal headers."""
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
            new_req = req_type(
                url_escaped, data=req.data, headers=req.headers,
                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
            new_req.timeout = req.timeout
            req = new_req

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Transparently decompress gzip/deflate bodies and escape Location headers."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
815
816
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that routes connections through _create_http_connection,
    forwarding the SSL context / hostname-check settings when available."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
            req, **kwargs)
832
833
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor; currently a thin wrapper kept as the hook point for
    the (disabled) Set-Cookie escaping workaround below."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
856
857
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # strptime below has no %f slot here; drop fractional seconds up front
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        tz_m = re.search(
            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if not tz_m:
            timezone = datetime.timedelta()
        else:
            date_str = date_str[:-len(tz_m.group(0))]
            if not tz_m.group('sign'):
                # A bare 'Z' suffix means UTC
                timezone = datetime.timedelta()
            else:
                direction = 1 if tz_m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=direction * int(tz_m.group('hours')),
                    minutes=direction * int(tz_m.group('minutes')))
    try:
        fmt = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, fmt) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        return None
887
888
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Commas never carry date information; treat them as spaces
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M',
        '%b %dnd %Y %I:%M',
        '%b %dth %Y %I:%M',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    # Ambiguous numeric dates are interpreted per the day_first hint
    if day_first:
        format_expressions.extend([
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    # Try every expression (no early exit: a later matching format wins,
    # exactly as before)
    for fmt in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, fmt).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return compat_str(upload_date) if upload_date is not None else None
953
954
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, returning default_ext when
    nothing plausible is found."""
    if url is None:
        return default_ext
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    stripped = candidate.rstrip('/')
    if stripped in KNOWN_EXTENSIONS:
        return stripped
    return default_ext
966
967
def subtitles_filename(filename, sub_lang, sub_format):
    """Derive the subtitle filename: <base>.<language>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
970
971
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        # Plain absolute date
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # A bad approximation? months and years become fixed day counts
    if unit == 'month':
        unit = 'day'
        amount *= 30
    elif unit == 'year':
        unit = 'day'
        amount *= 365
    return today + datetime.timedelta(**{unit + 's': amount})
999
1000
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    m = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    # Anything not matching YYYYMMDD is passed through unchanged
    return '-'.join(m.groups()) if m is not None else date_str
1009
1010
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        # Missing bounds default to the widest possible range
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
1040
1041
def platform_name():
    """Return the platform description as a compat_str."""
    name = platform.platform()
    if isinstance(name, bytes):
        # Python 2 may hand back a locale-encoded byte string
        name = name.decode(preferredencoding())
    assert isinstance(name, compat_str)
    return name
1050
1051
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070
    # Writes Unicode to a real Windows console via WriteConsoleW, which
    # handles characters the console codepage cannot represent.

    import ctypes
    import ctypes.wintypes

    # fileno -> GetStdHandle id (stdout = -11, stderr = -12)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # Redirected/piped handles must fall back to the regular code path
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane
        # (surrogate pairs need special write handling), or len(s)
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP char: write it alone
        # as its two UTF-16 code units
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1125
1126
def write_string(s, out=None, encoding=None):
    # Write a text string to `out` (default: stderr), coping with the
    # Windows console, byte streams, and Python 2/3 differences.
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        # Try the WriteConsoleW path first so non-codepage chars survive
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        # Text stream with an underlying binary buffer: encode explicitly
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()
1147
1148
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    if isinstance(bs[0], int):
        # Python 3: indexing bytes already yields ints
        return list(bs)
    return [ord(c) for c in bs]
1156
1157
def intlist_to_bytes(xs):
    # Inverse of bytes_to_intlist: pack a list of byte values (0-255)
    # back into a byte string via the struct_pack compat shim.
    if not xs:
        return b''
    return struct_pack('%dB' % len(xs), *xs)
1162
1163
# Cross-platform file locking
# On Windows, LockFileEx/UnlockFileEx are called through ctypes on the
# CRT file handle; elsewhere POSIX fcntl.flock is used. Both branches
# expose the same _lock_file(f, exclusive) / _unlock_file(f) interface.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        # Matches the Win32 OVERLAPPED struct used by LockFileEx
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the whole file: byte range 0 .. 0x7fffffff_ffffffff
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the pointer alive on the file object for _unlock_file
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 == LOCKFILE_EXCLUSIVE_LOCK
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)
1227
1228
class locked_file(object):
    """File wrapper that holds an OS-level lock while used as a context
    manager. 'r' mode takes a shared lock; 'a'/'w' take an exclusive one."""

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            # Could not acquire the lock; don't leak the open handle
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
1258
1259
def get_filesystem_encoding():
    """Return the file system encoding, defaulting to utf-8 when undetectable."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding
1263
1264
def shell_quote(args):
    """Return a single shell-ready string with every argument quoted."""
    encoding = get_filesystem_encoding()

    def to_text(a):
        # Filenames produced by encodeFilename may arrive as bytes
        return a.decode(encoding) if isinstance(a, bytes) else a

    return ' '.join(pipes.quote(to_text(a)) for a in args)
1274
1275
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    smuggled = compat_urllib_parse.urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return '%s#%s' % (url, smuggled)
1282
1283
def unsmuggle_url(smug_url, default=None):
    """Extract data embedded by smuggle_url; returns (url, data)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    payload = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(payload)
1291
1292
def format_bytes(bytes):
    """Render a byte count as a human-readable string, e.g. '1.00MiB'.

    Accepts None ('N/A'), numeric strings, ints and floats."""
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    SUFFIXES = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        # Clamp so absurdly large values cannot index past 'YiB'
        exponent = min(int(math.log(bytes, 1024.0)), len(SUFFIXES) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, SUFFIXES[exponent])
1305
1306
def parse_filesize(s):
    """Parse a human file size like '5.5 MiB' or '200KB' into a byte count,
    or return None when s is None or unparseable."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
    }

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if not m:
        return None

    # Accept ',' as a decimal separator (e.g. European notation)
    num_str = m.group('num').replace(',', '.')
    mult = _UNIT_TABLE[m.group('unit')]
    return int(float(num_str) * mult)
1359
1360
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """
    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1368
1369
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbreviations = [s[:3] for s in ENGLISH_MONTH_NAMES]
    if abbrev in abbreviations:
        return abbreviations.index(abbrev) + 1
    return None
1378
1379
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' (not already part of an entity) with '&amp;'."""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;', xml_str)
1386
1387
def setproctitle(title):
    # Set the process name shown by tools like ps. Works only where
    # libc.so.6 with prctl() is available (Linux); no-op elsewhere.
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 is PR_SET_NAME (see prctl(2))
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
1401
1402
def remove_start(s, start):
    """Strip the prefix `start` from s when present."""
    return s[len(start):] if s.startswith(start) else s
1407
1408
def remove_end(s, end):
    """Strip the suffix `end` from s when present.

    Guards against an empty suffix: ''.endswith is always True and
    s[:-0] would wrongly truncate s to ''."""
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1413
1414
def remove_quotes(s):
    """Drop one pair of matching single or double quotes around s, if any."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1422
1423
def url_basename(url):
    """Return the final path segment of a URL (query/fragment excluded)."""
    path = compat_urlparse.urlparse(url).path
    segments = path.strip('/').split('/')
    return segments[-1]
1427
1428
class HEADRequest(compat_urllib_request.Request):
    # Request subclass that issues HEAD instead of GET, e.g. to probe
    # response headers without downloading the body.
    def get_method(self):
        return 'HEAD'
1432
1433
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce v to int (scaled by invscale/scale), or default on failure.

    When get_attr is given, v is replaced by getattr(v, get_attr, None)."""
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except ValueError:
        return default
1446
1447
def str_or_none(v, default=None):
    # Stringify v via compat_str, passing None through as `default`.
    return default if v is None else compat_str(v)
1450
1451
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    if isinstance(int_str, int):
        # Already numeric; re.sub would raise TypeError on it
        return int_str
    # Drop thousands separators and stray '+' signs, e.g. '1,000' -> 1000
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)
1458
1459
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce v to float (scaled by invscale/scale), or default on failure."""
    if v is None:
        return default
    try:
        result = float(v) * invscale / scale
    except ValueError:
        return default
    return result
1467
1468
def parse_duration(s):
    """Parse a duration string ('4 min', '01:02:03.05', '3h 11m 53s', ...)
    into seconds, or return None when it cannot be parsed."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    # Optional ISO-8601-style 'PT'/'T' prefix, then either a bare
    # minutes/hours figure or a days/hours/minutes/seconds combination
    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None
    res = 0
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    if m.group('secs'):
        res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
    if m.group('mins'):
        res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
    if m.group('days'):
        res += int(m.group('days')) * 24 * 60 * 60
    if m.group('ms'):
        # Fractional seconds, e.g. the '.05' of '01:02:03.05'
        res += float(m.group('ms'))
    return res
1513
1514
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the real extension: 'a.mp4' + 'temp' -> 'a.temp.mp4'.

    When expected_real_ext is given and does not match, append instead."""
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
1521
1522
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap the file extension for ext: 'a.mp4' -> 'a.webm'.

    When expected_real_ext is given and does not match, append instead."""
    name, real_ext = os.path.splitext(filename)
    keep_full = expected_real_ext and real_ext[1:] != expected_real_ext
    return '{0}.{1}'.format(filename if keep_full else name, ext)
1528
1529
def check_executable(exe, args=[]):
    """Return exe when it can be launched from PATH, else False.

    args may hold flags that keep the probe's output short (like -version)."""
    cmd = [exe] + args
    try:
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
1538
1539
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        proc = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = proc.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1553
1554
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version identifier from program output via version_re."""
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
1564
1565
class PagedList(object):
    # Base class for lazily fetched, paginated result lists; subclasses
    # implement getslice(start, end).
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1570
1571
class OnDemandPagedList(PagedList):
    # Paged list that fetches pages lazily through pagefunc(pagenum),
    # optionally memoizing already fetched pages.
    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        # Collect results[start:end] by walking pages starting from the
        # first page that intersects the requested range.
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            # Offset of `start` within this page (0 for later pages)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # One-past-the-last offset of `end` within this page, if any
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
1622
1623
class InAdvancePagedList(PagedList):
    # Paged list where the total number of pages is known up front.
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Items of the first page that fall before `start`
        skip_elems = start - start_page * self._pagesize
        # Total number of items still wanted (None = all)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the requested slice
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1651
1652
def uppercase_escape(s):
    """Decode \\UXXXXXXXX escapes (8 hex digits) into their characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: decode(m.group(0))[0],
        s)
1659
1660
def lowercase_escape(s):
    """Decode \\uXXXX escapes (4 hex digits) into their characters."""
    decode = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: decode(m.group(0))[0],
        s)
1667
1668
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        # Python 2's quote() cannot handle unicode input directly
        s = s.encode('utf-8')
    # The safe-list keeps all RFC 3986 reserved/sub-delim characters intact
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1674
1675
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    return parts._replace(
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment),
    ).geturl()
1685
# Feature probe: on Python 2.6 (and some 2.7 versions) struct.pack
# rejects unicode format strings, so wrap it to encode them to bytes.
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
1702
1703
def read_batch_urls(batch_fd):
    """Read one URL per line from batch_fd, dropping comments and blanks."""
    BOM_UTF8 = '\xef\xbb\xbf'

    def fixup(line):
        if not isinstance(line, compat_str):
            line = line.decode('utf-8', 'replace')
        if line.startswith(BOM_UTF8):
            line = line[len(BOM_UTF8):]
        line = line.strip()
        # Comment markers accepted in batch files
        if line.startswith(('#', ';', ']')):
            return False
        return line

    with contextlib.closing(batch_fd) as fd:
        return [u for u in map(fixup, fd) if u]
1718
1719
def urlencode_postdata(*args, **kargs):
    # urlencode form data and ASCII-encode it for use as a POST body
    # (a bytes payload is required on Python 3).
    return compat_urllib_parse.urlencode(*args, **kargs).encode('ascii')
1722
1723
def encode_dict(d, encoding='utf-8'):
    """Encode every textual key and value of d to bytes."""
    def maybe_encode(x):
        if isinstance(x, compat_basestring):
            return x.encode(encoding)
        return x
    return dict((maybe_encode(k), maybe_encode(v)) for k, v in d.items())
1728
1729
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up a key, or the first usable key of a list/tuple, in d.

    None values are always skipped; other falsy values are skipped too
    unless skip_false_values is False."""
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d or d[key] is None or skip_false_values and not d[key]:
            continue
        return d[key]
    return default
1738
1739
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    # Coerce any object to compat_str, decoding byte strings with `encoding`.
    # NOTE(review): the default `encoding` is evaluated once, at import time.
    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
1742
1743
# US MPAA content rating -> minimum viewer age, used by parse_age_limit
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
1751
1752
def parse_age_limit(s):
    """Parse an age limit like '18', '18+' or a US rating ('PG-13') to an int.

    Returns None for None input, or when nothing matches."""
    if s is None:
        return None
    if isinstance(s, int):
        # Already numeric; re.match would raise TypeError on it
        return s
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s)
1758
1759
def strip_jsonp(code):
    """Strip a JSONP wrapper, e.g. 'cb({...});' -> '{...}'."""
    return re.sub(
        r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$',
        r'\1', code)
1763
1764
def js_to_json(code):
    """Rewrite a JavaScript object literal into (mostly) valid JSON:
    single-quoted strings become double-quoted, bare identifiers are
    quoted, and trailing commas are removed."""
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            # Already double-quoted; just unescape \' which JSON forbids
            v = re.sub(r"\\'", "'", v[1:-1])
        elif v.startswith("'"):
            # Re-escape the body for double quoting
            v = v[1:-1]
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        return '"%s"' % v

    # Match double-quoted strings, single-quoted strings, or bare words
    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas before closing brackets/braces
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
    return res
1788
1789
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1798
1799
# Default output filename template (see the --output option)
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
1801
1802
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
1811
1812
def version_tuple(v):
    """Split a version like '2016.01.01' or '1.2-3' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
1815
1816
def is_outdated_version(version, limit, assume_new=True):
    """True when `version` sorts before `limit`; unknown versions follow
    the assume_new flag."""
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        # Unparseable versions are treated like missing ones
        return not assume_new
1824
1825
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter
    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
1831
1832
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(shlex_quote(a) for a in args)
1836
1837
def error_to_compat_str(err):
    """Stringify an exception, decoding Python 2 byte messages properly."""
    err_str = str(err)
    if sys.version_info[0] < 3:
        # On python 2 error byte string must be decoded with proper
        # encoding rather than ascii
        err_str = err_str.decode(preferredencoding())
    return err_str
1845
1846
def mimetype2ext(mt):
    """Map a MIME type to a conventional file extension.

    Returns None for None input instead of crashing on rpartition."""
    if mt is None:
        return None

    # Full-type overrides that the subtype mapping below cannot express
    ext = {
        'audio/mp4': 'm4a',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')

    # Subtype -> extension; unknown subtypes map to themselves
    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
    }.get(res, res)
1867
1868
def urlhandle_detect_ext(url_handle):
    # Guess a file extension for an HTTP response: prefer the filename in
    # Content-Disposition, fall back to the Content-Type MIME mapping.
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
1885
1886
def encode_data_uri(data, mime_type):
    """Wrap raw bytes into a base64 'data:' URI."""
    encoded = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, encoded)
1889
1890
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # No viewer limit set, or content playable by everyone
        return False
    return age_limit < content_limit
1899
1900
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    decoded = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', decoded)
1919
1920
def determine_protocol(info_dict):
    # Work out the download protocol for a format dict: an explicit
    # 'protocol' field wins, then the URL scheme prefix, then the
    # file extension, then the generic parsed scheme.
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme
1941
1942
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    # Widest cell per column determines that column's padding
    max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
    # Left-align every column except the last, which is unpadded
    format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
    return '\n'.join(format_str % tuple(row) for row in table)
1949
1950
def _match_one(filter_part, dct):
    """Evaluate one --match-filter clause (e.g. 'duration > 600' or
    '!is_live') against dct."""
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    # key OP value, where '?' after OP means "None passes too"
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # Not a plain integer: try a file size like '500KiB',
                # then the same with an implicit 'B' suffix
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    # Presence checks: 'key' (is set) and '!key' (is unset)
    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
2008
2009
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # Clauses are separated by '&' and must all hold
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2015
2016
def match_filter_func(filter_str):
    """Build a --match-filter callback: returns None to accept a video,
    or a skip message when the filter rejects it."""
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
2025
2026
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP/TTML time expression to seconds (None if unparseable)."""
    if not time_expr:
        return None

    m = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if m:
        return float(m.group('time_offset'))

    m = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if m:
        hours, mins, secs = m.groups()
        return 3600 * int(hours) + 60 * int(mins) + float(secs.replace(':', '.'))
2038
2039
def srt_subtitles_timecode(seconds):
    """Format a float second count as an SRT timecode: HH:MM:SS,mmm."""
    hours = seconds / 3600
    mins = (seconds % 3600) / 60
    secs = seconds % 60
    msecs = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, msecs)
2042
2043
def dfxp2srt(dfxp_data):
    """Convert a DFXP/TTML subtitle document (str) into SRT format.

    Raises ValueError when no paragraph elements are found."""
    # Helper resolving tag names against both TTML namespace variants
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    })

    class TTMLPElementParser(object):
        # Collects the text content of a <p>, turning <br/> into newlines
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            # Derive the end from an explicit duration when 'end' is absent
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2096
2097
def cli_option(params, command_option, param):
    """Emit [command_option, value] when params[param] is set, else []."""
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2101
2102
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Emit a boolean CLI flag; with separator, as a single 'opt<sep>value' token."""
    flag = params.get(param)
    assert isinstance(flag, bool)
    value = true_value if flag else false_value
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
2109
2110
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit [command_option] when params[param] equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2114
2115
def cli_configuration_args(params, param, default=[]):
    """Fetch the extra-args list stored under param, or default when unset."""
    extra = params.get(param)
    if extra is None:
        return default
    assert isinstance(extra, list)
    return extra
2122
2123
class ISO639Utils(object):
    """Two-way mapping between ISO 639-1 (two-letter) and ISO 639-2/T
    (three-letter) language codes, backed by a static lookup table."""
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    # Keys: ISO 639-1 codes; values: the corresponding ISO 639-2/T codes.
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are used, so an extended tag such
        # as 'en-US' resolves via its primary subtag.  Returns None when
        # the code is not in the table.
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Linear scan of the forward table; falls through (returning
        # None implicitly) when no entry maps to *code*.
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
2324
2325
class ISO3166Utils(object):
    """Lookup of full country names from two-letter country codes,
    backed by a static table."""
    # From http://data.okfn.org/data/core/country-list
    # Keys: two-letter country codes (upper-case); values: English names.
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-2 country code to the corresponding full name"""
        # Table keys are upper-case, so normalize the input; returns None
        # for unknown codes.
        return cls._country_map.get(code.upper())
2584
2585
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets each request override the proxy via the
    internal 'Ytdl-request-proxy' header ('__noproxy__' disables proxying
    for that request entirely).
    """

    def __init__(self, proxies=None):
        # Install http_open/https_open so every request is routed through
        # proxy_open even when no global proxy is configured.  Fix: the
        # loop variable was named `type`, shadowing the builtin — renamed
        # to `scheme` (the default-argument trick is kept: it eagerly
        # binds the current loop value, avoiding the late-binding
        # closure pitfall).
        for scheme in ('http', 'https'):
            setattr(self, '%s_open' % scheme,
                    lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy header overrides whatever was configured
        # globally; the header is stripped so it never goes on the wire.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
2605
2606
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # Interpret the reversed bytes as one big (little-endian) integer;
    # binascii keeps this working on both Python 2 and 3.
    message = int(binascii.hexlify(data[::-1]), 16)
    # Modular exponentiation c = m^e mod N, rendered as bare lowercase hex.
    encrypted = pow(message, exponent, modulus)
    return format(encrypted, 'x')