]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[utils] Support 'dur' field in TTML
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
912b38b4 6import calendar
676eb3f2 7import codecs
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96
PH
10import datetime
11import email.utils
f45c185f 12import errno
be4a824d 13import functools
d77c3dfd 14import gzip
b7ab0590 15import itertools
03f9daab 16import io
f4bfd65f 17import json
d77c3dfd 18import locale
02dbf93f 19import math
347de493 20import operator
d77c3dfd 21import os
4eb7f1d1 22import pipes
c496ca96 23import platform
d77c3dfd 24import re
13ebea79 25import ssl
c496ca96 26import socket
b53466e1 27import struct
1c088fa8 28import subprocess
d77c3dfd 29import sys
181c8655 30import tempfile
01951dda 31import traceback
bcf89ce6 32import xml.etree.ElementTree
d77c3dfd 33import zlib
d77c3dfd 34
8c25f81b 35from .compat import (
8f9312c3 36 compat_basestring,
8c25f81b 37 compat_chr,
8c25f81b 38 compat_html_entities,
be4a824d 39 compat_http_client,
c86b6142 40 compat_kwargs,
8c25f81b 41 compat_parse_qs,
be4a824d 42 compat_socket_create_connection,
8c25f81b
PH
43 compat_str,
44 compat_urllib_error,
45 compat_urllib_parse,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
48 compat_urlparse,
7d4111ed 49 shlex_quote,
8c25f81b 50)
4644ac55
S
51
52
468e2e92
FV
# The type of compiled regular expressions is not clearly defined otherwise,
# so take it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Headers that YoutubeDLHandler.http_request adds to a request when the
# request does not already carry them.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Month names in calendar order (index 0 is January).
ENGLISH_MONTH_NAMES = [
    'January',
    'February',
    'March',
    'April',
    'May',
    'June',
    'July',
    'August',
    'September',
    'October',
    'November',
    'December',
]
68
69
def preferredencoding():
    """Get preferred encoding.

    Returns locale.getpreferredencoding() when that codec is usable for
    encoding text, and 'UTF-8' in every other case.
    """
    fallback = 'UTF-8'
    try:
        candidate = locale.getpreferredencoding()
        # Probe the codec; an unknown or unusable name raises here.
        'TEST'.encode(candidate)
    except Exception:
        return fallback
    return candidate
d77c3dfd 83
f4bfd65f 84
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    target = encodeFilename(fn)
    tmp_prefix = os.path.basename(target)
    tmp_dir = os.path.dirname(target)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        # os.path.basename/dirname return bytes objects here, but
        # NamedTemporaryFile will fail if the filename contains non ascii
        # characters unless we use unicode objects
        fs_encoding = get_filesystem_encoding()
        tmp_prefix = tmp_prefix.decode(fs_encoding)
        tmp_dir = tmp_dir.decode(fs_encoding)

    tmp_options = {
        'suffix': '.tmp',
        'prefix': tmp_prefix + '.',
        'dir': tmp_dir,
        'delete': False,
    }
    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        tmp_options['mode'] = 'wb'
    else:
        tmp_options['mode'] = 'w'
        tmp_options['encoding'] = 'utf-8'

    tmp_file = tempfile.NamedTemporaryFile(**compat_kwargs(tmp_options))

    try:
        with tmp_file:
            json.dump(obj, tmp_file)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(target)
            except OSError:
                pass
        os.rename(tmp_file.name, target)
    except Exception:
        # Do not leave the temporary file behind, then report the failure.
        try:
            os.remove(tmp_file.name)
        except OSError:
            pass
        raise
137
138
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression, so only a
        # restricted character set is accepted for them.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        candidates = (
            element for element in node.findall(xpath)
            if element.attrib.get(key) == val)
        return next(candidates, None)
157
d7e66d39
JMF
158# On python2.6 the xml.etree.ElementTree.Element methods don't support
159# the namespace parameter
5f6a1245
JW
160
161
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand every 'prefix:tag' step of path to '{uri}tag' using ns_map.

    Steps without a colon are kept unchanged.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([expand(step) for step in path.split('/')])
172
d77c3dfd 173
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath below node.

    When there is no such element, or it has no text, None is returned
    unless fatal is set, in which case ExtractorError is raised; name (or
    the xpath itself) is used in the error message.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    element = node.find(xpath)
    if element is not None and element.text is not None:
        return element.text
    if not fatal:
        return None
    label = xpath if name is None else name
    raise ExtractorError('Could not find XML element %s' % label)
186
187
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # An ID lookup is an attribute lookup on the 'id' attribute.
    return get_element_by_attribute(attribute='id', value=id, html=html)
191
12ea2f30 192
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # Verbose pattern: opening tag name, any attributes, the wanted
    # attribute=value pair, any further attributes, the (non-greedy) content
    # and the closing tag with the same name.
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s*>
         (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    found = re.search(pattern, html)
    if found is None:
        return None

    content = found.group('content')
    # Content that starts with a quote loses its first and last character.
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 214
9e6dd238
FV
215
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Source newlines carry no meaning; <br> and paragraph boundaries do.
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),          # <br /> becomes a newline
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),    # </p><p> becomes a newline
        ('<.*?>', ''),                             # strip remaining tags
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Replace html entities, then trim surrounding whitespace
    return unescapeHTML(text).strip()
9e6dd238
FV
231
232
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    The filename '-' stands for standard output. If opening the given name
    fails for a reason other than EACCES, the name is passed through
    sanitize_path() and, if that changed it, opened once more; a failure of
    that second attempt propagates to the caller.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            # Prefer the binary buffer of stdout where one exists.
            return (getattr(sys.stdout, 'buffer', sys.stdout), filename)
        return (open(encodeFilename(filename), open_mode), filename)
    except (IOError, OSError) as err:
        if err.errno == errno.EACCES:
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller
        return (open(encodeFilename(alt_filename), open_mode), alt_filename)
d77c3dfd
FV
263
264
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 272
5f6a1245 273
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # '?' and control characters are dropped entirely.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(char) for char in s)
    if is_id:
        # IDs are kept as close to the original as possible.
        return result

    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
    return result if result else '_'
d77c3dfd 310
5f6a1245 311
a2aaf4db
S
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On every other platform s is returned unchanged. On Windows the path is
    normalized and, in each component other than '.' and '..', the
    characters / < > : " | \\ ? * as well as a trailing dot are replaced
    with '#'. A drive or UNC prefix is kept as is.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the empty component left in front of the first separator.
        norm_path.pop(0)
    # Raw string: the pattern is the same as before, but no longer relies on
    # the invalid string escape '\.' being passed through unchanged.
    forbidden = r'(?:[/<>:"\|\\?\*]|\.$)'
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(forbidden, '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
328
329
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list that keeps the first occurrence of every element, in
    input order. Elements are compared with ==, so they need not be hashable.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 337
912b38b4 338
4e408e47
PH
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities and numeric
    references ('#123', '#x7b') are turned into the character they denote.
    Unknown names, and numeric references that do not denote a valid code
    point, are returned in their literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            # int() wants the '0x' prefix form
            numstr = '0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # The number is outside the range of valid code points (for
            # example '#2013266066'); fall through to the literal form
            # instead of failing the whole unescape.
            pass

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
4e408e47
PH
357
358
def unescapeHTML(s):
    """Replace every '&...;' entity in s using _htmlentity_transform.

    None is passed through unchanged; any other value must be a compat_str.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def substitute(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', substitute, s)
d77c3dfd 366
8bf48f23 367
aa49acd1
S
def get_subprocess_encoding():
    """Return the name of the encoding to use for subprocess arguments.

    Windows (major version 5 and up) uses the locale encoding, everything
    else the filesystem encoding, with 'utf-8' as the fallback when no
    encoding is reported.
    """
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        chosen = preferredencoding()
    else:
        chosen = sys.getfilesystemencoding()
    return 'utf-8' if chosen is None else chosen
378
379
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file

    Returns s unchanged where a Unicode API is used, otherwise s encoded
    with the subprocess encoding (characters that cannot be encoded are
    dropped).
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    has_unicode_api = sys.version_info >= (3, 0)

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not has_unicode_api and not for_subprocess:
        has_unicode_api = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5

    if has_unicode_api:
        return s
    return s.encode(get_subprocess_encoding(), 'ignore')
398
399
def decodeFilename(b, for_subprocess=False):
    """Decode a byte-string file name with the subprocess encoding.

    On Python 3, and for anything that is not a bytes object, b is returned
    unchanged. Bytes that cannot be decoded are dropped.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 409
f07b74fc
PH
410
def encodeArgument(s):
    """Encode a command line argument via encodeFilename(..., for_subprocess=True).

    Byte strings are still accepted and are decoded as ASCII first.
    """
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        text = s.decode('ascii')
    return encodeFilename(text, True)
418
419
aa49acd1
S
def decodeArgument(b):
    """Decode a command line argument; same as decodeFilename(b, for_subprocess=True)."""
    return decodeFilename(b, for_subprocess=True)
422
423
8271226a
PH
def decodeOption(optval):
    """Return an option value as a compat_str.

    None is passed through; bytes are decoded with the preferred encoding.
    """
    if optval is None:
        return None
    decoded = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 432
5f6a1245 433
4539dd30
PH
def formatSeconds(secs):
    """Format a duration given in seconds as 'S', 'M:SS' or 'H:MM:SS'.

    The hour and minute fields appear once the duration reaches a full hour
    or minute, so 60 is rendered as '1:00' and 3600 as '1:00:00'.
    """
    # Inclusive comparisons: a strict '>' rendered exactly one hour as
    # '60:00' and exactly one minute as '60'.
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
441
a0ddb8a2 442
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler suited to the running Python version.

    Certificate verification is switched off when params has a true
    'nocheckcertificate' entry. Extra keyword arguments are handed to the
    handler.
    """
    skip_verification = params.get('nocheckcertificate', False)

    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 466
732ea2f0 467
08f2a92c
JMF
def bug_reports_message():
    """Return the text appended to error messages asking for a bug report.

    The update hint depends on ytdl_is_updateable().
    """
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return ''.join([
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ])
477
478
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is mentioned in the message; video_id, if given, is put in front of it.
        """

        # Errors raised while one of these exceptions is being handled are
        # always treated as expected.
        known_failures = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        active_exc_type = sys.exc_info()[0]
        is_expected = expected or active_exc_type in known_failures

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not is_expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback as one string, or None without one."""
        if self.traceback is None:
            return None
        formatted_lines = traceback.format_tb(self.traceback)
        return ''.join(formatted_lines)
01951dda 506
1c256f70 507
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected extraction error for an unsupported URL; the URL is kept in .url."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
513
514
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
518
519
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """Store msg as the exception message.

        exc_info, if given, is the original exception that caused the
        trouble (as returned by sys.exc_info()).
        """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info
d77c3dfd
FV
532
533
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
541
542
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        """msg is the error text; it is also available as the .msg attribute."""
        # Initialise the base class as well, so that str(err) and err.args
        # carry the message on every Python version and not only .msg does.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd 552
5f6a1245 553
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
557
558
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
d77c3dfd
FV
566
567
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        """downloaded and expected are the received and the announced size, both in bytes."""
        # Give the exception a readable message; without the base class
        # call, str(err) was empty or a bare tuple depending on the Python
        # version.
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 582
5f6a1245 583
c5a59d93 584def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
be4a824d
PH
585 hc = http_class(*args, **kwargs)
586 source_address = ydl_handler._params.get('source_address')
587 if source_address is not None:
588 sa = (source_address, 0)
589 if hasattr(hc, 'source_address'): # Python 2.7+
590 hc.source_address = sa
591 else: # Python 2.6
592 def _hc_connect(self, *args, **kwargs):
593 sock = compat_socket_create_connection(
594 (self.host, self.port), self.timeout, sa)
595 if is_https:
d7932313
PH
596 self.sock = ssl.wrap_socket(
597 sock, self.key_file, self.cert_file,
598 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
599 else:
600 self.sock = sock
601 hc.connect = functools.partial(_hc_connect, hc)
602
603 return hc
604
605
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """params is kept as self._params; the other arguments go to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req through a connection created by _create_http_connection."""
        connection_factory = functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False)
        return self.do_open(connection_factory, req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first, then a zlib-wrapped one."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the status code."""
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        # This addinfourl takes no code argument; set the attribute by hand.
        response = addinfourl(stream, headers, url)
        response.code = code
        return response

    def http_request(self, req):
        """Add the standard headers to req and apply the no-compression marker."""
        for header, value in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if header.capitalize() not in req.headers:
                req.add_header(header, value)

        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate Content-encoding decoded."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with up to 1023 trailing bytes cut off; if no cut
                # helps, report the first error.
                for trim in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-trim]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same request and response processing.
    https_request = http_request
    https_response = http_response
bf50b038 697
5de90176 698
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that opens its connections through _create_http_connection."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """params is kept as self._params; https_conn_class defaults to HTTPSConnection."""
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req, forwarding the SSL settings this handler instance has."""
        forwarded = (
            ('context', '_context'),                # python > 2.6
            ('check_hostname', '_check_hostname'),  # python 3.x
        )
        kwargs = {}
        for option, attribute in forwarded:
            if hasattr(self, attribute):
                kwargs[option] = getattr(self, attribute)

        connection_factory = functools.partial(
            _create_http_connection, self, self._https_conn_class, True)
        return self.do_open(connection_factory, req, **kwargs)
be4a824d
PH
714
715
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    date_str is 'YYYY-MM-DD<delimiter>HH:MM:SS', optionally followed by
    fractional seconds and 'Z' or a '+HH:MM' / '-HHMM' style offset.
    timezone, if given, is a timedelta used as the offset from UTC; the
    string is then parsed as is. None is returned for a date_str of None.
    """

    if date_str is None:
        return None

    if timezone is None:
        timezone = datetime.timedelta()
        suffix = re.search(
            r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if suffix:
            # Cut off fractional seconds and the zone designator.
            date_str = date_str[:-len(suffix.group(0))]
            if suffix.group('sign'):
                direction = 1 if suffix.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=direction * int(suffix.group('hours')),
                    minutes=direction * int(suffix.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    local_time = datetime.datetime.strptime(date_str, date_format)
    utc_time = local_time - timezone
    return calendar.timegm(utc_time.timetuple())
740
741
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    A list of known formats is tried, followed by RFC 2822 parsing.
    day_first selects whether ambiguous numeric dates are read as
    day/month (default) or month/day. None is returned for None input and
    when nothing matches.
    """

    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    common_formats = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    day_first_formats = [
        '%d-%m-%Y',
        '%d.%m.%Y',
        '%d/%m/%Y',
        '%d/%m/%y',
        '%d/%m/%Y %H:%M:%S',
    ]
    month_first_formats = [
        '%m-%d-%Y',
        '%m.%d.%Y',
        '%m/%d/%Y',
        '%m/%d/%y',
        '%m/%d/%Y %H:%M:%S',
    ]
    candidates = common_formats + (day_first_formats if day_first else month_first_formats)

    upload_date = None
    # Every format is tried; the last one that parses determines the result.
    for candidate in candidates:
        try:
            upload_date = datetime.datetime.strptime(date_str, candidate).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is None:
        # Last resort: RFC 2822 style dates
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
805
5f6a1245 806
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url.

    The guess is whatever follows the last dot before any '?'. It is
    returned when it consists of letters and digits only; otherwise, and
    for a url of None, default_ext is returned.
    """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
73e79f2a 815
5f6a1245 816
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
d4051a8e 819
5f6a1245 820
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    'now', 'today' and 'yesterday' are accepted as well. Months and years
    are approximated as 30 and 365 days. Anything else must be a YYYYMMDD
    date, otherwise ValueError is raised."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' in a plain literal is an invalid string escape.
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword names ('days', 'weeks')
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
5f6a1245
JW
848
849
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not eight digits are returned unchanged."""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        return date_str
    year, month, day = match.groups()
    return '-'.join((year, month, day))
858
5f6a1245 859
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A bound left as None is open: the earliest or latest date that
        datetime supports is used. ValueError is raised when start lies
        after end.
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(start=day, end=day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if isinstance(date, datetime.date):
            when = date
        else:
            # Anything that is not a date is parsed like the constructor arguments.
            when = date_from_str(date)
        return self.start <= when <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
c496ca96
PH
889
890
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Decode byte strings with the preferred encoding
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
899
900
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except (AttributeError, io.UnsupportedOperation):
        # If the output stream doesn't have a fileno, it's virtual
        # (or one of some strange Windows pseudo files?)
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    wintypes = ctypes.wintypes

    GetStdHandle = ctypes.WINFUNCTYPE(
        wintypes.HANDLE, wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    console_handle = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        wintypes.BOOL, wintypes.HANDLE, wintypes.LPWSTR,
        wintypes.DWORD, ctypes.POINTER(wintypes.DWORD),
        wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    written = wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(wintypes.DWORD, wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        wintypes.BOOL, wintypes.HANDLE,
        ctypes.POINTER(wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(wintypes.DWORD())) == 0)

    if not_a_console(console_handle):
        return False

    def next_nonbmp_pos(text):
        # Index of the first character above U+FFFF, or len(text) without one
        return next(
            (index for index, char in enumerate(text) if ord(char) > 0xffff),
            len(text))

    while s:
        # Write runs of at most 1024 characters, stopping in front of a
        # character above U+FFFF; such a character is then written by
        # itself with a length of 2.
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            console_handle, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
974
975
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it.

    On win32, when no encoding is given, the console API is tried first.
    Otherwise the text is encoded (ignoring unencodable characters) for
    byte-oriented streams and written as is for the rest.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    use_console_api = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if use_console_api and _windows_write_string(s, out):
        return

    # Python 2 lies about mode of sys.stderr
    wants_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if wants_bytes:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
996
997
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a byte string into the list of its integer byte values."""
    if not bs:
        return []
    # Indexing yields ints on Python 3 and 1-character strings otherwise
    if isinstance(bs[0], int):
        return list(bs)
    return [ord(char) for char in bs]
1005
c257baff 1006
def intlist_to_bytes(xs):
    """Pack a list of integer byte values into a byte string."""
    if xs:
        # One unsigned byte ('B') per list element
        return struct_pack('%dB' % len(xs), *xs)
    return b''
c38b1e77
PH
1011
1012
c1c9a79c
PH
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32

    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL

    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL

    # Low and high parts of the byte count passed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the file object f; raises OSError when LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can reuse it
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(
            handle, flags, 0, whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(
            handle, 0, whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared flock on f."""
        operation = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.flock(f, operation)

    def _unlock_file(f):
        """Drop the flock held on f."""
        fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
1076
1077
class locked_file(object):
    """File opened with io.open that stays locked inside a with block.

    Mode 'r' takes a shared lock, 'a' and 'w' an exclusive one. The file
    is closed when the block is left, or when locking fails with IOError.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.mode = mode
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        want_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, want_exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Close the file even if unlocking raises
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1107
1108
4644ac55
S
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1112
1113
def shell_quote(args):
    """Return the arguments shell-quoted and joined by single spaces.

    Byte-string arguments (e.g. filenames produced by 'encodeFilename')
    are decoded with the filesystem encoding before quoting.
    """
    encoding = get_filesystem_encoding()
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # shlex_quote is already imported from .compat and used by
        # args_to_str; pipes.quote is an undocumented alias in a module
        # that newer Python versions no longer ship
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1123
1124
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # The data travels JSON-encoded in the URL fragment
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return url + '#' + sdata
9d4660ca
PH
1131
1132
def unsmuggle_url(smug_url, default=None):
    """Split a URL built by smuggle_url into (url, data).

    URLs without smuggled data are returned unchanged together with
    default.
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    fields = compat_parse_qs(sdata)
    jsond = fields['__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
02dbf93f
PH
1140
1141
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count as a string with a binary unit suffix.

    bytes may be a number, a numeric string, or None (which yields 'N/A').
    The result has two decimals, e.g. '1.50KiB'. Values beyond the largest
    known unit are expressed in that unit, values below one byte in 'B'.
    """
    if bytes is None:
        return 'N/A'
    size = float(bytes)  # accepts numbers as well as (text or byte) strings
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if size == 0.0:
        exponent = 0
    else:
        # abs() keeps math.log defined for negative sizes
        exponent = int(math.log(abs(size), 1024.0))
        # Clamp: tiny values must not pick a suffix from the end of the
        # list (negative index) and huge ones must not run off its end
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = size / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
f53c966a 1154
1c088fa8 1155
be64b5b0
PH
def parse_filesize(s):
    """Parse a size such as '5 MiB' or '1,24 KB' into a number of bytes.

    Returns None for None and for strings that do not start with a number
    followed by a known unit.
    """
    if s is None:
        return None

    # 'XiB' and lower-case 'xB' are powers of 1024, 'XB' and 'Xb' powers
    # of 1000. The lower-case forms are of course incorrect and
    # inofficial, but we support those too
    unit_table = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary, decimal = 1024 ** power, 1000 ** power
        unit_table[prefix + 'iB'] = binary
        unit_table[prefix + 'B'] = decimal
        unit_table[prefix.lower() + 'B'] = binary
        unit_table[prefix + 'b'] = decimal

    units_re = '|'.join(re.escape(unit) for unit in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if m is None:
        return None

    # A comma is accepted as decimal separator
    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
be64b5b0
PH
1208
1209
caefb1de
PH
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1217
1218
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    # An abbreviation is the first three letters of the full name
    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
18258362
JMF
1227
1228
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Ampersands that already start one of the listed entities or a
    # numeric character reference are left alone
    bare_ampersand = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_ampersand.sub('&amp;', xml_str)
e3946f98
PH
1235
1236
def setproctitle(title):
    """Set the process name through prctl of libc.so.6.

    title must be a compat_str. Nothing happens when libc.so.6 cannot be
    loaded or does not provide prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one byte larger than
    # the title, so the string handed to prctl is always NUL-terminated
    # (a buffer of exactly len(title_bytes) has no room for the terminator)
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1250
1251
def remove_start(s, start):
    """Return s without the prefix start; s is unchanged if it lacks it."""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1256
1257
2b9faf55
PH
def remove_end(s, end):
    """Return s without the suffix end; s is unchanged if it lacks it.

    An empty end leaves s untouched.
    """
    # The emptiness check matters: every string ends with '', and
    # s[:-0] is s[:0], i.e. the empty string
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1262
1263
def url_basename(url):
    """Return the last segment of the URL's path, ignoring outer slashes."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
aa94a6d3
PH
1267
1268
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is HEAD."""

    def get_method(self):
        return "HEAD"
7217e148
PH
1272
1273
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, or return default for None and ''.

    With get_attr, the named attribute of v is converted instead (a
    missing attribute counts as None). The integer is multiplied by
    invscale and then floor-divided by scale.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1281
9572013d 1282
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1285
9732d77e
PH
1286
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Commas, dots and plus signs are dropped before the conversion
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1293
1294
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to a float times invscale over scale; default for None."""
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1297
1298
def parse_duration(s):
    """Parse a duration such as '1:02:03', '3 min' or 'PT1H2M3.5S'.

    Returns the duration in seconds, or None when s is not a string or
    does not look like a duration.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None

    # A lone (possibly fractional) number of minutes or hours
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    # Seconds contributed by one unit of each integer group
    seconds_per_unit = (
        ('secs', 1),
        ('mins_reversed', 60),
        ('mins', 60),
        ('hours', 60 * 60),
        ('hours_reversed', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    res = 0
    for group_name, factor in seconds_per_unit:
        value = m.group(group_name)
        if value:
            res += int(value) * factor
    # 'ms' holds the fractional part of the seconds, leading dot included
    if m.group('ms'):
        res += float(m.group('ms'))
    return res
91d7d0b3
JMF
1343
1344
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the filename's real extension.

    If expected_real_ext is given and differs from the real extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
d70ad093
PH
1351
1352
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's real extension with ext.

    If expected_real_ext is given and differs from the real extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
1358
1359
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        process = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        # The binary could not be started
        return False
    return exe
b7ab0590
PH
1368
1369
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    command = [encodeArgument(exe)] + args
    try:
        # stderr is folded into stdout so both are searched for a version
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = process.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1383
1384
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from a program's output.

    Returns the first group of version_re (by default whatever follows
    the word 'version'), or unrecognized when the pattern is not found.
    """
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    found = re.search(version_re, output)
    if not found:
        return unrecognized
    return found.group(1)
1394
1395
class PagedList(object):
    """Base for paged lists; relies on a getslice() method of subclasses."""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1400
9c44d242
PH
1401
class OnDemandPagedList(PagedList):
    """Paged list that calls pagefunc(pagenum) for pages as they are needed."""

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = firstid + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of the first wanted entry within this page
            if firstid <= start < nextfirstid:
                startv = start % self._pagesize
            else:
                startv = 0

            # Offset one past the last wanted entry, if the slice ends here
            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % self._pagesize) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1443
1444
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list whose number of pages is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries with indices in [start, end) as a list."""
        res = []
        start_page = start // self._pagesize
        if end is None:
            end_page = self._pagecount
            only_more = None
        else:
            end_page = end // self._pagesize + 1
            only_more = end - start
        # Entries to drop from the front of the first fetched page
        skip_elems = start - start_page * self._pagesize
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) >= only_more:
                    # This page completes the requested slice
                    res.extend(page[:only_more])
                    break
                only_more -= len(page)
            res.extend(page)
        return res
1472
1473
def uppercase_escape(s):
    """Replace every \\UXXXXXXXX escape in s by the character it denotes."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns (text, consumed length)
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
0fe2ff78
YCH
1480
1481
def lowercase_escape(s):
    """Replace every \\uXXXX escape in s by the character it denotes."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def expand(match):
        # The decoder returns (text, consumed length)
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
b53466e1 1488
d05cfe06
S
1489
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    on_python2 = sys.version_info < (3, 0)
    if on_python2 and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # The second argument lists the characters that stay unquoted
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1495
1496
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # Only these four components are escaped
    escaped_parts = parts._replace(
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment)
    )
    return escaped_parts.geturl()
1506
# Probe whether struct accepts a text format string
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """struct.pack that also accepts a text format string."""
        byte_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(byte_spec, *args)

    def struct_unpack(spec, *args):
        """struct.unpack that also accepts a text format string."""
        byte_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(byte_spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1523
1524
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file and close it.

    Every line is stripped. Empty lines and lines starting with '#', ';'
    or ']' are skipped. Byte lines are decoded as UTF-8, and a leading
    byte order mark is removed.
    """
    # '\ufeff' is the BOM of properly decoded text; '\xef\xbb\xbf' is what
    # its three UTF-8 bytes look like when each was decoded as a character
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
                break
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1539
1540
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1543
1544
0990305d
PH
# Element.iter where available, a findall based stand-in otherwise
etree_iter = getattr(xml.etree.ElementTree.Element, 'iter', None)
if etree_iter is None:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1549
1550
bcf89ce6
PH
def parse_xml(s):
    """Parse the XML text s, ignoring doctypes, and return the root element."""
    class DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=DoctypeIgnoringBuilder())
    if sys.version_info >= (2, 7):
        kwargs = {'parser': parser}
    else:
        kwargs = {}
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            text = node.text
            if text is not None and not isinstance(text, compat_str):
                node.text = text.decode('utf-8')
    return tree
e68301af
PH
1566
1567
a1a530b0
PH
# Age limit associated with each US rating
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """Parse an age limit such as '18', '18+' or a US rating like 'PG-13'.

    Returns the age as an int, or None for None and unknown values.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s)
146c80e2
S
1582
1583
def strip_jsonp(code):
    """Return the argument of a JSONP call such as callback({...});

    Text that does not look like such a call is returned unchanged.
    """
    jsonp_call = re.compile(
        r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$')
    return jsonp_call.sub(r'\1', code)
478c2c61
PH
1587
1588
e05f6939
PH
def js_to_json(code):
    """Convert a JavaScript object literal into JSON text.

    Single-quoted strings and bare identifiers are turned into
    double-quoted strings, and commas directly before a closing bracket
    or brace are removed.
    """
    # How escapes and quotes inside a single-quoted string are rewritten
    requote = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        v = m.group(0)
        # Literals and double-quoted strings pass through untouched
        if v in ('true', 'false', 'null') or v.startswith('"'):
            return v
        if v.startswith("'"):
            v = re.sub(
                r"\\\\|\\'|\"", lambda mobj: requote[mobj.group(0)], v[1:-1])
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Drop trailing commas
    return re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
1612
1613
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in quality_ids, or -1 for unknown ids
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1622
acd69589
PH
1623
# Default output filename template, in %-style mapping format
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1625
a020a0dc
PH
1626
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ELLIPSES = '...'
    # Truncate so that the result, ellipses included, is length long
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
1635
1636
def version_tuple(v):
    """Split a version string on '-' and '.' into a tuple of ints."""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
48844745
PH
1639
1640
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is lower than limit.

    A missing or unparseable version counts as outdated exactly when
    assume_new is false.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
732ea2f0
PH
1648
1649
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when this module was loaded by a zipimporter or sys has a
    # 'frozen' attribute
    loader = globals().get('__loader__')
    return isinstance(loader, zipimporter) or hasattr(sys, 'frozen')
7d4111ed
PH
1655
1656
def args_to_str(args):
    """Return the arguments shell-quoted and joined by spaces."""
    # Get a short string representation for a subprocess command
    return ' '.join(map(shlex_quote, args))
2ccd1b10
PH
1660
1661
c460bdd5
PH
def mimetype2ext(mt):
    """Derive a file extension from a MIME type.

    The subtype is used as extension unless a special case is known for
    it. Parameters after ';' are ignored. Returns None when mt is None.
    """
    if mt is None:
        # E.g. a response without a Content-Type header
        return None

    # Drop parameters such as '; charset=utf-8' before reading the subtype
    media_type = mt.partition(';')[0].strip()
    _, _, res = media_type.rpartition('/')

    return {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
    }.get(res, res)
1669
1670
2ccd1b10
PH
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the headers of an opened URL.

    The filename of a Content-Disposition attachment is preferred; the
    Content-Type is the fallback.
    """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(h):
            return url_handle.headers[h]

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            ext = determine_ext(m.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(getheader('Content-Type'))
05900629
PH
1687
1688
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
1697
1698
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # Checked in order: the UTF-32 marks must precede the UTF-16 ones,
    # since the UTF-32-LE mark starts with the UTF-16-LE one
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    payload, encoding = first_bytes, 'utf-8'
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            payload, encoding = first_bytes[len(bom):], enc
            break
    s = payload.decode(encoding, 'replace')

    # A match object (truthy) if the first non-blank character is '<'
    return re.match(r'^\s*<', s)
a055469f
PH
1717
1718
def determine_protocol(info_dict):
    """Return the protocol of a format.

    An explicit 'protocol' entry wins. Otherwise the protocol is derived
    from the start of the URL, then from its extension, and finally from
    the URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    for manifest_ext in ('m3u8', 'f4m'):
        if ext == manifest_ext:
            return manifest_ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
1739
1740
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    columns = zip(*table)
    max_lens = [max(len(compat_str(v)) for v in col) for col in columns]
    # Every column but the last is left-aligned and padded to its widest
    # value plus one
    padded = ['%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]]
    format_str = ' '.join(padded) + '%s'
    return '\n'.join(format_str % tuple(row) for row in table)
347de493
PH
1747
1748
def _match_one(filter_part, dct):
    """Evaluate a single filter expression against the dictionary dct.

    Supported are comparisons ('key OP value', with an optional '?' after
    the operator that lets a missing key pass) and presence tests ('key',
    '!key'). Raises ValueError for anything else.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op_symbol = m.group('op')
        op = COMPARISON_OPERATORS[op_symbol]
        if m.group('strval') is not None:
            if op_symbol not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % op_symbol)
            comparison_value = m.group('strval')
        else:
            raw_value = m.group('intval')
            try:
                comparison_value = int(raw_value)
            except ValueError:
                # Not a plain integer: try a file size, with the literal
                # value first and with a 'B' appended second
                comparison_value = parse_filesize(raw_value)
                if comparison_value is None:
                    comparison_value = parse_filesize(raw_value + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            raw_value, filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # Truthy only if the operator was followed by '?'
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        test_presence = UNARY_OPERATORS[m.group('op')]
        return test_presence(dct.get(m.group('key')))

    raise ValueError('Invalid filter part %r' % filter_part)
1806
1807
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The parts are separated by '&' and all of them have to match
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
1813
1814
def match_filter_func(filter_str):
    """Build a function that checks an info dict against filter_str.

    The returned function yields None when the dict passes the filter and
    a message explaining the skip otherwise.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        # Name the video by its title, then its id, then generically
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
91410c9b
PH
1823
1824
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP time expression into seconds.

    Accepted are plain offsets ('12.5', '12.5s') and clock values
    ('HH:MM:SS' with optional fraction). Empty input gives 0.0 and
    anything unrecognized None.
    """
    if not time_expr:
        return 0.0

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if clock:
        hours, mins, secs = clock.groups()
        return 3600 * int(hours) + 60 * int(mins) + float(secs)

    return None
1836
1837
def format_srt_time(seconds):
    """Format a number of seconds as an SRT timestamp 'HH:MM:SS,mmm'.

    The value is rounded to the nearest millisecond.
    """
    # Round once on the total: truncating the fractional part instead
    # turns e.g. 1.001 (stored as slightly less) into ',000'
    total_millisecs = int(round(seconds * 1000))
    secs, millisecs = divmod(total_millisecs, 1000)
    mins, secs = divmod(secs, 60)
    hours, mins = divmod(mins, 60)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millisecs)
1844
1845
def dfxp2srt(dfxp_data):
    """Convert a DFXP/TTML subtitle document into SRT text.

    Every ttml:p element becomes one numbered SRT cue. The cue ends at
    the element's 'end' attribute or, without one, 'dur' after 'begin'.
    """
    _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})

    def parse_node(node):
        str_or_empty = functools.partial(str_or_none, default='')

        out = str_or_empty(node.text)

        for child in node:
            if child.tag == _x('ttml:br'):
                out += '\n' + str_or_empty(child.tail)
            elif child.tag == _x('ttml:span'):
                # The text following a span is its tail; without it
                # everything after the span would be lost
                out += str_or_empty(parse_node(child)) + str_or_empty(child.tail)
            else:
                # tostring() returns ASCII bytes (tail included); decode
                # them instead of stringifying the bytes object
                out += xml.etree.ElementTree.tostring(child).decode('ascii')

        return out

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
    out = []
    paras = dfxp.findall(_x('.//ttml:p'))

    for index, para in enumerate(paras, 1):
        begin_time = parse_dfxp_time_expr(para.attrib['begin'])
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        if not end_time:
            end_time = begin_time + parse_dfxp_time_expr(para.attrib['dur'])
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            format_srt_time(begin_time),
            format_srt_time(end_time),
            parse_node(para)))

    return ''.join(out)
1880
1881
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler whose proxy can be chosen per request.

    A 'Ytdl-request-proxy' request header names the proxy for that
    request; the value '__noproxy__' disables proxying.
    """

    def __init__(self, proxies=None):
        # Set default handlers
        for scheme in ('http', 'https'):
            # The defaults bind the current scheme and the bound method
            handler = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, handler)
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        requested_proxy = req.headers.get('Ytdl-request-proxy')
        if requested_proxy is not None:
            proxy = requested_proxy
            # The header is removed from the request once consumed
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)