d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
1e399778 6import base64
5bc880b9 7import binascii
912b38b4 8import calendar
676eb3f2 9import codecs
62e609ab 10import contextlib
e3946f98 11import ctypes
c496ca96
PH
12import datetime
13import email.utils
f45c185f 14import errno
be4a824d 15import functools
d77c3dfd 16import gzip
03f9daab 17import io
79a2e94e 18import itertools
f4bfd65f 19import json
d77c3dfd 20import locale
02dbf93f 21import math
347de493 22import operator
d77c3dfd 23import os
4eb7f1d1 24import pipes
c496ca96 25import platform
d77c3dfd 26import re
c496ca96 27import socket
79a2e94e 28import ssl
1c088fa8 29import subprocess
d77c3dfd 30import sys
181c8655 31import tempfile
01951dda 32import traceback
bcf89ce6 33import xml.etree.ElementTree
d77c3dfd 34import zlib
d77c3dfd 35
8c25f81b 36from .compat import (
8bb56eee 37 compat_HTMLParser,
8f9312c3 38 compat_basestring,
8c25f81b 39 compat_chr,
36e6f62c 40 compat_etree_fromstring,
8c25f81b 41 compat_html_entities,
55b2f099 42 compat_html_entities_html5,
be4a824d 43 compat_http_client,
c86b6142 44 compat_kwargs,
8c25f81b 45 compat_parse_qs,
702ccf2d 46 compat_shlex_quote,
be4a824d 47 compat_socket_create_connection,
8c25f81b 48 compat_str,
edaa23f8 49 compat_struct_pack,
d3f8e038 50 compat_struct_unpack,
8c25f81b
PH
51 compat_urllib_error,
52 compat_urllib_parse,
15707c7e 53 compat_urllib_parse_urlencode,
8c25f81b 54 compat_urllib_parse_urlparse,
7581bfc9 55 compat_urllib_parse_unquote_plus,
8c25f81b
PH
56 compat_urllib_request,
57 compat_urlparse,
810c10ba 58 compat_xpath,
8c25f81b 59)
4644ac55 60
71aff188
YCH
61from .socks import (
62 ProxyType,
63 sockssocket,
64)
65
4644ac55 66
51fb4995
YCH
67def register_socks_protocols():
68 # "Register" SOCKS protocols
d5ae6bb5
YCH
69 # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
70 # URLs with protocols not in urlparse.uses_netloc are not handled correctly
51fb4995
YCH
71 for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
72 if scheme not in compat_urlparse.uses_netloc:
73 compat_urlparse.uses_netloc.append(scheme)
74
75
468e2e92
FV
76# This is not clearly defined otherwise
77compiled_regex_type = type(re.compile(''))
78
3e669f36 79std_headers = {
15d10678 80 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
59ae15a5
PH
81 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
82 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
83 'Accept-Encoding': 'gzip, deflate',
84 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 85}
f427df17 86
5f6a1245 87
bf42a990
S
88NO_DEFAULT = object()
89
7105440c
YCH
90ENGLISH_MONTH_NAMES = [
91 'January', 'February', 'March', 'April', 'May', 'June',
92 'July', 'August', 'September', 'October', 'November', 'December']
93
a7aaa398
S
94KNOWN_EXTENSIONS = (
95 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
96 'flv', 'f4v', 'f4a', 'f4b',
97 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
98 'mkv', 'mka', 'mk3d',
99 'avi', 'divx',
100 'mov',
101 'asf', 'wmv', 'wma',
102 '3gp', '3g2',
103 'mp3',
104 'flac',
105 'ape',
106 'wav',
107 'f4f', 'f4m', 'm3u8', 'smil')
108
c587cbb7 109# needed for sanitizing filenames in restricted mode
c8827027 110ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
111 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
112 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
c587cbb7 113
46f59e89
S
114DATE_FORMATS = (
115 '%d %B %Y',
116 '%d %b %Y',
117 '%B %d %Y',
118 '%b %d %Y',
119 '%b %dst %Y %I:%M',
120 '%b %dnd %Y %I:%M',
121 '%b %dth %Y %I:%M',
122 '%Y %m %d',
123 '%Y-%m-%d',
124 '%Y/%m/%d',
125 '%Y/%m/%d %H:%M:%S',
126 '%Y-%m-%d %H:%M:%S',
127 '%Y-%m-%d %H:%M:%S.%f',
128 '%d.%m.%Y %H:%M',
129 '%d.%m.%Y %H.%M',
130 '%Y-%m-%dT%H:%M:%SZ',
131 '%Y-%m-%dT%H:%M:%S.%fZ',
132 '%Y-%m-%dT%H:%M:%S.%f0Z',
133 '%Y-%m-%dT%H:%M:%S',
134 '%Y-%m-%dT%H:%M:%S.%f',
135 '%Y-%m-%dT%H:%M',
136)
137
138DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
139DATE_FORMATS_DAY_FIRST.extend([
140 '%d-%m-%Y',
141 '%d.%m.%Y',
142 '%d.%m.%y',
143 '%d/%m/%Y',
144 '%d/%m/%y',
145 '%d/%m/%Y %H:%M:%S',
146])
147
148DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
149DATE_FORMATS_MONTH_FIRST.extend([
150 '%m-%d-%Y',
151 '%m.%d.%Y',
152 '%m/%d/%Y',
153 '%m/%d/%y',
154 '%m/%d/%Y %H:%M:%S',
155])
156
7105440c 157
d77c3dfd 158def preferredencoding():
59ae15a5 159 """Get preferred encoding.
d77c3dfd 160
59ae15a5
PH
161 Returns the best encoding scheme for the system, based on
162 locale.getpreferredencoding() and some further tweaks.
163 """
164 try:
165 pref = locale.getpreferredencoding()
28e614de 166 'TEST'.encode(pref)
70a1165b 167 except Exception:
59ae15a5 168 pref = 'UTF-8'
bae611f2 169
59ae15a5 170 return pref
d77c3dfd 171
f4bfd65f 172
181c8655 173def write_json_file(obj, fn):
1394646a 174 """ Encode obj as JSON and write it to fn, atomically if possible """
181c8655 175
92120217 176 fn = encodeFilename(fn)
61ee5aeb 177 if sys.version_info < (3, 0) and sys.platform != 'win32':
ec5f6016
JMF
178 encoding = get_filesystem_encoding()
179 # os.path.basename returns a bytes object, but NamedTemporaryFile
180 # will fail if the filename contains non-ASCII characters unless we
181 # use a unicode object
182 path_basename = lambda f: os.path.basename(fn).decode(encoding)
183 # the same for os.path.dirname
184 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
185 else:
186 path_basename = os.path.basename
187 path_dirname = os.path.dirname
188
73159f99
S
189 args = {
190 'suffix': '.tmp',
ec5f6016
JMF
191 'prefix': path_basename(fn) + '.',
192 'dir': path_dirname(fn),
73159f99
S
193 'delete': False,
194 }
195
181c8655
PH
196 # In Python 2.x, json.dump expects a bytestream.
197 # In Python 3.x, it writes to a character stream
198 if sys.version_info < (3, 0):
73159f99 199 args['mode'] = 'wb'
181c8655 200 else:
73159f99
S
201 args.update({
202 'mode': 'w',
203 'encoding': 'utf-8',
204 })
205
c86b6142 206 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
181c8655
PH
207
208 try:
209 with tf:
210 json.dump(obj, tf)
1394646a
IK
211 if sys.platform == 'win32':
212 # Need to remove existing file on Windows, else os.rename raises
213 # WindowsError or FileExistsError.
214 try:
215 os.unlink(fn)
216 except OSError:
217 pass
181c8655 218 os.rename(tf.name, fn)
70a1165b 219 except Exception:
181c8655
PH
220 try:
221 os.remove(tf.name)
222 except OSError:
223 pass
224 raise
225
226
227if sys.version_info >= (2, 7):
ee114368 228 def find_xpath_attr(node, xpath, key, val=None):
59ae56fa 229 """ Find the xpath xpath[@key=val] """
5d2354f1 230 assert re.match(r'^[a-zA-Z_-]+$', key)
ee114368 231 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
59ae56fa
PH
232 return node.find(expr)
233else:
ee114368 234 def find_xpath_attr(node, xpath, key, val=None):
810c10ba 235 for f in node.findall(compat_xpath(xpath)):
ee114368
S
236 if key not in f.attrib:
237 continue
238 if val is None or f.attrib.get(key) == val:
59ae56fa
PH
239 return f
240 return None
241
d7e66d39
JMF
242# On python2.6 the xml.etree.ElementTree.Element methods don't support
243# the namespace parameter
5f6a1245
JW
244
245
d7e66d39
JMF
246def xpath_with_ns(path, ns_map):
247 components = [c.split(':') for c in path.split('/')]
248 replaced = []
249 for c in components:
250 if len(c) == 1:
251 replaced.append(c[0])
252 else:
253 ns, tag = c
254 replaced.append('{%s}%s' % (ns_map[ns], tag))
255 return '/'.join(replaced)
256
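# Usage sketch for xpath_with_ns (illustrative, with a hypothetical namespace map):
#     >>> xpath_with_ns('media:song/media:author', {'media': 'http://example.com/ns'})
#     '{http://example.com/ns}song/{http://example.com/ns}author'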
d77c3dfd 257
a41fb80c 258def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
578c0745 259 def _find_xpath(xpath):
810c10ba 260 return node.find(compat_xpath(xpath))
578c0745
S
261
262 if isinstance(xpath, (str, compat_str)):
263 n = _find_xpath(xpath)
264 else:
265 for xp in xpath:
266 n = _find_xpath(xp)
267 if n is not None:
268 break
d74bebd5 269
8e636da4 270 if n is None:
bf42a990
S
271 if default is not NO_DEFAULT:
272 return default
273 elif fatal:
bf0ff932
PH
274 name = xpath if name is None else name
275 raise ExtractorError('Could not find XML element %s' % name)
276 else:
277 return None
a41fb80c
S
278 return n
279
280
281def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
8e636da4
S
282 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
283 if n is None or n == default:
284 return n
285 if n.text is None:
286 if default is not NO_DEFAULT:
287 return default
288 elif fatal:
289 name = xpath if name is None else name
290 raise ExtractorError('Could not find XML element\'s text %s' % name)
291 else:
292 return None
293 return n.text
a41fb80c
S
294
295
296def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
297 n = find_xpath_attr(node, xpath, key)
298 if n is None:
299 if default is not NO_DEFAULT:
300 return default
301 elif fatal:
302 name = '%s[@%s]' % (xpath, key) if name is None else name
303 raise ExtractorError('Could not find XML attribute %s' % name)
304 else:
305 return None
306 return n.attrib[key]
bf0ff932
PH
307
308
9e6dd238 309def get_element_by_id(id, html):
43e8fafd 310 """Return the content of the tag with the specified ID in the passed HTML document"""
611c1dd9 311 return get_element_by_attribute('id', id, html)
43e8fafd 312
12ea2f30 313
84c237fb
YCH
314def get_element_by_class(class_name, html):
315 return get_element_by_attribute(
316 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
317 html, escape_value=False)
318
319
320def get_element_by_attribute(attribute, value, html, escape_value=True):
43e8fafd 321 """Return the content of the tag with the specified attribute in the passed HTML document"""
9e6dd238 322
84c237fb
YCH
323 value = re.escape(value) if escape_value else value
324
38285056
PH
325 m = re.search(r'''(?xs)
326 <([a-zA-Z0-9:._-]+)
abc97b5e 327 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
38285056 328 \s+%s=['"]?%s['"]?
abc97b5e 329 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
38285056
PH
330 \s*>
331 (?P<content>.*?)
332 </\1>
84c237fb 333 ''' % (re.escape(attribute), value), html)
38285056
PH
334
335 if not m:
336 return None
337 res = m.group('content')
338
339 if res.startswith('"') or res.startswith("'"):
340 res = res[1:-1]
a921f407 341
38285056 342 return unescapeHTML(res)
a921f407 343
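# Usage sketch for the get_element_by_* helpers (illustrative, hypothetical markup):
#     >>> get_element_by_id('foo', '<div id="foo">bar</div>')
#     'bar'
#     >>> get_element_by_attribute('data-name', 'x', '<span data-name="x">baz</span>')
#     'baz'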
c5229f39 344
8bb56eee
BF
345class HTMLAttributeParser(compat_HTMLParser):
346 """Trivial HTML parser to gather the attributes for a single element"""
347 def __init__(self):
c5229f39 348 self.attrs = {}
8bb56eee
BF
349 compat_HTMLParser.__init__(self)
350
351 def handle_starttag(self, tag, attrs):
352 self.attrs = dict(attrs)
353
c5229f39 354
8bb56eee
BF
355def extract_attributes(html_element):
356 """Given a string for an HTML element such as
357 <el
358 a="foo" B="bar" c="&98;az" d=boz
359 empty= noval entity="&amp;"
360 sq='"' dq="'"
361 >
362 Decode and return a dictionary of attributes.
363 {
364 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
365 'empty': '', 'noval': None, 'entity': '&',
366 'sq': '"', 'dq': '\''
367 }.
368 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
369 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
370 """
371 parser = HTMLAttributeParser()
372 parser.feed(html_element)
373 parser.close()
374 return parser.attrs
9e6dd238 375
c5229f39 376
9e6dd238 377def clean_html(html):
59ae15a5 378 """Clean an HTML snippet into a readable string"""
dd622d7c
PH
379
380 if html is None: # Convenience for sanitizing descriptions etc.
381 return html
382
59ae15a5
PH
383 # Newline vs <br />
384 html = html.replace('\n', ' ')
6b3aef80
FV
385 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
386 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
59ae15a5
PH
387 # Strip html tags
388 html = re.sub('<.*?>', '', html)
389 # Replace html entities
390 html = unescapeHTML(html)
7decf895 391 return html.strip()
9e6dd238
FV
392
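# Usage sketch for clean_html (illustrative, hypothetical snippet):
#     >>> clean_html('<p>First line<br/>second &amp; last</p>')
#     'First line\nsecond & last'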
393
d77c3dfd 394def sanitize_open(filename, open_mode):
59ae15a5
PH
395 """Try to open the given filename, and slightly tweak it if this fails.
396
397 Attempts to open the given filename. If this fails, it tries to change
398 the filename slightly, step by step, until it's either able to open it
399 or it fails and raises a final exception, like the standard open()
400 function.
401
402 It returns the tuple (stream, definitive_file_name).
403 """
404 try:
28e614de 405 if filename == '-':
59ae15a5
PH
406 if sys.platform == 'win32':
407 import msvcrt
408 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
898280a0 409 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
59ae15a5
PH
410 stream = open(encodeFilename(filename), open_mode)
411 return (stream, filename)
412 except (IOError, OSError) as err:
f45c185f
PH
413 if err.errno in (errno.EACCES,):
414 raise
59ae15a5 415
f45c185f 416 # In case of error, try to remove win32 forbidden chars
d55de57b 417 alt_filename = sanitize_path(filename)
f45c185f
PH
418 if alt_filename == filename:
419 raise
420 else:
421 # An exception here should be caught in the caller
d55de57b 422 stream = open(encodeFilename(alt_filename), open_mode)
f45c185f 423 return (stream, alt_filename)
d77c3dfd
FV
424
425
426def timeconvert(timestr):
59ae15a5
PH
427 """Convert RFC 2822 defined time string into system timestamp"""
428 timestamp = None
429 timetuple = email.utils.parsedate_tz(timestr)
430 if timetuple is not None:
431 timestamp = email.utils.mktime_tz(timetuple)
432 return timestamp
1c469a94 433
5f6a1245 434
796173d0 435def sanitize_filename(s, restricted=False, is_id=False):
59ae15a5
PH
436 """Sanitizes a string so it could be used as part of a filename.
437 If restricted is set, use a stricter subset of allowed characters.
796173d0 438 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
59ae15a5
PH
439 """
440 def replace_insane(char):
c587cbb7
AT
441 if restricted and char in ACCENT_CHARS:
442 return ACCENT_CHARS[char]
59ae15a5
PH
443 if char == '?' or ord(char) < 32 or ord(char) == 127:
444 return ''
445 elif char == '"':
446 return '' if restricted else '\''
447 elif char == ':':
448 return '_-' if restricted else ' -'
449 elif char in '\\/|*<>':
450 return '_'
627dcfff 451 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
59ae15a5
PH
452 return '_'
453 if restricted and ord(char) > 127:
454 return '_'
455 return char
456
2aeb06d6
PH
457 # Handle timestamps
458 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
28e614de 459 result = ''.join(map(replace_insane, s))
796173d0
PH
460 if not is_id:
461 while '__' in result:
462 result = result.replace('__', '_')
463 result = result.strip('_')
464 # Common case of "Foreign band name - English song title"
465 if restricted and result.startswith('-_'):
466 result = result[2:]
5a42414b
PH
467 if result.startswith('-'):
468 result = '_' + result[len('-'):]
a7440261 469 result = result.lstrip('.')
796173d0
PH
470 if not result:
471 result = '_'
59ae15a5 472 return result
d77c3dfd 473
5f6a1245 474
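# Usage sketch for sanitize_filename (illustrative, hypothetical title):
#     >>> sanitize_filename('Foo: bar/baz "qux"')
#     "Foo - bar_baz 'qux'"
#     >>> sanitize_filename('Foo: bar/baz "qux"', restricted=True)
#     'Foo_-_bar_baz_qux'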
a2aaf4db
S
475def sanitize_path(s):
476 """Sanitizes and normalizes path on Windows"""
477 if sys.platform != 'win32':
478 return s
be531ef1
S
479 drive_or_unc, _ = os.path.splitdrive(s)
480 if sys.version_info < (2, 7) and not drive_or_unc:
481 drive_or_unc, _ = os.path.splitunc(s)
482 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
483 if drive_or_unc:
a2aaf4db
S
484 norm_path.pop(0)
485 sanitized_path = [
c90d16cf 486 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
a2aaf4db 487 for path_part in norm_path]
be531ef1
S
488 if drive_or_unc:
489 sanitized_path.insert(0, drive_or_unc + os.path.sep)
a2aaf4db
S
490 return os.path.join(*sanitized_path)
491
492
67dda517
S
493# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
494# unwanted failures due to missing protocol
17bcc626
S
495def sanitize_url(url):
496 return 'http:%s' % url if url.startswith('//') else url
497
498
67dda517 499def sanitized_Request(url, *args, **kwargs):
17bcc626 500 return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
67dda517
S
501
502
d77c3dfd 503def orderedSet(iterable):
59ae15a5
PH
504 """ Remove all duplicates from the input iterable """
505 res = []
506 for el in iterable:
507 if el not in res:
508 res.append(el)
509 return res
d77c3dfd 510
912b38b4 511
55b2f099 512def _htmlentity_transform(entity_with_semicolon):
4e408e47 513 """Transforms an HTML entity to a character."""
55b2f099
YCH
514 entity = entity_with_semicolon[:-1]
515
4e408e47
PH
516 # Known non-numeric HTML entity
517 if entity in compat_html_entities.name2codepoint:
518 return compat_chr(compat_html_entities.name2codepoint[entity])
519
55b2f099
YCH
520 # TODO: HTML5 allows entities without a semicolon. For example,
521 # '&Eacuteric' should be decoded as 'Éric'.
522 if entity_with_semicolon in compat_html_entities_html5:
523 return compat_html_entities_html5[entity_with_semicolon]
524
91757b0f 525 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
4e408e47
PH
526 if mobj is not None:
527 numstr = mobj.group(1)
28e614de 528 if numstr.startswith('x'):
4e408e47 529 base = 16
28e614de 530 numstr = '0%s' % numstr
4e408e47
PH
531 else:
532 base = 10
7aefc49c
S
533 # See https://github.com/rg3/youtube-dl/issues/7518
534 try:
535 return compat_chr(int(numstr, base))
536 except ValueError:
537 pass
4e408e47
PH
538
539 # Unknown entity in name, return its literal representation
7a3f0c00 540 return '&%s;' % entity
4e408e47
PH
541
542
d77c3dfd 543def unescapeHTML(s):
912b38b4
PH
544 if s is None:
545 return None
546 assert type(s) == compat_str
d77c3dfd 547
4e408e47 548 return re.sub(
55b2f099 549 r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 550
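# Usage sketch for unescapeHTML (illustrative):
#     >>> unescapeHTML('Ben &amp; Jerry&#39;s')
#     "Ben & Jerry's"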
8bf48f23 551
aa49acd1
S
552def get_subprocess_encoding():
553 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
554 # For subprocess calls, encode with locale encoding
555 # Refer to http://stackoverflow.com/a/9951851/35070
556 encoding = preferredencoding()
557 else:
558 encoding = sys.getfilesystemencoding()
559 if encoding is None:
560 encoding = 'utf-8'
561 return encoding
562
563
8bf48f23 564def encodeFilename(s, for_subprocess=False):
59ae15a5
PH
565 """
566 @param s The name of the file
567 """
d77c3dfd 568
8bf48f23 569 assert type(s) == compat_str
d77c3dfd 570
59ae15a5
PH
571 # Python 3 has a Unicode API
572 if sys.version_info >= (3, 0):
573 return s
0f00efed 574
aa49acd1
S
575 # Pass '' directly to use Unicode APIs on Windows 2000 and up
576 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
577 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
578 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
579 return s
580
8ee239e9
YCH
581 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
582 if sys.platform.startswith('java'):
583 return s
584
aa49acd1
S
585 return s.encode(get_subprocess_encoding(), 'ignore')
586
587
588def decodeFilename(b, for_subprocess=False):
589
590 if sys.version_info >= (3, 0):
591 return b
592
593 if not isinstance(b, bytes):
594 return b
595
596 return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 597
f07b74fc
PH
598
599def encodeArgument(s):
600 if not isinstance(s, compat_str):
601 # Legacy code that uses byte strings
602 # Uncomment the following line after fixing all post processors
7af808a5 603 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
f07b74fc
PH
604 s = s.decode('ascii')
605 return encodeFilename(s, True)
606
607
aa49acd1
S
608def decodeArgument(b):
609 return decodeFilename(b, True)
610
611
8271226a
PH
612def decodeOption(optval):
613 if optval is None:
614 return optval
615 if isinstance(optval, bytes):
616 optval = optval.decode(preferredencoding())
617
618 assert isinstance(optval, compat_str)
619 return optval
1c256f70 620
5f6a1245 621
4539dd30
PH
622def formatSeconds(secs):
623 if secs > 3600:
624 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
625 elif secs > 60:
626 return '%d:%02d' % (secs // 60, secs % 60)
627 else:
628 return '%d' % secs
629
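# Usage sketch for formatSeconds (illustrative):
#     >>> formatSeconds(3723)
#     '1:02:03'
#     >>> formatSeconds(45)
#     '45'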
a0ddb8a2 630
be4a824d
PH
631def make_HTTPS_handler(params, **kwargs):
632 opts_no_check_certificate = params.get('nocheckcertificate', False)
0db261ba 633 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
be5f2c19 634 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
0db261ba 635 if opts_no_check_certificate:
be5f2c19 636 context.check_hostname = False
0db261ba 637 context.verify_mode = ssl.CERT_NONE
a2366922 638 try:
be4a824d 639 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
a2366922
PH
640 except TypeError:
641 # Python 2.7.8
642 # (create_default_context present but HTTPSHandler has no context=)
643 pass
644
645 if sys.version_info < (3, 2):
d7932313 646 return YoutubeDLHTTPSHandler(params, **kwargs)
aa37e3d4 647 else: # Python < 3.4
d7932313 648 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
ea6d901e 649 context.verify_mode = (ssl.CERT_NONE
dca08720 650 if opts_no_check_certificate
ea6d901e 651 else ssl.CERT_REQUIRED)
303b479e 652 context.set_default_verify_paths()
be4a824d 653 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 654
732ea2f0 655
08f2a92c
JMF
656def bug_reports_message():
657 if ytdl_is_updateable():
658 update_cmd = 'type youtube-dl -U to update'
659 else:
660 update_cmd = 'see https://yt-dl.org/update on how to update'
661 msg = '; please report this issue on https://yt-dl.org/bug .'
662 msg += ' Make sure you are using the latest version; %s.' % update_cmd
663 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
664 return msg
665
666
1c256f70
PH
667class ExtractorError(Exception):
668 """Error during info extraction."""
5f6a1245 669
d11271dd 670 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
9a82b238
PH
671 """ tb, if given, is the original traceback (so that it can be printed out).
672 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
673 """
674
675 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
676 expected = True
d11271dd
PH
677 if video_id is not None:
678 msg = video_id + ': ' + msg
410f3e73 679 if cause:
28e614de 680 msg += ' (caused by %r)' % cause
9a82b238 681 if not expected:
08f2a92c 682 msg += bug_reports_message()
1c256f70 683 super(ExtractorError, self).__init__(msg)
d5979c5d 684
1c256f70 685 self.traceback = tb
8cc83b8d 686 self.exc_info = sys.exc_info() # preserve original exception
2eabb802 687 self.cause = cause
d11271dd 688 self.video_id = video_id
1c256f70 689
01951dda
PH
690 def format_traceback(self):
691 if self.traceback is None:
692 return None
28e614de 693 return ''.join(traceback.format_tb(self.traceback))
01951dda 694
1c256f70 695
416c7fcb
PH
696class UnsupportedError(ExtractorError):
697 def __init__(self, url):
698 super(UnsupportedError, self).__init__(
699 'Unsupported URL: %s' % url, expected=True)
700 self.url = url
701
702
55b3e45b
JMF
703class RegexNotFoundError(ExtractorError):
704 """Error when a regex didn't match"""
705 pass
706
707
d77c3dfd 708class DownloadError(Exception):
59ae15a5 709 """Download Error exception.
d77c3dfd 710
59ae15a5
PH
711 This exception may be thrown by FileDownloader objects if they are not
712 configured to continue on errors. They will contain the appropriate
713 error message.
714 """
5f6a1245 715
8cc83b8d
FV
716 def __init__(self, msg, exc_info=None):
717 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
718 super(DownloadError, self).__init__(msg)
719 self.exc_info = exc_info
d77c3dfd
FV
720
721
722class SameFileError(Exception):
59ae15a5 723 """Same File exception.
d77c3dfd 724
59ae15a5
PH
725 This exception will be thrown by FileDownloader objects if they detect
726 multiple files would have to be downloaded to the same file on disk.
727 """
728 pass
d77c3dfd
FV
729
730
731class PostProcessingError(Exception):
59ae15a5 732 """Post Processing exception.
d77c3dfd 733
59ae15a5
PH
734 This exception may be raised by PostProcessor's .run() method to
735 indicate an error in the postprocessing task.
736 """
5f6a1245 737
7851b379
PH
738 def __init__(self, msg):
739 self.msg = msg
d77c3dfd 740
5f6a1245 741
d77c3dfd 742class MaxDownloadsReached(Exception):
59ae15a5
PH
743 """ --max-downloads limit has been reached. """
744 pass
d77c3dfd
FV
745
746
747class UnavailableVideoError(Exception):
59ae15a5 748 """Unavailable Format exception.
d77c3dfd 749
59ae15a5
PH
750 This exception will be thrown when a video is requested
751 in a format that is not available for that video.
752 """
753 pass
d77c3dfd
FV
754
755
756class ContentTooShortError(Exception):
59ae15a5 757 """Content Too Short exception.
d77c3dfd 758
59ae15a5
PH
759 This exception may be raised by FileDownloader objects when a file they
760 download is too small for what the server announced first, indicating
761 the connection was probably interrupted.
762 """
d77c3dfd 763
59ae15a5 764 def __init__(self, downloaded, expected):
2c7ed247 765 # Both in bytes
59ae15a5
PH
766 self.downloaded = downloaded
767 self.expected = expected
d77c3dfd 768
5f6a1245 769
c5a59d93 770def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
e5e78797
S
771 # Working around a Python 2 bug (see http://bugs.python.org/issue17849) by limiting
772 # expected HTTP responses to meet HTTP/1.0 or later (see also
773 # https://github.com/rg3/youtube-dl/issues/6727)
774 if sys.version_info < (3, 0):
5a1a2e94 775 kwargs[b'strict'] = True
be4a824d
PH
776 hc = http_class(*args, **kwargs)
777 source_address = ydl_handler._params.get('source_address')
778 if source_address is not None:
779 sa = (source_address, 0)
780 if hasattr(hc, 'source_address'): # Python 2.7+
781 hc.source_address = sa
782 else: # Python 2.6
783 def _hc_connect(self, *args, **kwargs):
784 sock = compat_socket_create_connection(
785 (self.host, self.port), self.timeout, sa)
786 if is_https:
d7932313
PH
787 self.sock = ssl.wrap_socket(
788 sock, self.key_file, self.cert_file,
789 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
790 else:
791 self.sock = sock
792 hc.connect = functools.partial(_hc_connect, hc)
793
794 return hc
795
796
87f0e62d 797def handle_youtubedl_headers(headers):
992fc9d6
YCH
798 filtered_headers = headers
799
800 if 'Youtubedl-no-compression' in filtered_headers:
801 filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
87f0e62d 802 del filtered_headers['Youtubedl-no-compression']
87f0e62d 803
992fc9d6 804 return filtered_headers
87f0e62d
YCH
805
806
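# Usage sketch for handle_youtubedl_headers (illustrative header dict):
#     >>> handle_youtubedl_headers({'User-Agent': 'ua', 'Accept-Encoding': 'gzip', 'Youtubedl-no-compression': 'True'})
#     {'User-Agent': 'ua'}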
acebc9cd 807class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
59ae15a5
PH
808 """Handler for HTTP requests and responses.
809
810 This class, when installed with an OpenerDirector, automatically adds
811 the standard headers to every HTTP request and handles gzipped and
812 deflated responses from web servers. If compression is to be avoided in
813 a particular request, the original request in the program code only has
0424ec30 814 to include the HTTP header "Youtubedl-no-compression", which will be
59ae15a5
PH
815 removed before making the real request.
816
817 Part of this code was copied from:
818
819 http://techknack.net/python-urllib2-handlers/
820
821 Andrew Rowls, the author of that code, agreed to release it to the
822 public domain.
823 """
824
be4a824d
PH
825 def __init__(self, params, *args, **kwargs):
826 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
827 self._params = params
828
829 def http_open(self, req):
71aff188
YCH
830 conn_class = compat_http_client.HTTPConnection
831
832 socks_proxy = req.headers.get('Ytdl-socks-proxy')
833 if socks_proxy:
834 conn_class = make_socks_conn_class(conn_class, socks_proxy)
835 del req.headers['Ytdl-socks-proxy']
836
be4a824d 837 return self.do_open(functools.partial(
71aff188 838 _create_http_connection, self, conn_class, False),
be4a824d
PH
839 req)
840
59ae15a5
PH
841 @staticmethod
842 def deflate(data):
843 try:
844 return zlib.decompress(data, -zlib.MAX_WBITS)
845 except zlib.error:
846 return zlib.decompress(data)
847
848 @staticmethod
849 def addinfourl_wrapper(stream, headers, url, code):
850 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
851 return compat_urllib_request.addinfourl(stream, headers, url, code)
852 ret = compat_urllib_request.addinfourl(stream, headers, url)
853 ret.code = code
854 return ret
855
acebc9cd 856 def http_request(self, req):
51f267d9
S
857 # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
858 # always respected by websites: some tend to give out URLs with non-percent-encoded
859 # non-ASCII characters (see telemb.py, ard.py [#3412])
860 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
861 # To work around aforementioned issue we will replace request's original URL with
862 # percent-encoded one
863 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
864 # the code of this workaround has been moved here from YoutubeDL.urlopen()
865 url = req.get_full_url()
866 url_escaped = escape_url(url)
867
868 # Substitute URL if any change after escaping
869 if url != url_escaped:
15d260eb 870 req = update_Request(req, url=url_escaped)
51f267d9 871
33ac271b 872 for h, v in std_headers.items():
3d5f7a39
JK
873 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
874 # The dict keys are capitalized because of this bug by urllib
875 if h.capitalize() not in req.headers:
33ac271b 876 req.add_header(h, v)
87f0e62d
YCH
877
878 req.headers = handle_youtubedl_headers(req.headers)
989b4b2b
PH
879
880 if sys.version_info < (2, 7) and '#' in req.get_full_url():
881 # Python 2.6 is brain-dead when it comes to fragments
882 req._Request__original = req._Request__original.partition('#')[0]
883 req._Request__r_type = req._Request__r_type.partition('#')[0]
884
59ae15a5
PH
885 return req
886
acebc9cd 887 def http_response(self, req, resp):
59ae15a5
PH
888 old_resp = resp
889 # gzip
890 if resp.headers.get('Content-encoding', '') == 'gzip':
aa3e9507
PH
891 content = resp.read()
892 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
893 try:
894 uncompressed = io.BytesIO(gz.read())
895 except IOError as original_ioerror:
896 # There may be junk at the end of the file
897 # See http://stackoverflow.com/q/4928560/35070 for details
898 for i in range(1, 1024):
899 try:
900 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
901 uncompressed = io.BytesIO(gz.read())
902 except IOError:
903 continue
904 break
905 else:
906 raise original_ioerror
907 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 908 resp.msg = old_resp.msg
c047270c 909 del resp.headers['Content-encoding']
59ae15a5
PH
910 # deflate
911 if resp.headers.get('Content-encoding', '') == 'deflate':
912 gz = io.BytesIO(self.deflate(resp.read()))
913 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
914 resp.msg = old_resp.msg
c047270c 915 del resp.headers['Content-encoding']
ad729172
S
916 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
917 # https://github.com/rg3/youtube-dl/issues/6457).
5a4d9ddb
S
918 if 300 <= resp.code < 400:
919 location = resp.headers.get('Location')
920 if location:
921 # Per RFC 2616, the default charset is iso-8859-1, which is respected by Python 3
922 if sys.version_info >= (3, 0):
923 location = location.encode('iso-8859-1').decode('utf-8')
0ea59007
YCH
924 else:
925 location = location.decode('utf-8')
5a4d9ddb
S
926 location_escaped = escape_url(location)
927 if location != location_escaped:
928 del resp.headers['Location']
9a4aec8b
YCH
929 if sys.version_info < (3, 0):
930 location_escaped = location_escaped.encode('utf-8')
5a4d9ddb 931 resp.headers['Location'] = location_escaped
59ae15a5 932 return resp
0f8d03f8 933
acebc9cd
PH
934 https_request = http_request
935 https_response = http_response
bf50b038 936
5de90176 937
71aff188
YCH
938def make_socks_conn_class(base_class, socks_proxy):
939 assert issubclass(base_class, (
940 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
941
942 url_components = compat_urlparse.urlparse(socks_proxy)
943 if url_components.scheme.lower() == 'socks5':
944 socks_type = ProxyType.SOCKS5
945 elif url_components.scheme.lower() in ('socks', 'socks4'):
946 socks_type = ProxyType.SOCKS4
51fb4995
YCH
947 elif url_components.scheme.lower() == 'socks4a':
948 socks_type = ProxyType.SOCKS4A
71aff188 949
cdd94c2e
YCH
950 def unquote_if_non_empty(s):
951 if not s:
952 return s
953 return compat_urllib_parse_unquote_plus(s)
954
71aff188
YCH
955 proxy_args = (
956 socks_type,
957 url_components.hostname, url_components.port or 1080,
958 True, # Remote DNS
cdd94c2e
YCH
959 unquote_if_non_empty(url_components.username),
960 unquote_if_non_empty(url_components.password),
71aff188
YCH
961 )
962
963 class SocksConnection(base_class):
964 def connect(self):
965 self.sock = sockssocket()
966 self.sock.setproxy(*proxy_args)
967 if type(self.timeout) in (int, float):
968 self.sock.settimeout(self.timeout)
969 self.sock.connect((self.host, self.port))
970
971 if isinstance(self, compat_http_client.HTTPSConnection):
972 if hasattr(self, '_context'): # Python > 2.6
973 self.sock = self._context.wrap_socket(
974 self.sock, server_hostname=self.host)
975 else:
976 self.sock = ssl.wrap_socket(self.sock)
977
978 return SocksConnection
979
980
be4a824d
PH
981class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
982 def __init__(self, params, https_conn_class=None, *args, **kwargs):
983 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
984 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
985 self._params = params
986
987 def https_open(self, req):
4f264c02 988 kwargs = {}
71aff188
YCH
989 conn_class = self._https_conn_class
990
4f264c02
JMF
991 if hasattr(self, '_context'): # python > 2.6
992 kwargs['context'] = self._context
993 if hasattr(self, '_check_hostname'): # python 3.x
994 kwargs['check_hostname'] = self._check_hostname
71aff188
YCH
995
996 socks_proxy = req.headers.get('Ytdl-socks-proxy')
997 if socks_proxy:
998 conn_class = make_socks_conn_class(conn_class, socks_proxy)
999 del req.headers['Ytdl-socks-proxy']
1000
be4a824d 1001 return self.do_open(functools.partial(
71aff188 1002 _create_http_connection, self, conn_class, True),
4f264c02 1003 req, **kwargs)
be4a824d
PH
1004
1005
a6420bf5
S
1006class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1007 def __init__(self, cookiejar=None):
1008 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1009
1010 def http_response(self, request, response):
1011 # Python 2 will choke on the next HTTP request in a row if there are non-ASCII
1012 # characters in Set-Cookie HTTP header of last response (see
1013 # https://github.com/rg3/youtube-dl/issues/6769).
1014 # In order to at least prevent crashing we will percent encode Set-Cookie
1015 # header before HTTPCookieProcessor starts processing it.
e28034c5
S
1016 # if sys.version_info < (3, 0) and response.headers:
1017 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1018 # set_cookie = response.headers.get(set_cookie_header)
1019 # if set_cookie:
1020 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1021 # if set_cookie != set_cookie_escaped:
1022 # del response.headers[set_cookie_header]
1023 # response.headers[set_cookie_header] = set_cookie_escaped
a6420bf5
S
1024 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1025
1026 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1027 https_response = http_response
1028
1029
46f59e89
S
1030def extract_timezone(date_str):
1031 m = re.search(
1032 r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
1033 date_str)
1034 if not m:
1035 timezone = datetime.timedelta()
1036 else:
1037 date_str = date_str[:-len(m.group('tz'))]
1038 if not m.group('sign'):
1039 timezone = datetime.timedelta()
1040 else:
1041 sign = 1 if m.group('sign') == '+' else -1
1042 timezone = datetime.timedelta(
1043 hours=sign * int(m.group('hours')),
1044 minutes=sign * int(m.group('minutes')))
1045 return timezone, date_str
1046
1047
08b38d54 1048def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1049 """ Return a UNIX timestamp from the given date """
1050
1051 if date_str is None:
1052 return None
1053
52c3a6e4
S
1054 date_str = re.sub(r'\.[0-9]+', '', date_str)
1055
08b38d54 1056 if timezone is None:
46f59e89
S
1057 timezone, date_str = extract_timezone(date_str)
1058
52c3a6e4
S
1059 try:
1060 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1061 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1062 return calendar.timegm(dt.timetuple())
1063 except ValueError:
1064 pass
912b38b4
PH
1065
1066
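# Usage sketch for parse_iso8601 (illustrative timestamps):
#     >>> parse_iso8601('2016-06-21T12:00:00Z')
#     1466510400
#     >>> parse_iso8601('2016-06-21T14:00:00+02:00')
#     1466510400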
46f59e89
S
1067def date_formats(day_first=True):
1068 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1069
1070
42bdd9d0 1071def unified_strdate(date_str, day_first=True):
bf50b038 1072 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1073
1074 if date_str is None:
1075 return None
bf50b038 1076 upload_date = None
5f6a1245 1077 # Replace commas
026fcc04 1078 date_str = date_str.replace(',', ' ')
42bdd9d0 1079 # Remove AM/PM + timezone
9bb8e0a3 1080 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1081 _, date_str = extract_timezone(date_str)
42bdd9d0 1082
46f59e89 1083 for expression in date_formats(day_first):
bf50b038
JMF
1084 try:
1085 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
5de90176 1086 except ValueError:
bf50b038 1087 pass
42393ce2
PH
1088 if upload_date is None:
1089 timetuple = email.utils.parsedate_tz(date_str)
1090 if timetuple:
c6b9cf05
S
1091 try:
1092 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1093 except ValueError:
1094 pass
6a750402
JMF
1095 if upload_date is not None:
1096 return compat_str(upload_date)
bf50b038 1097
5f6a1245 1098
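# Usage sketch for unified_strdate (illustrative date strings):
#     >>> unified_strdate('December 21, 2016')
#     '20161221'
#     >>> unified_strdate('21/12/2016')
#     '20161221'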
46f59e89
S
1099def unified_timestamp(date_str, day_first=True):
1100 if date_str is None:
1101 return None
1102
1103 date_str = date_str.replace(',', ' ')
1104
7dc2a74e 1105 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1106 timezone, date_str = extract_timezone(date_str)
1107
1108 # Remove AM/PM + timezone
1109 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1110
1111 for expression in date_formats(day_first):
1112 try:
7dc2a74e 1113 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89
S
1114 return calendar.timegm(dt.timetuple())
1115 except ValueError:
1116 pass
1117 timetuple = email.utils.parsedate_tz(date_str)
1118 if timetuple:
7dc2a74e 1119 return calendar.timegm(timetuple) + pm_delta * 3600
46f59e89
S
1120
1121
28e614de 1122def determine_ext(url, default_ext='unknown_video'):
f4776371
S
1123 if url is None:
1124 return default_ext
9cb9a5df 1125 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1126 if re.match(r'^[A-Za-z0-9]+$', guess):
1127 return guess
a7aaa398
S
1128 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1129 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1130 return guess.rstrip('/')
73e79f2a 1131 else:
cbdbb766 1132 return default_ext
73e79f2a 1133
5f6a1245 1134
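# Usage sketch for determine_ext (illustrative URLs):
#     >>> determine_ext('http://example.com/video.mp4?dl=1')
#     'mp4'
#     >>> determine_ext('http://example.com/foo/bar.m3u8/?download')
#     'm3u8'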
d4051a8e 1135def subtitles_filename(filename, sub_lang, sub_format):
28e614de 1136 return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
d4051a8e 1137
5f6a1245 1138
bd558525 1139def date_from_str(date_str):
37254abc
JMF
1140 """
1141 Return a datetime object from a string in the format YYYYMMDD or
1142 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1143 today = datetime.date.today()
f8795e10 1144 if date_str in ('now', 'today'):
37254abc 1145 return today
f8795e10
PH
1146 if date_str == 'yesterday':
1147 return today - datetime.timedelta(days=1)
37254abc
JMF
1148 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1149 if match is not None:
1150 sign = match.group('sign')
1151 time = int(match.group('time'))
1152 if sign == '-':
1153 time = -time
1154 unit = match.group('unit')
dfb1b146 1155 # A bad approximation?
37254abc
JMF
1156 if unit == 'month':
1157 unit = 'day'
1158 time *= 30
1159 elif unit == 'year':
1160 unit = 'day'
1161 time *= 365
1162 unit += 's'
1163 delta = datetime.timedelta(**{unit: time})
1164 return today + delta
611c1dd9 1165 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
5f6a1245
JW
1166
1167
e63fc1be 1168def hyphenate_date(date_str):
1169 """
1170 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1171 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1172 if match is not None:
1173 return '-'.join(match.groups())
1174 else:
1175 return date_str
1176
5f6a1245 1177
bd558525
JMF
1178class DateRange(object):
1179 """Represents a time interval between two dates"""
5f6a1245 1180
bd558525
JMF
1181 def __init__(self, start=None, end=None):
1182 """start and end must be strings in the format accepted by date"""
1183 if start is not None:
1184 self.start = date_from_str(start)
1185 else:
1186 self.start = datetime.datetime.min.date()
1187 if end is not None:
1188 self.end = date_from_str(end)
1189 else:
1190 self.end = datetime.datetime.max.date()
37254abc 1191 if self.start > self.end:
bd558525 1192 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
5f6a1245 1193
bd558525
JMF
1194 @classmethod
1195 def day(cls, day):
1196 """Returns a range that only contains the given day"""
5f6a1245
JW
1197 return cls(day, day)
1198
bd558525
JMF
1199 def __contains__(self, date):
1200 """Check if the date is in the range"""
37254abc
JMF
1201 if not isinstance(date, datetime.date):
1202 date = date_from_str(date)
1203 return self.start <= date <= self.end
5f6a1245 1204
bd558525 1205 def __str__(self):
5f6a1245 1206 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
1207
1208
1209def platform_name():
1210 """ Returns the platform name as a compat_str """
1211 res = platform.platform()
1212 if isinstance(res, bytes):
1213 res = res.decode(preferredencoding())
1214
1215 assert isinstance(res, compat_str)
1216 return res
c257baff
PH
1217
1218
b58ddb32
PH
1219def _windows_write_string(s, out):
1220 """ Returns True if the string was written using special methods,
1221 False if it has yet to be written out."""
1222 # Adapted from http://stackoverflow.com/a/3259271/35070
1223
1224 import ctypes
1225 import ctypes.wintypes
1226
1227 WIN_OUTPUT_IDS = {
1228 1: -11,
1229 2: -12,
1230 }
1231
a383a98a
PH
1232 try:
1233 fileno = out.fileno()
1234 except AttributeError:
1235 # If the output stream doesn't have a fileno, it's virtual
1236 return False
aa42e873
PH
1237 except io.UnsupportedOperation:
1238 # Some strange Windows pseudo files?
1239 return False
b58ddb32
PH
1240 if fileno not in WIN_OUTPUT_IDS:
1241 return False
1242
e2f89ec7 1243 GetStdHandle = ctypes.WINFUNCTYPE(
b58ddb32 1244 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
611c1dd9 1245 (b'GetStdHandle', ctypes.windll.kernel32))
b58ddb32
PH
1246 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1247
e2f89ec7 1248 WriteConsoleW = ctypes.WINFUNCTYPE(
b58ddb32
PH
1249 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1250 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
611c1dd9 1251 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
b58ddb32
PH
1252 written = ctypes.wintypes.DWORD(0)
1253
611c1dd9 1254 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
b58ddb32
PH
1255 FILE_TYPE_CHAR = 0x0002
1256 FILE_TYPE_REMOTE = 0x8000
e2f89ec7 1257 GetConsoleMode = ctypes.WINFUNCTYPE(
b58ddb32
PH
1258 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1259 ctypes.POINTER(ctypes.wintypes.DWORD))(
611c1dd9 1260 (b'GetConsoleMode', ctypes.windll.kernel32))
b58ddb32
PH
1261 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1262
1263 def not_a_console(handle):
1264 if handle == INVALID_HANDLE_VALUE or handle is None:
1265 return True
8fb3ac36
PH
1266 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1267 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
b58ddb32
PH
1268
1269 if not_a_console(h):
1270 return False
1271
d1b9c912
PH
1272 def next_nonbmp_pos(s):
1273 try:
1274 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1275 except StopIteration:
1276 return len(s)
1277
1278 while s:
1279 count = min(next_nonbmp_pos(s), 1024)
1280
b58ddb32 1281 ret = WriteConsoleW(
d1b9c912 1282 h, s, count if count else 2, ctypes.byref(written), None)
b58ddb32
PH
1283 if ret == 0:
1284 raise OSError('Failed to write string')
d1b9c912
PH
1285 if not count: # We just wrote a non-BMP character
1286 assert written.value == 2
1287 s = s[1:]
1288 else:
1289 assert written.value > 0
1290 s = s[written.value:]
b58ddb32
PH
1291 return True
1292
1293
734f90bb 1294def write_string(s, out=None, encoding=None):
7459e3a2
PH
1295 if out is None:
1296 out = sys.stderr
8bf48f23 1297 assert type(s) == compat_str
7459e3a2 1298
b58ddb32
PH
1299 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1300 if _windows_write_string(s, out):
1301 return
1302
7459e3a2
PH
1303 if ('b' in getattr(out, 'mode', '') or
1304 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
104aa738
PH
1305 byt = s.encode(encoding or preferredencoding(), 'ignore')
1306 out.write(byt)
1307 elif hasattr(out, 'buffer'):
1308 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1309 byt = s.encode(enc, 'ignore')
1310 out.buffer.write(byt)
1311 else:
8bf48f23 1312 out.write(s)
7459e3a2
PH
1313 out.flush()
1314
1315
48ea9cea
PH
1316def bytes_to_intlist(bs):
1317 if not bs:
1318 return []
1319 if isinstance(bs[0], int): # Python 3
1320 return list(bs)
1321 else:
1322 return [ord(c) for c in bs]
1323
c257baff 1324
cba892fa 1325def intlist_to_bytes(xs):
1326 if not xs:
1327 return b''
edaa23f8 1328 return compat_struct_pack('%dB' % len(xs), *xs)
c38b1e77
PH
1329
1330
c1c9a79c
PH
1331# Cross-platform file locking
1332if sys.platform == 'win32':
1333 import ctypes.wintypes
1334 import msvcrt
1335
1336 class OVERLAPPED(ctypes.Structure):
1337 _fields_ = [
1338 ('Internal', ctypes.wintypes.LPVOID),
1339 ('InternalHigh', ctypes.wintypes.LPVOID),
1340 ('Offset', ctypes.wintypes.DWORD),
1341 ('OffsetHigh', ctypes.wintypes.DWORD),
1342 ('hEvent', ctypes.wintypes.HANDLE),
1343 ]
1344
1345 kernel32 = ctypes.windll.kernel32
1346 LockFileEx = kernel32.LockFileEx
1347 LockFileEx.argtypes = [
1348 ctypes.wintypes.HANDLE, # hFile
1349 ctypes.wintypes.DWORD, # dwFlags
1350 ctypes.wintypes.DWORD, # dwReserved
1351 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1352 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1353 ctypes.POINTER(OVERLAPPED) # Overlapped
1354 ]
1355 LockFileEx.restype = ctypes.wintypes.BOOL
1356 UnlockFileEx = kernel32.UnlockFileEx
1357 UnlockFileEx.argtypes = [
1358 ctypes.wintypes.HANDLE, # hFile
1359 ctypes.wintypes.DWORD, # dwReserved
1360 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1361 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1362 ctypes.POINTER(OVERLAPPED) # Overlapped
1363 ]
1364 UnlockFileEx.restype = ctypes.wintypes.BOOL
1365 whole_low = 0xffffffff
1366 whole_high = 0x7fffffff
1367
1368 def _lock_file(f, exclusive):
1369 overlapped = OVERLAPPED()
1370 overlapped.Offset = 0
1371 overlapped.OffsetHigh = 0
1372 overlapped.hEvent = 0
1373 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1374 handle = msvcrt.get_osfhandle(f.fileno())
1375 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1376 whole_low, whole_high, f._lock_file_overlapped_p):
1377 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1378
1379 def _unlock_file(f):
1380 assert f._lock_file_overlapped_p
1381 handle = msvcrt.get_osfhandle(f.fileno())
1382 if not UnlockFileEx(handle, 0,
1383 whole_low, whole_high, f._lock_file_overlapped_p):
1384 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1385
1386else:
399a76e6
YCH
1387 # Some platforms, such as Jython, are missing fcntl
1388 try:
1389 import fcntl
c1c9a79c 1390
399a76e6
YCH
1391 def _lock_file(f, exclusive):
1392 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
c1c9a79c 1393
399a76e6
YCH
1394 def _unlock_file(f):
1395 fcntl.flock(f, fcntl.LOCK_UN)
1396 except ImportError:
1397 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1398
1399 def _lock_file(f, exclusive):
1400 raise IOError(UNSUPPORTED_MSG)
1401
1402 def _unlock_file(f):
1403 raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
1404
1405
1406class locked_file(object):
1407 def __init__(self, filename, mode, encoding=None):
1408 assert mode in ['r', 'a', 'w']
1409 self.f = io.open(filename, mode, encoding=encoding)
1410 self.mode = mode
1411
1412 def __enter__(self):
1413 exclusive = self.mode != 'r'
1414 try:
1415 _lock_file(self.f, exclusive)
1416 except IOError:
1417 self.f.close()
1418 raise
1419 return self
1420
1421 def __exit__(self, etype, value, traceback):
1422 try:
1423 _unlock_file(self.f)
1424 finally:
1425 self.f.close()
1426
1427 def __iter__(self):
1428 return iter(self.f)
1429
1430 def write(self, *args):
1431 return self.f.write(*args)
1432
1433 def read(self, *args):
1434 return self.f.read(*args)
4eb7f1d1
JMF
1435
1436
4644ac55
S
1437def get_filesystem_encoding():
1438 encoding = sys.getfilesystemencoding()
1439 return encoding if encoding is not None else 'utf-8'
1440
1441
4eb7f1d1 1442def shell_quote(args):
a6a173c2 1443 quoted_args = []
4644ac55 1444 encoding = get_filesystem_encoding()
a6a173c2
JMF
1445 for a in args:
1446 if isinstance(a, bytes):
1447 # We may get a filename encoded with 'encodeFilename'
1448 a = a.decode(encoding)
1449 quoted_args.append(pipes.quote(a))
28e614de 1450 return ' '.join(quoted_args)
9d4660ca
PH
1451
1452
1453def smuggle_url(url, data):
1454 """ Pass additional data in a URL for internal use. """
1455
81953d1a
RA
1456 url, idata = unsmuggle_url(url, {})
1457 data.update(idata)
15707c7e 1458 sdata = compat_urllib_parse_urlencode(
28e614de
PH
1459 {'__youtubedl_smuggle': json.dumps(data)})
1460 return url + '#' + sdata
9d4660ca
PH
1461
1462
79f82953 1463def unsmuggle_url(smug_url, default=None):
83e865a3 1464 if '#__youtubedl_smuggle' not in smug_url:
79f82953 1465 return smug_url, default
28e614de
PH
1466 url, _, sdata = smug_url.rpartition('#')
1467 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
1468 data = json.loads(jsond)
1469 return url, data
02dbf93f
PH
1470
1471
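# Usage sketch for smuggle_url / unsmuggle_url (illustrative, hypothetical URL):
#     >>> url = smuggle_url('http://example.com/video', {'referer': 'http://example.com/'})
#     >>> unsmuggle_url(url)
#     ('http://example.com/video', {'referer': 'http://example.com/'})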
02dbf93f
PH
1472def format_bytes(bytes):
1473 if bytes is None:
28e614de 1474 return 'N/A'
02dbf93f
PH
1475 if type(bytes) is str:
1476 bytes = float(bytes)
1477 if bytes == 0.0:
1478 exponent = 0
1479 else:
1480 exponent = int(math.log(bytes, 1024.0))
28e614de 1481 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
02dbf93f 1482 converted = float(bytes) / float(1024 ** exponent)
28e614de 1483 return '%.2f%s' % (converted, suffix)
f53c966a 1484
1c088fa8 1485
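# Usage sketch for format_bytes (illustrative):
#     >>> format_bytes(1536)
#     '1.50KiB'
#     >>> format_bytes(None)
#     'N/A'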
fb47597b
S
1486def lookup_unit_table(unit_table, s):
1487 units_re = '|'.join(re.escape(u) for u in unit_table)
1488 m = re.match(
782b1b5b 1489 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
fb47597b
S
1490 if not m:
1491 return None
1492 num_str = m.group('num').replace(',', '.')
1493 mult = unit_table[m.group('unit')]
1494 return int(float(num_str) * mult)
1495
1496
be64b5b0
PH
1497def parse_filesize(s):
1498 if s is None:
1499 return None
1500
dfb1b146 1501 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
1502 # but we support those too
1503 _UNIT_TABLE = {
1504 'B': 1,
1505 'b': 1,
1506 'KiB': 1024,
1507 'KB': 1000,
1508 'kB': 1024,
1509 'Kb': 1000,
1510 'MiB': 1024 ** 2,
1511 'MB': 1000 ** 2,
1512 'mB': 1024 ** 2,
1513 'Mb': 1000 ** 2,
1514 'GiB': 1024 ** 3,
1515 'GB': 1000 ** 3,
1516 'gB': 1024 ** 3,
1517 'Gb': 1000 ** 3,
1518 'TiB': 1024 ** 4,
1519 'TB': 1000 ** 4,
1520 'tB': 1024 ** 4,
1521 'Tb': 1000 ** 4,
1522 'PiB': 1024 ** 5,
1523 'PB': 1000 ** 5,
1524 'pB': 1024 ** 5,
1525 'Pb': 1000 ** 5,
1526 'EiB': 1024 ** 6,
1527 'EB': 1000 ** 6,
1528 'eB': 1024 ** 6,
1529 'Eb': 1000 ** 6,
1530 'ZiB': 1024 ** 7,
1531 'ZB': 1000 ** 7,
1532 'zB': 1024 ** 7,
1533 'Zb': 1000 ** 7,
1534 'YiB': 1024 ** 8,
1535 'YB': 1000 ** 8,
1536 'yB': 1024 ** 8,
1537 'Yb': 1000 ** 8,
1538 }
1539
fb47597b
S
1540 return lookup_unit_table(_UNIT_TABLE, s)
1541
1542
1543def parse_count(s):
1544 if s is None:
be64b5b0
PH
1545 return None
1546
fb47597b
S
1547 s = s.strip()
1548
1549 if re.match(r'^[\d,.]+$', s):
1550 return str_to_int(s)
1551
1552 _UNIT_TABLE = {
1553 'k': 1000,
1554 'K': 1000,
1555 'm': 1000 ** 2,
1556 'M': 1000 ** 2,
1557 'kk': 1000 ** 2,
1558 'KK': 1000 ** 2,
1559 }
be64b5b0 1560
fb47597b 1561 return lookup_unit_table(_UNIT_TABLE, s)
be64b5b0 1562
2f7ae819 1563
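# Usage sketch for parse_filesize / parse_count (illustrative):
#     >>> parse_filesize('1.5 MiB')
#     1572864
#     >>> parse_count('2.5k')
#     2500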
caefb1de
PH
1564def month_by_name(name):
1565 """ Return the number of a month by (locale-independently) English name """
1566
caefb1de 1567 try:
7105440c
YCH
1568 return ENGLISH_MONTH_NAMES.index(name) + 1
1569 except ValueError:
1570 return None
1571
1572
1573def month_by_abbreviation(abbrev):
1574 """ Return the number of a month by (locale-independently) English
1575 abbreviations """
1576
1577 try:
1578 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
1579 except ValueError:
1580 return None
18258362
JMF
1581
1582
5aafe895 1583def fix_xml_ampersands(xml_str):
18258362 1584 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
1585 return re.sub(
1586 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 1587 '&amp;',
5aafe895 1588 xml_str)
e3946f98
PH
1589
1590
1591def setproctitle(title):
8bf48f23 1592 assert isinstance(title, compat_str)
c1c05c67
YCH
1593
1594 # ctypes in Jython is not complete
1595 # http://bugs.jython.org/issue2148
1596 if sys.platform.startswith('java'):
1597 return
1598
e3946f98 1599 try:
611c1dd9 1600 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
1601 except OSError:
1602 return
6eefe533
PH
1603 title_bytes = title.encode('utf-8')
1604 buf = ctypes.create_string_buffer(len(title_bytes))
1605 buf.value = title_bytes
e3946f98 1606 try:
6eefe533 1607 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
1608 except AttributeError:
1609 return # Strange libc, just skip this
d7dda168
PH
1610
1611
1612def remove_start(s, start):
46bc9b7d 1613 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
1614
1615
2b9faf55 1616def remove_end(s, end):
46bc9b7d 1617 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
1618
1619
31b2051e
S
1620def remove_quotes(s):
1621 if s is None or len(s) < 2:
1622 return s
1623 for quote in ('"', "'", ):
1624 if s[0] == quote and s[-1] == quote:
1625 return s[1:-1]
1626 return s
1627
1628
29eb5174 1629def url_basename(url):
9b8aaeed 1630 path = compat_urlparse.urlparse(url).path
28e614de 1631 return path.strip('/').split('/')[-1]
aa94a6d3
PH
1632
1633
1634class HEADRequest(compat_urllib_request.Request):
1635 def get_method(self):
611c1dd9 1636 return 'HEAD'
7217e148
PH
1637
1638
95cf60e8
S
1639class PUTRequest(compat_urllib_request.Request):
1640 def get_method(self):
1641 return 'PUT'
1642
1643
9732d77e 1644def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
28746fbd
PH
1645 if get_attr:
1646 if v is not None:
1647 v = getattr(v, get_attr, None)
9572013d
PH
1648 if v == '':
1649 v = None
1812afb7
S
1650 if v is None:
1651 return default
1652 try:
1653 return int(v) * invscale // scale
1654 except ValueError:
af98f8ff 1655 return default
9732d77e 1656
9572013d 1657
40a90862
JMF
1658def str_or_none(v, default=None):
1659 return default if v is None else compat_str(v)
1660
9732d77e
PH
1661
1662def str_to_int(int_str):
48d4681e 1663 """ A more relaxed version of int_or_none """
9732d77e
PH
1664 if int_str is None:
1665 return None
28e614de 1666 int_str = re.sub(r'[,\.\+]', '', int_str)
9732d77e 1667 return int(int_str)
608d11f5
PH
1668
1669
9732d77e 1670def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
1671 if v is None:
1672 return default
1673 try:
1674 return float(v) * invscale / scale
1675 except ValueError:
1676 return default
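# Illustrative usage of the numeric coercion helpers (added examples):
#   int_or_none('42')               -> 42
#   int_or_none('')                 -> None
#   int_or_none(2500, scale=1000)   -> 2
#   str_to_int('1,000,000')         -> 1000000
#   float_or_none('180.5')          -> 180.5
#   float_or_none('n/a')            -> None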
43f775e4
PH
1677
1678
b72b4431
S
1679def strip_or_none(v):
1680 return None if v is None else v.strip()
1681
1682
608d11f5 1683def parse_duration(s):
8f9312c3 1684 if not isinstance(s, compat_basestring):
608d11f5
PH
1685 return None
1686
ca7b3246
S
1687 s = s.strip()
1688
acaff495 1689 days, hours, mins, secs, ms = [None] * 5
1690 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
1691 if m:
1692 days, hours, mins, secs, ms = m.groups()
1693 else:
1694 m = re.match(
1695 r'''(?ix)(?:P?T)?
8f4b58d7 1696 (?:
acaff495 1697 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
8f4b58d7 1698 )?
acaff495 1699 (?:
1700 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1701 )?
1702 (?:
1703 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1704 )?
1705 (?:
1706 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1707 )?$''', s)
1708 if m:
1709 days, hours, mins, secs, ms = m.groups()
1710 else:
1711 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
1712 if m:
1713 hours, mins = m.groups()
1714 else:
1715 return None
1716
1717 duration = 0
1718 if secs:
1719 duration += float(secs)
1720 if mins:
1721 duration += float(mins) * 60
1722 if hours:
1723 duration += float(hours) * 60 * 60
1724 if days:
1725 duration += float(days) * 24 * 60 * 60
1726 if ms:
1727 duration += float(ms)
1728 return duration
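# Illustrative usage of parse_duration() (added examples):
#   parse_duration('1:23:45')   -> 5025.0
#   parse_duration('90s')       -> 90.0
#   parse_duration('1h30m')     -> 5400.0
#   parse_duration('PT1M30S')   -> 90.0
#   parse_duration('soon')      -> None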
91d7d0b3
JMF
1729
1730
e65e4c88 1731def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 1732 name, real_ext = os.path.splitext(filename)
e65e4c88
S
1733 return (
1734 '{0}.{1}{2}'.format(name, ext, real_ext)
1735 if not expected_real_ext or real_ext[1:] == expected_real_ext
1736 else '{0}.{1}'.format(filename, ext))
d70ad093
PH
1737
1738
b3ed15b7
S
1739def replace_extension(filename, ext, expected_real_ext=None):
1740 name, real_ext = os.path.splitext(filename)
1741 return '{0}.{1}'.format(
1742 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1743 ext)
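# Illustrative usage (added examples):
#   prepend_extension('video.mp4', 'temp')          -> 'video.temp.mp4'
#   prepend_extension('video.mkv', 'temp', 'mp4')   -> 'video.mkv.temp'
#   replace_extension('video.mp4', 'mp3')           -> 'video.mp3'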
1744
1745
d70ad093
PH
1746def check_executable(exe, args=[]):
1747 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1748 args can be a list of arguments that make the binary produce a short output (like -version) """
1749 try:
1750 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1751 except OSError:
1752 return False
1753 return exe
b7ab0590
PH
1754
1755
95807118 1756def get_exe_version(exe, args=['--version'],
cae97f65 1757 version_re=None, unrecognized='present'):
95807118
PH
1758 """ Returns the version of the specified executable,
1759 or False if the executable is not present """
1760 try:
cae97f65 1761 out, _ = subprocess.Popen(
54116803 1762 [encodeArgument(exe)] + args,
95807118
PH
1763 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1764 except OSError:
1765 return False
cae97f65
PH
1766 if isinstance(out, bytes): # Python 2.x
1767 out = out.decode('ascii', 'ignore')
1768 return detect_exe_version(out, version_re, unrecognized)
1769
1770
1771def detect_exe_version(output, version_re=None, unrecognized='present'):
1772 assert isinstance(output, compat_str)
1773 if version_re is None:
1774 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1775 m = re.search(version_re, output)
95807118
PH
1776 if m:
1777 return m.group(1)
1778 else:
1779 return unrecognized
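# Illustrative usage of detect_exe_version() (added example; the default
# regex looks for 'version <number>' in the program output):
#   detect_exe_version('ffmpeg version 2.8.6 Copyright (c) 2000-2016')
#       -> '2.8.6'
#   detect_exe_version('some unrecognized output')  -> 'present'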
1780
1781
b7ab0590 1782class PagedList(object):
dd26ced1
PH
1783 def __len__(self):
1784 # This is only useful for tests
1785 return len(self.getslice())
1786
9c44d242
PH
1787
1788class OnDemandPagedList(PagedList):
b95dc034 1789 def __init__(self, pagefunc, pagesize, use_cache=False):
9c44d242
PH
1790 self._pagefunc = pagefunc
1791 self._pagesize = pagesize
b95dc034
YCH
1792 self._use_cache = use_cache
1793 if use_cache:
1794 self._cache = {}
9c44d242 1795
b7ab0590
PH
1796 def getslice(self, start=0, end=None):
1797 res = []
1798 for pagenum in itertools.count(start // self._pagesize):
1799 firstid = pagenum * self._pagesize
1800 nextfirstid = pagenum * self._pagesize + self._pagesize
1801 if start >= nextfirstid:
1802 continue
1803
b95dc034
YCH
1804 page_results = None
1805 if self._use_cache:
1806 page_results = self._cache.get(pagenum)
1807 if page_results is None:
1808 page_results = list(self._pagefunc(pagenum))
1809 if self._use_cache:
1810 self._cache[pagenum] = page_results
b7ab0590
PH
1811
1812 startv = (
1813 start % self._pagesize
1814 if firstid <= start < nextfirstid
1815 else 0)
1816
1817 endv = (
1818 ((end - 1) % self._pagesize) + 1
1819 if (end is not None and firstid <= end <= nextfirstid)
1820 else None)
1821
1822 if startv != 0 or endv is not None:
1823 page_results = page_results[startv:endv]
1824 res.extend(page_results)
1825
1826 # A little optimization - if the current page is not "full", i.e. does
1827 # not contain page_size videos, then we can assume that this page
1828 # is the last one - there are no more ids on further pages,
1829 # so there is no need to query again.
1830 if len(page_results) + startv < self._pagesize:
1831 break
1832
1833 # If we got the whole page, but the next page is not interesting,
1834 # break out early as well
1835 if end == nextfirstid:
1836 break
1837 return res
81c2f20b
PH
1838
1839
9c44d242
PH
1840class InAdvancePagedList(PagedList):
1841 def __init__(self, pagefunc, pagecount, pagesize):
1842 self._pagefunc = pagefunc
1843 self._pagecount = pagecount
1844 self._pagesize = pagesize
1845
1846 def getslice(self, start=0, end=None):
1847 res = []
1848 start_page = start // self._pagesize
1849 end_page = (
1850 self._pagecount if end is None else (end // self._pagesize + 1))
1851 skip_elems = start - start_page * self._pagesize
1852 only_more = None if end is None else end - start
1853 for pagenum in range(start_page, end_page):
1854 page = list(self._pagefunc(pagenum))
1855 if skip_elems:
1856 page = page[skip_elems:]
1857 skip_elems = None
1858 if only_more is not None:
1859 if len(page) < only_more:
1860 only_more -= len(page)
1861 else:
1862 page = page[:only_more]
1863 res.extend(page)
1864 break
1865 res.extend(page)
1866 return res
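# Illustrative usage of the paged list helpers (added example with a
# hypothetical page function returning 10 items per page):
#   pl = OnDemandPagedList(lambda n: range(n * 10, (n + 1) * 10), 10)
#   pl.getslice(5, 15)  -> [5, 6, 7, 8, 9, 10, 11, 12, 13, 14]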
1867
1868
81c2f20b 1869def uppercase_escape(s):
676eb3f2 1870 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 1871 return re.sub(
a612753d 1872 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
1873 lambda m: unicode_escape(m.group(0))[0],
1874 s)
0fe2ff78
YCH
1875
1876
1877def lowercase_escape(s):
1878 unicode_escape = codecs.getdecoder('unicode_escape')
1879 return re.sub(
1880 r'\\u[0-9a-fA-F]{4}',
1881 lambda m: unicode_escape(m.group(0))[0],
1882 s)
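# Illustrative usage (added examples): decode literal \uXXXX / \UXXXXXXXX
# escape sequences that show up in scraped JavaScript/JSON fragments:
#   lowercase_escape('Caf\\u00e9')    -> 'Café'
#   uppercase_escape('\\U0001F600')   -> '😀'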
b53466e1 1883
d05cfe06
S
1884
1885def escape_rfc3986(s):
1886 """Escape non-ASCII characters as suggested by RFC 3986"""
8f9312c3 1887 if sys.version_info < (3, 0) and isinstance(s, compat_str):
d05cfe06 1888 s = s.encode('utf-8')
ecc0c5ee 1889 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1890
1891
1892def escape_url(url):
1893 """Escape URL as suggested by RFC 3986"""
1894 url_parsed = compat_urllib_parse_urlparse(url)
1895 return url_parsed._replace(
efbed08d 1896 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
1897 path=escape_rfc3986(url_parsed.path),
1898 params=escape_rfc3986(url_parsed.params),
1899 query=escape_rfc3986(url_parsed.query),
1900 fragment=escape_rfc3986(url_parsed.fragment)
1901 ).geturl()
1902
62e609ab
PH
1903
1904def read_batch_urls(batch_fd):
1905 def fixup(url):
1906 if not isinstance(url, compat_str):
1907 url = url.decode('utf-8', 'replace')
28e614de 1908 BOM_UTF8 = '\xef\xbb\xbf'
62e609ab
PH
1909 if url.startswith(BOM_UTF8):
1910 url = url[len(BOM_UTF8):]
1911 url = url.strip()
1912 if url.startswith(('#', ';', ']')):
1913 return False
1914 return url
1915
1916 with contextlib.closing(batch_fd) as fd:
1917 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1918
1919
1920def urlencode_postdata(*args, **kargs):
15707c7e 1921 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
1922
1923
38f9ef31 1924def update_url_query(url, query):
cacd9966
YCH
1925 if not query:
1926 return url
38f9ef31 1927 parsed_url = compat_urlparse.urlparse(url)
1928 qs = compat_parse_qs(parsed_url.query)
1929 qs.update(query)
1930 return compat_urlparse.urlunparse(parsed_url._replace(
15707c7e 1931 query=compat_urllib_parse_urlencode(qs, True)))
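# Illustrative usage of update_url_query() (added example; parameter order in
# the result may vary because the query string is parsed into a dict):
#   update_url_query('http://example.com/path?a=1', {'b': '2'})
#       -> 'http://example.com/path?a=1&b=2'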
16392824 1932
8e60dc75 1933
ed0291d1
S
1934def update_Request(req, url=None, data=None, headers={}, query={}):
1935 req_headers = req.headers.copy()
1936 req_headers.update(headers)
1937 req_data = data or req.data
1938 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
1939 req_get_method = req.get_method()
1940 if req_get_method == 'HEAD':
1941 req_type = HEADRequest
1942 elif req_get_method == 'PUT':
1943 req_type = PUTRequest
1944 else:
1945 req_type = compat_urllib_request.Request
ed0291d1
S
1946 new_req = req_type(
1947 req_url, data=req_data, headers=req_headers,
1948 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
1949 if hasattr(req, 'timeout'):
1950 new_req.timeout = req.timeout
1951 return new_req
1952
1953
86296ad2 1954def dict_get(d, key_or_keys, default=None, skip_false_values=True):
cbecc9b9
S
1955 if isinstance(key_or_keys, (list, tuple)):
1956 for key in key_or_keys:
86296ad2
S
1957 if key not in d or d[key] is None or skip_false_values and not d[key]:
1958 continue
1959 return d[key]
cbecc9b9
S
1960 return default
1961 return d.get(key_or_keys, default)
1962
1963
329ca3be
S
1964def try_get(src, getter, expected_type=None):
1965 try:
1966 v = getter(src)
1967 except (AttributeError, KeyError, TypeError, IndexError):
1968 pass
1969 else:
1970 if expected_type is None or isinstance(v, expected_type):
1971 return v
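# Illustrative usage (added examples):
#   dict_get({'a': None, 'b': '', 'c': 42}, ('a', 'b', 'c'))   -> 42
#   try_get({'x': [1, 2]}, lambda d: d['x'][0], int)           -> 1
#   try_get({}, lambda d: d['missing'])                        -> None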
1972
1973
8e60dc75
S
1974def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
1975 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
1976
16392824 1977
a1a530b0
PH
1978US_RATINGS = {
1979 'G': 0,
1980 'PG': 10,
1981 'PG-13': 13,
1982 'R': 16,
1983 'NC': 18,
1984}
fac55558
PH
1985
1986
146c80e2
S
1987def parse_age_limit(s):
1988 if s is None:
d838b1bd 1989 return None
146c80e2 1990 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
d800609c 1991 return int(m.group('age')) if m else US_RATINGS.get(s)
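# Illustrative usage of parse_age_limit() (added examples):
#   parse_age_limit('18+')     -> 18
#   parse_age_limit('PG-13')   -> 13  (via US_RATINGS)
#   parse_age_limit('X')       -> None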
146c80e2
S
1992
1993
fac55558 1994def strip_jsonp(code):
609a61e3 1995 return re.sub(
5950cb1d 1996 r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
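# Illustrative usage of strip_jsonp() (added example):
#   strip_jsonp('callback({"status": "ok"});')  -> '{"status": "ok"}'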
478c2c61
PH
1997
1998
e05f6939
PH
1999def js_to_json(code):
2000 def fix_kv(m):
e7b6d122
PH
2001 v = m.group(0)
2002 if v in ('true', 'false', 'null'):
2003 return v
bd1e4844 2004 elif v.startswith('/*') or v == ',':
2005 return ""
2006
2007 if v[0] in ("'", '"'):
2008 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 2009 '"': '\\"',
bd1e4844 2010 "\\'": "'",
2011 '\\\n': '',
2012 '\\x': '\\u00',
2013 }.get(m.group(0), m.group(0)), v[1:-1])
2014
89ac4a19 2015 INTEGER_TABLE = (
cda6d47a
S
2016 (r'^0[xX][0-9a-fA-F]+', 16),
2017 (r'^0+[0-7]+', 8),
89ac4a19
S
2018 )
2019
2020 for regex, base in INTEGER_TABLE:
2021 im = re.match(regex, v)
2022 if im:
cda6d47a 2023 i = int(im.group(0), base)
89ac4a19
S
2024 return '"%d":' % i if v.endswith(':') else '%d' % i
2025
e7b6d122 2026 return '"%s"' % v
e05f6939 2027
bd1e4844 2028 return re.sub(r'''(?sx)
2029 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2030 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
2031 /\*.*?\*/|,(?=\s*[\]}])|
2032 [a-zA-Z_][.a-zA-Z_0-9]*|
47212f7b 2033 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
bd1e4844 2034 [0-9]+(?=\s*:)
e05f6939 2035 ''', fix_kv, code)
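# Illustrative usage of js_to_json() (added examples; the output is valid
# JSON that can be passed to json.loads()):
#   js_to_json("{abc: 'def', count: 0x10}")  -> '{"abc": "def", "count": 16}'
#   js_to_json('[1, 2, 3,]')                 -> '[1, 2, 3]'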
e05f6939
PH
2036
2037
478c2c61
PH
2038def qualities(quality_ids):
2039 """ Get a numeric quality value out of a list of possible values """
2040 def q(qid):
2041 try:
2042 return quality_ids.index(qid)
2043 except ValueError:
2044 return -1
2045 return q
2046
acd69589
PH
2047
2048DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 2049
a020a0dc
PH
2050
2051def limit_length(s, length):
2052 """ Add ellipses to overly long strings """
2053 if s is None:
2054 return None
2055 ELLIPSES = '...'
2056 if len(s) > length:
2057 return s[:length - len(ELLIPSES)] + ELLIPSES
2058 return s
48844745
PH
2059
2060
2061def version_tuple(v):
5f9b8394 2062 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
2063
2064
2065def is_outdated_version(version, limit, assume_new=True):
2066 if not version:
2067 return not assume_new
2068 try:
2069 return version_tuple(version) < version_tuple(limit)
2070 except ValueError:
2071 return not assume_new
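# Illustrative usage (added examples):
#   version_tuple('2016.06.19.1')                    -> (2016, 6, 19, 1)
#   is_outdated_version('2016.01.01', '2016.06.19')  -> True
#   is_outdated_version('', '2016.06.19')            -> False  (assume_new)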
732ea2f0
PH
2072
2073
2074def ytdl_is_updateable():
2075 """ Returns if youtube-dl can be updated with -U """
2076 from zipimport import zipimporter
2077
2078 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
7d4111ed
PH
2079
2080
2081def args_to_str(args):
2082 # Get a short string representation for a subprocess command
702ccf2d 2083 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
2084
2085
9b9c5355 2086def error_to_compat_str(err):
fdae2358
S
2087 err_str = str(err)
2088 # On Python 2, error byte strings must be decoded with the proper
2089 # encoding rather than ascii
2090 if sys.version_info[0] < 3:
2091 err_str = err_str.decode(preferredencoding())
2092 return err_str
2093
2094
c460bdd5 2095def mimetype2ext(mt):
eb9ee194
S
2096 if mt is None:
2097 return None
2098
765ac263
JMF
2099 ext = {
2100 'audio/mp4': 'm4a',
6c33d24b
YCH
2101 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2102 # it's the most popular one
2103 'audio/mpeg': 'mp3',
765ac263
JMF
2104 }.get(mt)
2105 if ext is not None:
2106 return ext
2107
c460bdd5 2108 _, _, res = mt.rpartition('/')
b4173f15 2109 res = res.lower()
c460bdd5
PH
2110
2111 return {
f6861ec9 2112 '3gpp': '3gp',
cafcf657 2113 'smptett+xml': 'tt',
2114 'srt': 'srt',
2115 'ttaf+xml': 'dfxp',
a0d8d704 2116 'ttml+xml': 'ttml',
cafcf657 2117 'vtt': 'vtt',
f6861ec9 2118 'x-flv': 'flv',
a0d8d704
YCH
2119 'x-mp4-fragmented': 'mp4',
2120 'x-ms-wmv': 'wmv',
b4173f15
RA
2121 'mpegurl': 'm3u8',
2122 'x-mpegurl': 'm3u8',
2123 'vnd.apple.mpegurl': 'm3u8',
2124 'dash+xml': 'mpd',
2125 'f4m': 'f4m',
2126 'f4m+xml': 'f4m',
f164b971 2127 'hds+xml': 'f4m',
e910fe2f 2128 'vnd.ms-sstr+xml': 'ism',
c460bdd5
PH
2129 }.get(res, res)
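# Illustrative usage of mimetype2ext() (added examples):
#   mimetype2ext('audio/mp4')               -> 'm4a'
#   mimetype2ext('application/x-mpegURL')   -> 'm3u8'
#   mimetype2ext('video/webm')              -> 'webm'  (fallback: subtype)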
2130
2131
4f3c5e06 2132def parse_codecs(codecs_str):
2133 # http://tools.ietf.org/html/rfc6381
2134 if not codecs_str:
2135 return {}
2136 split_codecs = list(filter(None, map(
2137 lambda c: c.strip(), codecs_str.strip().strip(',').split(','))))
2138 vcodec, acodec = None, None
2139 for full_codec in split_codecs:
2140 codec = full_codec.split('.')[0]
2141 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
2142 if not vcodec:
2143 vcodec = full_codec
2144 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac'):
2145 if not acodec:
2146 acodec = full_codec
2147 else:
2148 write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
2149 if not vcodec and not acodec:
2150 if len(split_codecs) == 2:
2151 return {
2152 'vcodec': vcodec,
2153 'acodec': acodec,
2154 }
2155 elif len(split_codecs) == 1:
2156 return {
2157 'vcodec': 'none',
2158 'acodec': vcodec,
2159 }
2160 else:
2161 return {
2162 'vcodec': vcodec or 'none',
2163 'acodec': acodec or 'none',
2164 }
2165 return {}
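# Illustrative usage of parse_codecs() (added examples):
#   parse_codecs('avc1.64001f, mp4a.40.2')
#       -> {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2'}
#   parse_codecs('vp9')  -> {'vcodec': 'vp9', 'acodec': 'none'}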
2166
2167
2ccd1b10 2168def urlhandle_detect_ext(url_handle):
79298173 2169 getheader = url_handle.headers.get
2ccd1b10 2170
b55ee18f
PH
2171 cd = getheader('Content-Disposition')
2172 if cd:
2173 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2174 if m:
2175 e = determine_ext(m.group('filename'), default_ext=None)
2176 if e:
2177 return e
2178
c460bdd5 2179 return mimetype2ext(getheader('Content-Type'))
05900629
PH
2180
2181
1e399778
YCH
2182def encode_data_uri(data, mime_type):
2183 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
2184
2185
05900629 2186def age_restricted(content_limit, age_limit):
6ec6cb4e 2187 """ Returns True iff the content should be blocked """
05900629
PH
2188
2189 if age_limit is None: # No limit set
2190 return False
2191 if content_limit is None:
2192 return False # Content available for everyone
2193 return age_limit < content_limit
61ca9a80
PH
2194
2195
2196def is_html(first_bytes):
2197 """ Detect whether a file contains HTML by examining its first bytes. """
2198
2199 BOMS = [
2200 (b'\xef\xbb\xbf', 'utf-8'),
2201 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2202 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2203 (b'\xff\xfe', 'utf-16-le'),
2204 (b'\xfe\xff', 'utf-16-be'),
2205 ]
2206 for bom, enc in BOMS:
2207 if first_bytes.startswith(bom):
2208 s = first_bytes[len(bom):].decode(enc, 'replace')
2209 break
2210 else:
2211 s = first_bytes.decode('utf-8', 'replace')
2212
2213 return re.match(r'^\s*<', s)
a055469f
PH
2214
2215
2216def determine_protocol(info_dict):
2217 protocol = info_dict.get('protocol')
2218 if protocol is not None:
2219 return protocol
2220
2221 url = info_dict['url']
2222 if url.startswith('rtmp'):
2223 return 'rtmp'
2224 elif url.startswith('mms'):
2225 return 'mms'
2226 elif url.startswith('rtsp'):
2227 return 'rtsp'
2228
2229 ext = determine_ext(url)
2230 if ext == 'm3u8':
2231 return 'm3u8'
2232 elif ext == 'f4m':
2233 return 'f4m'
2234
2235 return compat_urllib_parse_urlparse(url).scheme
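# Illustrative usage of determine_protocol() (added examples; the m3u8 case
# assumes determine_ext() extracts the extension from the URL path):
#   determine_protocol({'url': 'rtmp://example.com/live'})        -> 'rtmp'
#   determine_protocol({'url': 'http://example.com/index.m3u8'})  -> 'm3u8'
#   determine_protocol({'url': 'https://example.com/video.mp4'})  -> 'https'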
cfb56d1a
PH
2236
2237
2238def render_table(header_row, data):
2239 """ Render a list of rows, each as a list of values """
2240 table = [header_row] + data
2241 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
2242 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
2243 return '\n'.join(format_str % tuple(row) for row in table)
347de493
PH
2244
2245
2246def _match_one(filter_part, dct):
2247 COMPARISON_OPERATORS = {
2248 '<': operator.lt,
2249 '<=': operator.le,
2250 '>': operator.gt,
2251 '>=': operator.ge,
2252 '=': operator.eq,
2253 '!=': operator.ne,
2254 }
2255 operator_rex = re.compile(r'''(?x)\s*
2256 (?P<key>[a-z_]+)
2257 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2258 (?:
2259 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2260 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2261 )
2262 \s*$
2263 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2264 m = operator_rex.search(filter_part)
2265 if m:
2266 op = COMPARISON_OPERATORS[m.group('op')]
2267 if m.group('strval') is not None:
2268 if m.group('op') not in ('=', '!='):
2269 raise ValueError(
2270 'Operator %s does not support string values!' % m.group('op'))
2271 comparison_value = m.group('strval')
2272 else:
2273 try:
2274 comparison_value = int(m.group('intval'))
2275 except ValueError:
2276 comparison_value = parse_filesize(m.group('intval'))
2277 if comparison_value is None:
2278 comparison_value = parse_filesize(m.group('intval') + 'B')
2279 if comparison_value is None:
2280 raise ValueError(
2281 'Invalid integer value %r in filter part %r' % (
2282 m.group('intval'), filter_part))
2283 actual_value = dct.get(m.group('key'))
2284 if actual_value is None:
2285 return m.group('none_inclusive')
2286 return op(actual_value, comparison_value)
2287
2288 UNARY_OPERATORS = {
2289 '': lambda v: v is not None,
2290 '!': lambda v: v is None,
2291 }
2292 operator_rex = re.compile(r'''(?x)\s*
2293 (?P<op>%s)\s*(?P<key>[a-z_]+)
2294 \s*$
2295 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2296 m = operator_rex.search(filter_part)
2297 if m:
2298 op = UNARY_OPERATORS[m.group('op')]
2299 actual_value = dct.get(m.group('key'))
2300 return op(actual_value)
2301
2302 raise ValueError('Invalid filter part %r' % filter_part)
2303
2304
2305def match_str(filter_str, dct):
2306 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2307
2308 return all(
2309 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2310
2311
2312def match_filter_func(filter_str):
2313 def _match_func(info_dict):
2314 if match_str(filter_str, info_dict):
2315 return None
2316 else:
2317 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2318 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2319 return _match_func
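# Illustrative usage of the match filter helpers (added examples):
#   match_str('like_count > 100 & dislike_count <? 50',
#             {'like_count': 190, 'dislike_count': 10})   -> True
#   match_str('duration < 30', {'duration': 60})          -> False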
91410c9b
PH
2320
2321
bf6427d2
YCH
2322def parse_dfxp_time_expr(time_expr):
2323 if not time_expr:
d631d5f9 2324 return
bf6427d2
YCH
2325
2326 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2327 if mobj:
2328 return float(mobj.group('time_offset'))
2329
db2fe38b 2330 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 2331 if mobj:
db2fe38b 2332 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
2333
2334
c1c924ab
YCH
2335def srt_subtitles_timecode(seconds):
2336 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
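# Illustrative usage of the subtitle timing helpers (added examples):
#   parse_dfxp_time_expr('10.5s')       -> 10.5
#   parse_dfxp_time_expr('00:01:30.5')  -> 90.5
#   srt_subtitles_timecode(3661.5)      -> '01:01:01,500'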
bf6427d2
YCH
2337
2338
2339def dfxp2srt(dfxp_data):
4e335771
YCH
2340 _x = functools.partial(xpath_with_ns, ns_map={
2341 'ttml': 'http://www.w3.org/ns/ttml',
2342 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
5bf28d78 2343 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
4e335771 2344 })
bf6427d2 2345
87de7069 2346 class TTMLPElementParser(object):
2b14cb56 2347 out = ''
bf6427d2 2348
2b14cb56 2349 def start(self, tag, attrib):
2350 if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2351 self.out += '\n'
bf6427d2 2352
2b14cb56 2353 def end(self, tag):
2354 pass
bf6427d2 2355
2b14cb56 2356 def data(self, data):
2357 self.out += data
2358
2359 def close(self):
2360 return self.out.strip()
2361
2362 def parse_node(node):
2363 target = TTMLPElementParser()
2364 parser = xml.etree.ElementTree.XMLParser(target=target)
2365 parser.feed(xml.etree.ElementTree.tostring(node))
2366 return parser.close()
bf6427d2 2367
36e6f62c 2368 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
bf6427d2 2369 out = []
5bf28d78 2370 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
1b0427e6
YCH
2371
2372 if not paras:
2373 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2
YCH
2374
2375 for para, index in zip(paras, itertools.count(1)):
d631d5f9 2376 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 2377 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
2378 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2379 if begin_time is None:
2380 continue
7dff0363 2381 if not end_time:
d631d5f9
YCH
2382 if not dur:
2383 continue
2384 end_time = begin_time + dur
bf6427d2
YCH
2385 out.append('%d\n%s --> %s\n%s\n\n' % (
2386 index,
c1c924ab
YCH
2387 srt_subtitles_timecode(begin_time),
2388 srt_subtitles_timecode(end_time),
bf6427d2
YCH
2389 parse_node(para)))
2390
2391 return ''.join(out)
2392
2393
66e289ba
S
2394def cli_option(params, command_option, param):
2395 param = params.get(param)
2396 return [command_option, param] if param is not None else []
2397
2398
2399def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2400 param = params.get(param)
2401 assert isinstance(param, bool)
2402 if separator:
2403 return [command_option + separator + (true_value if param else false_value)]
2404 return [command_option, true_value if param else false_value]
2405
2406
2407def cli_valueless_option(params, command_option, param, expected_value=True):
2408 param = params.get(param)
2409 return [command_option] if param == expected_value else []
2410
2411
2412def cli_configuration_args(params, param, default=[]):
2413 ex_args = params.get(param)
2414 if ex_args is None:
2415 return default
2416 assert isinstance(ex_args, list)
2417 return ex_args
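# Illustrative usage of the cli_* helpers (added examples with sample
# option/parameter names chosen for illustration):
#   cli_option({'proxy': 'socks5://127.0.0.1:1080'}, '--proxy', 'proxy')
#       -> ['--proxy', 'socks5://127.0.0.1:1080']
#   cli_bool_option({'nocheckcertificate': True},
#                   '--no-check-certificate', 'nocheckcertificate')
#       -> ['--no-check-certificate', 'true']
#   cli_valueless_option({'nopart': True}, '--no-part', 'nopart')
#       -> ['--no-part']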
2418
2419
39672624
YCH
2420class ISO639Utils(object):
2421 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2422 _lang_map = {
2423 'aa': 'aar',
2424 'ab': 'abk',
2425 'ae': 'ave',
2426 'af': 'afr',
2427 'ak': 'aka',
2428 'am': 'amh',
2429 'an': 'arg',
2430 'ar': 'ara',
2431 'as': 'asm',
2432 'av': 'ava',
2433 'ay': 'aym',
2434 'az': 'aze',
2435 'ba': 'bak',
2436 'be': 'bel',
2437 'bg': 'bul',
2438 'bh': 'bih',
2439 'bi': 'bis',
2440 'bm': 'bam',
2441 'bn': 'ben',
2442 'bo': 'bod',
2443 'br': 'bre',
2444 'bs': 'bos',
2445 'ca': 'cat',
2446 'ce': 'che',
2447 'ch': 'cha',
2448 'co': 'cos',
2449 'cr': 'cre',
2450 'cs': 'ces',
2451 'cu': 'chu',
2452 'cv': 'chv',
2453 'cy': 'cym',
2454 'da': 'dan',
2455 'de': 'deu',
2456 'dv': 'div',
2457 'dz': 'dzo',
2458 'ee': 'ewe',
2459 'el': 'ell',
2460 'en': 'eng',
2461 'eo': 'epo',
2462 'es': 'spa',
2463 'et': 'est',
2464 'eu': 'eus',
2465 'fa': 'fas',
2466 'ff': 'ful',
2467 'fi': 'fin',
2468 'fj': 'fij',
2469 'fo': 'fao',
2470 'fr': 'fra',
2471 'fy': 'fry',
2472 'ga': 'gle',
2473 'gd': 'gla',
2474 'gl': 'glg',
2475 'gn': 'grn',
2476 'gu': 'guj',
2477 'gv': 'glv',
2478 'ha': 'hau',
2479 'he': 'heb',
2480 'hi': 'hin',
2481 'ho': 'hmo',
2482 'hr': 'hrv',
2483 'ht': 'hat',
2484 'hu': 'hun',
2485 'hy': 'hye',
2486 'hz': 'her',
2487 'ia': 'ina',
2488 'id': 'ind',
2489 'ie': 'ile',
2490 'ig': 'ibo',
2491 'ii': 'iii',
2492 'ik': 'ipk',
2493 'io': 'ido',
2494 'is': 'isl',
2495 'it': 'ita',
2496 'iu': 'iku',
2497 'ja': 'jpn',
2498 'jv': 'jav',
2499 'ka': 'kat',
2500 'kg': 'kon',
2501 'ki': 'kik',
2502 'kj': 'kua',
2503 'kk': 'kaz',
2504 'kl': 'kal',
2505 'km': 'khm',
2506 'kn': 'kan',
2507 'ko': 'kor',
2508 'kr': 'kau',
2509 'ks': 'kas',
2510 'ku': 'kur',
2511 'kv': 'kom',
2512 'kw': 'cor',
2513 'ky': 'kir',
2514 'la': 'lat',
2515 'lb': 'ltz',
2516 'lg': 'lug',
2517 'li': 'lim',
2518 'ln': 'lin',
2519 'lo': 'lao',
2520 'lt': 'lit',
2521 'lu': 'lub',
2522 'lv': 'lav',
2523 'mg': 'mlg',
2524 'mh': 'mah',
2525 'mi': 'mri',
2526 'mk': 'mkd',
2527 'ml': 'mal',
2528 'mn': 'mon',
2529 'mr': 'mar',
2530 'ms': 'msa',
2531 'mt': 'mlt',
2532 'my': 'mya',
2533 'na': 'nau',
2534 'nb': 'nob',
2535 'nd': 'nde',
2536 'ne': 'nep',
2537 'ng': 'ndo',
2538 'nl': 'nld',
2539 'nn': 'nno',
2540 'no': 'nor',
2541 'nr': 'nbl',
2542 'nv': 'nav',
2543 'ny': 'nya',
2544 'oc': 'oci',
2545 'oj': 'oji',
2546 'om': 'orm',
2547 'or': 'ori',
2548 'os': 'oss',
2549 'pa': 'pan',
2550 'pi': 'pli',
2551 'pl': 'pol',
2552 'ps': 'pus',
2553 'pt': 'por',
2554 'qu': 'que',
2555 'rm': 'roh',
2556 'rn': 'run',
2557 'ro': 'ron',
2558 'ru': 'rus',
2559 'rw': 'kin',
2560 'sa': 'san',
2561 'sc': 'srd',
2562 'sd': 'snd',
2563 'se': 'sme',
2564 'sg': 'sag',
2565 'si': 'sin',
2566 'sk': 'slk',
2567 'sl': 'slv',
2568 'sm': 'smo',
2569 'sn': 'sna',
2570 'so': 'som',
2571 'sq': 'sqi',
2572 'sr': 'srp',
2573 'ss': 'ssw',
2574 'st': 'sot',
2575 'su': 'sun',
2576 'sv': 'swe',
2577 'sw': 'swa',
2578 'ta': 'tam',
2579 'te': 'tel',
2580 'tg': 'tgk',
2581 'th': 'tha',
2582 'ti': 'tir',
2583 'tk': 'tuk',
2584 'tl': 'tgl',
2585 'tn': 'tsn',
2586 'to': 'ton',
2587 'tr': 'tur',
2588 'ts': 'tso',
2589 'tt': 'tat',
2590 'tw': 'twi',
2591 'ty': 'tah',
2592 'ug': 'uig',
2593 'uk': 'ukr',
2594 'ur': 'urd',
2595 'uz': 'uzb',
2596 've': 'ven',
2597 'vi': 'vie',
2598 'vo': 'vol',
2599 'wa': 'wln',
2600 'wo': 'wol',
2601 'xh': 'xho',
2602 'yi': 'yid',
2603 'yo': 'yor',
2604 'za': 'zha',
2605 'zh': 'zho',
2606 'zu': 'zul',
2607 }
2608
2609 @classmethod
2610 def short2long(cls, code):
2611 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2612 return cls._lang_map.get(code[:2])
2613
2614 @classmethod
2615 def long2short(cls, code):
2616 """Convert language code from ISO 639-2/T to ISO 639-1"""
2617 for short_name, long_name in cls._lang_map.items():
2618 if long_name == code:
2619 return short_name
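# Illustrative usage (added examples):
#   ISO639Utils.short2long('en')   -> 'eng'
#   ISO639Utils.long2short('deu')  -> 'de'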
2620
2621
4eb10f66
YCH
2622class ISO3166Utils(object):
2623 # From http://data.okfn.org/data/core/country-list
2624 _country_map = {
2625 'AF': 'Afghanistan',
2626 'AX': 'Åland Islands',
2627 'AL': 'Albania',
2628 'DZ': 'Algeria',
2629 'AS': 'American Samoa',
2630 'AD': 'Andorra',
2631 'AO': 'Angola',
2632 'AI': 'Anguilla',
2633 'AQ': 'Antarctica',
2634 'AG': 'Antigua and Barbuda',
2635 'AR': 'Argentina',
2636 'AM': 'Armenia',
2637 'AW': 'Aruba',
2638 'AU': 'Australia',
2639 'AT': 'Austria',
2640 'AZ': 'Azerbaijan',
2641 'BS': 'Bahamas',
2642 'BH': 'Bahrain',
2643 'BD': 'Bangladesh',
2644 'BB': 'Barbados',
2645 'BY': 'Belarus',
2646 'BE': 'Belgium',
2647 'BZ': 'Belize',
2648 'BJ': 'Benin',
2649 'BM': 'Bermuda',
2650 'BT': 'Bhutan',
2651 'BO': 'Bolivia, Plurinational State of',
2652 'BQ': 'Bonaire, Sint Eustatius and Saba',
2653 'BA': 'Bosnia and Herzegovina',
2654 'BW': 'Botswana',
2655 'BV': 'Bouvet Island',
2656 'BR': 'Brazil',
2657 'IO': 'British Indian Ocean Territory',
2658 'BN': 'Brunei Darussalam',
2659 'BG': 'Bulgaria',
2660 'BF': 'Burkina Faso',
2661 'BI': 'Burundi',
2662 'KH': 'Cambodia',
2663 'CM': 'Cameroon',
2664 'CA': 'Canada',
2665 'CV': 'Cape Verde',
2666 'KY': 'Cayman Islands',
2667 'CF': 'Central African Republic',
2668 'TD': 'Chad',
2669 'CL': 'Chile',
2670 'CN': 'China',
2671 'CX': 'Christmas Island',
2672 'CC': 'Cocos (Keeling) Islands',
2673 'CO': 'Colombia',
2674 'KM': 'Comoros',
2675 'CG': 'Congo',
2676 'CD': 'Congo, the Democratic Republic of the',
2677 'CK': 'Cook Islands',
2678 'CR': 'Costa Rica',
2679 'CI': 'Côte d\'Ivoire',
2680 'HR': 'Croatia',
2681 'CU': 'Cuba',
2682 'CW': 'Curaçao',
2683 'CY': 'Cyprus',
2684 'CZ': 'Czech Republic',
2685 'DK': 'Denmark',
2686 'DJ': 'Djibouti',
2687 'DM': 'Dominica',
2688 'DO': 'Dominican Republic',
2689 'EC': 'Ecuador',
2690 'EG': 'Egypt',
2691 'SV': 'El Salvador',
2692 'GQ': 'Equatorial Guinea',
2693 'ER': 'Eritrea',
2694 'EE': 'Estonia',
2695 'ET': 'Ethiopia',
2696 'FK': 'Falkland Islands (Malvinas)',
2697 'FO': 'Faroe Islands',
2698 'FJ': 'Fiji',
2699 'FI': 'Finland',
2700 'FR': 'France',
2701 'GF': 'French Guiana',
2702 'PF': 'French Polynesia',
2703 'TF': 'French Southern Territories',
2704 'GA': 'Gabon',
2705 'GM': 'Gambia',
2706 'GE': 'Georgia',
2707 'DE': 'Germany',
2708 'GH': 'Ghana',
2709 'GI': 'Gibraltar',
2710 'GR': 'Greece',
2711 'GL': 'Greenland',
2712 'GD': 'Grenada',
2713 'GP': 'Guadeloupe',
2714 'GU': 'Guam',
2715 'GT': 'Guatemala',
2716 'GG': 'Guernsey',
2717 'GN': 'Guinea',
2718 'GW': 'Guinea-Bissau',
2719 'GY': 'Guyana',
2720 'HT': 'Haiti',
2721 'HM': 'Heard Island and McDonald Islands',
2722 'VA': 'Holy See (Vatican City State)',
2723 'HN': 'Honduras',
2724 'HK': 'Hong Kong',
2725 'HU': 'Hungary',
2726 'IS': 'Iceland',
2727 'IN': 'India',
2728 'ID': 'Indonesia',
2729 'IR': 'Iran, Islamic Republic of',
2730 'IQ': 'Iraq',
2731 'IE': 'Ireland',
2732 'IM': 'Isle of Man',
2733 'IL': 'Israel',
2734 'IT': 'Italy',
2735 'JM': 'Jamaica',
2736 'JP': 'Japan',
2737 'JE': 'Jersey',
2738 'JO': 'Jordan',
2739 'KZ': 'Kazakhstan',
2740 'KE': 'Kenya',
2741 'KI': 'Kiribati',
2742 'KP': 'Korea, Democratic People\'s Republic of',
2743 'KR': 'Korea, Republic of',
2744 'KW': 'Kuwait',
2745 'KG': 'Kyrgyzstan',
2746 'LA': 'Lao People\'s Democratic Republic',
2747 'LV': 'Latvia',
2748 'LB': 'Lebanon',
2749 'LS': 'Lesotho',
2750 'LR': 'Liberia',
2751 'LY': 'Libya',
2752 'LI': 'Liechtenstein',
2753 'LT': 'Lithuania',
2754 'LU': 'Luxembourg',
2755 'MO': 'Macao',
2756 'MK': 'Macedonia, the Former Yugoslav Republic of',
2757 'MG': 'Madagascar',
2758 'MW': 'Malawi',
2759 'MY': 'Malaysia',
2760 'MV': 'Maldives',
2761 'ML': 'Mali',
2762 'MT': 'Malta',
2763 'MH': 'Marshall Islands',
2764 'MQ': 'Martinique',
2765 'MR': 'Mauritania',
2766 'MU': 'Mauritius',
2767 'YT': 'Mayotte',
2768 'MX': 'Mexico',
2769 'FM': 'Micronesia, Federated States of',
2770 'MD': 'Moldova, Republic of',
2771 'MC': 'Monaco',
2772 'MN': 'Mongolia',
2773 'ME': 'Montenegro',
2774 'MS': 'Montserrat',
2775 'MA': 'Morocco',
2776 'MZ': 'Mozambique',
2777 'MM': 'Myanmar',
2778 'NA': 'Namibia',
2779 'NR': 'Nauru',
2780 'NP': 'Nepal',
2781 'NL': 'Netherlands',
2782 'NC': 'New Caledonia',
2783 'NZ': 'New Zealand',
2784 'NI': 'Nicaragua',
2785 'NE': 'Niger',
2786 'NG': 'Nigeria',
2787 'NU': 'Niue',
2788 'NF': 'Norfolk Island',
2789 'MP': 'Northern Mariana Islands',
2790 'NO': 'Norway',
2791 'OM': 'Oman',
2792 'PK': 'Pakistan',
2793 'PW': 'Palau',
2794 'PS': 'Palestine, State of',
2795 'PA': 'Panama',
2796 'PG': 'Papua New Guinea',
2797 'PY': 'Paraguay',
2798 'PE': 'Peru',
2799 'PH': 'Philippines',
2800 'PN': 'Pitcairn',
2801 'PL': 'Poland',
2802 'PT': 'Portugal',
2803 'PR': 'Puerto Rico',
2804 'QA': 'Qatar',
2805 'RE': 'Réunion',
2806 'RO': 'Romania',
2807 'RU': 'Russian Federation',
2808 'RW': 'Rwanda',
2809 'BL': 'Saint Barthélemy',
2810 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2811 'KN': 'Saint Kitts and Nevis',
2812 'LC': 'Saint Lucia',
2813 'MF': 'Saint Martin (French part)',
2814 'PM': 'Saint Pierre and Miquelon',
2815 'VC': 'Saint Vincent and the Grenadines',
2816 'WS': 'Samoa',
2817 'SM': 'San Marino',
2818 'ST': 'Sao Tome and Principe',
2819 'SA': 'Saudi Arabia',
2820 'SN': 'Senegal',
2821 'RS': 'Serbia',
2822 'SC': 'Seychelles',
2823 'SL': 'Sierra Leone',
2824 'SG': 'Singapore',
2825 'SX': 'Sint Maarten (Dutch part)',
2826 'SK': 'Slovakia',
2827 'SI': 'Slovenia',
2828 'SB': 'Solomon Islands',
2829 'SO': 'Somalia',
2830 'ZA': 'South Africa',
2831 'GS': 'South Georgia and the South Sandwich Islands',
2832 'SS': 'South Sudan',
2833 'ES': 'Spain',
2834 'LK': 'Sri Lanka',
2835 'SD': 'Sudan',
2836 'SR': 'Suriname',
2837 'SJ': 'Svalbard and Jan Mayen',
2838 'SZ': 'Swaziland',
2839 'SE': 'Sweden',
2840 'CH': 'Switzerland',
2841 'SY': 'Syrian Arab Republic',
2842 'TW': 'Taiwan, Province of China',
2843 'TJ': 'Tajikistan',
2844 'TZ': 'Tanzania, United Republic of',
2845 'TH': 'Thailand',
2846 'TL': 'Timor-Leste',
2847 'TG': 'Togo',
2848 'TK': 'Tokelau',
2849 'TO': 'Tonga',
2850 'TT': 'Trinidad and Tobago',
2851 'TN': 'Tunisia',
2852 'TR': 'Turkey',
2853 'TM': 'Turkmenistan',
2854 'TC': 'Turks and Caicos Islands',
2855 'TV': 'Tuvalu',
2856 'UG': 'Uganda',
2857 'UA': 'Ukraine',
2858 'AE': 'United Arab Emirates',
2859 'GB': 'United Kingdom',
2860 'US': 'United States',
2861 'UM': 'United States Minor Outlying Islands',
2862 'UY': 'Uruguay',
2863 'UZ': 'Uzbekistan',
2864 'VU': 'Vanuatu',
2865 'VE': 'Venezuela, Bolivarian Republic of',
2866 'VN': 'Viet Nam',
2867 'VG': 'Virgin Islands, British',
2868 'VI': 'Virgin Islands, U.S.',
2869 'WF': 'Wallis and Futuna',
2870 'EH': 'Western Sahara',
2871 'YE': 'Yemen',
2872 'ZM': 'Zambia',
2873 'ZW': 'Zimbabwe',
2874 }
2875
2876 @classmethod
2877 def short2full(cls, code):
2878 """Convert an ISO 3166-2 country code to the corresponding full name"""
2879 return cls._country_map.get(code.upper())
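# Illustrative usage (added examples):
#   ISO3166Utils.short2full('DE')  -> 'Germany'
#   ISO3166Utils.short2full('xx')  -> None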
2880
2881
91410c9b 2882class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2461f79d
PH
2883 def __init__(self, proxies=None):
2884 # Set default handlers
2885 for type in ('http', 'https'):
2886 setattr(self, '%s_open' % type,
2887 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
2888 meth(r, proxy, type))
2889 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2890
91410c9b 2891 def proxy_open(self, req, proxy, type):
2461f79d 2892 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
2893 if req_proxy is not None:
2894 proxy = req_proxy
2461f79d
PH
2895 del req.headers['Ytdl-request-proxy']
2896
2897 if proxy == '__noproxy__':
2898 return None # No Proxy
51fb4995 2899 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188
YCH
2900 req.add_header('Ytdl-socks-proxy', proxy)
2901 # youtube-dl's http/https handlers wrap the socket with SOCKS themselves
2902 return None
91410c9b
PH
2903 return compat_urllib_request.ProxyHandler.proxy_open(
2904 self, req, proxy, type)
5bc880b9
YCH
2905
2906
2907def ohdave_rsa_encrypt(data, exponent, modulus):
2908 '''
2909 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
2910
2911 Input:
2912 data: data to encrypt, bytes-like object
2913 exponent, modulus: parameters e and N of the RSA algorithm, both integers
2914 Output: hex string of encrypted data
2915
2916 Limitation: supports one block encryption only
2917 '''
2918
2919 payload = int(binascii.hexlify(data[::-1]), 16)
2920 encrypted = pow(payload, exponent, modulus)
2921 return '%x' % encrypted
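# Illustrative usage of ohdave_rsa_encrypt() (added toy example with tiny,
# insecure parameters, only to show the arithmetic):
#   payload = int(hexlify(b'\x02'[::-1]), 16) = 2
#   pow(2, 3, 101) = 8, so ohdave_rsa_encrypt(b'\x02', 3, 101) -> '8'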
81bdc8fd
YCH
2922
2923
5eb6bdce 2924def encode_base_n(num, n, table=None):
59f898b7 2925 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
59f898b7
YCH
2926 if not table:
2927 table = FULL_TABLE[:n]
2928
5eb6bdce
YCH
2929 if n > len(table):
2930 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
2931
2932 if num == 0:
2933 return table[0]
2934
81bdc8fd
YCH
2935 ret = ''
2936 while num:
2937 ret = table[num % n] + ret
2938 num = num // n
2939 return ret
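# Illustrative usage of encode_base_n() (added examples):
#   encode_base_n(255, 16)  -> 'ff'
#   encode_base_n(10, 2)    -> '1010'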
f52354a8
YCH
2940
2941
2942def decode_packed_codes(code):
2943 mobj = re.search(
680079be 2944 r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
f52354a8
YCH
2945 code)
2946 obfuscated_code, base, count, symbols = mobj.groups()
2947 base = int(base)
2948 count = int(count)
2949 symbols = symbols.split('|')
2950 symbol_table = {}
2951
2952 while count:
2953 count -= 1
5eb6bdce 2954 base_n_count = encode_base_n(count, base)
f52354a8
YCH
2955 symbol_table[base_n_count] = symbols[count] or base_n_count
2956
2957 return re.sub(
2958 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
2959 obfuscated_code)
e154c651 2960
2961
2962def parse_m3u8_attributes(attrib):
2963 info = {}
2964 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
2965 if val.startswith('"'):
2966 val = val[1:-1]
2967 info[key] = val
2968 return info
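# Illustrative usage of parse_m3u8_attributes() (added example):
#   parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.4d401e"')
#       -> {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.4d401e'}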
1143535d
YCH
2969
2970
2971def urshift(val, n):
2972 return val >> n if val >= 0 else (val + 0x100000000) >> n
d3f8e038
YCH
2973
2974
2975# Based on png2str() written by @gdkchan and improved by @yokrysty
2976# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
2977def decode_png(png_data):
2978 # Reference: https://www.w3.org/TR/PNG/
2979 header = png_data[8:]
2980
2981 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
2982 raise IOError('Not a valid PNG file.')
2983
2984 int_map = {1: '>B', 2: '>H', 4: '>I'}
2985 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
2986
2987 chunks = []
2988
2989 while header:
2990 length = unpack_integer(header[:4])
2991 header = header[4:]
2992
2993 chunk_type = header[:4]
2994 header = header[4:]
2995
2996 chunk_data = header[:length]
2997 header = header[length:]
2998
2999 header = header[4:] # Skip CRC
3000
3001 chunks.append({
3002 'type': chunk_type,
3003 'length': length,
3004 'data': chunk_data
3005 })
3006
3007 ihdr = chunks[0]['data']
3008
3009 width = unpack_integer(ihdr[:4])
3010 height = unpack_integer(ihdr[4:8])
3011
3012 idat = b''
3013
3014 for chunk in chunks:
3015 if chunk['type'] == b'IDAT':
3016 idat += chunk['data']
3017
3018 if not idat:
3019 raise IOError('Unable to read PNG data.')
3020
3021 decompressed_data = bytearray(zlib.decompress(idat))
3022
3023 stride = width * 3
3024 pixels = []
3025
3026 def _get_pixel(idx):
3027 x = idx % stride
3028 y = idx // stride
3029 return pixels[y][x]
3030
3031 for y in range(height):
3032 basePos = y * (1 + stride)
3033 filter_type = decompressed_data[basePos]
3034
3035 current_row = []
3036
3037 pixels.append(current_row)
3038
3039 for x in range(stride):
3040 color = decompressed_data[1 + basePos + x]
3041 basex = y * stride + x
3042 left = 0
3043 up = 0
3044
3045 if x > 2:
3046 left = _get_pixel(basex - 3)
3047 if y > 0:
3048 up = _get_pixel(basex - stride)
3049
3050 if filter_type == 1: # Sub
3051 color = (color + left) & 0xff
3052 elif filter_type == 2: # Up
3053 color = (color + up) & 0xff
3054 elif filter_type == 3: # Average
3055 color = (color + ((left + up) >> 1)) & 0xff
3056 elif filter_type == 4: # Paeth
3057 a = left
3058 b = up
3059 c = 0
3060
3061 if x > 2 and y > 0:
3062 c = _get_pixel(basex - stride - 3)
3063
3064 p = a + b - c
3065
3066 pa = abs(p - a)
3067 pb = abs(p - b)
3068 pc = abs(p - c)
3069
3070 if pa <= pb and pa <= pc:
3071 color = (color + a) & 0xff
3072 elif pb <= pc:
3073 color = (color + b) & 0xff
3074 else:
3075 color = (color + c) & 0xff
3076
3077 current_row.append(color)
3078
3079 return width, height, pixels