#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

import base64
import binascii
import calendar
import codecs
import contextlib
import ctypes
import datetime
import email.utils
import errno
import functools
import gzip
import io
import itertools
import json
import locale
import math
import operator
import os
import pipes
import platform
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import traceback
import xml.etree.ElementTree
import zlib

from .compat import (
    compat_HTMLParser,
    compat_basestring,
    compat_chr,
    compat_etree_fromstring,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_kwargs,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_quote,
    compat_socket_create_connection,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 87
5f6a1245 88
bf42a990
S
89NO_DEFAULT = object()
90
7105440c
YCH
91ENGLISH_MONTH_NAMES = [
92 'January', 'February', 'March', 'April', 'May', 'June',
93 'July', 'August', 'September', 'October', 'November', 'December']
94
f6717dec
S
95MONTH_NAMES = {
96 'en': ENGLISH_MONTH_NAMES,
97 'fr': [
3e4185c3
S
98 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
99 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
f6717dec 100}
a942d6cb 101
a7aaa398
S
102KNOWN_EXTENSIONS = (
103 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
104 'flv', 'f4v', 'f4a', 'f4b',
105 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
106 'mkv', 'mka', 'mk3d',
107 'avi', 'divx',
108 'mov',
109 'asf', 'wmv', 'wma',
110 '3gp', '3g2',
111 'mp3',
112 'flac',
113 'ape',
114 'wav',
115 'f4f', 'f4m', 'm3u8', 'smil')
116
c587cbb7 117# needed for sanitizing filenames in restricted mode
c8827027 118ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
119 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
120 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
c587cbb7 121
46f59e89
S
122DATE_FORMATS = (
123 '%d %B %Y',
124 '%d %b %Y',
125 '%B %d %Y',
126 '%b %d %Y',
127 '%b %dst %Y %I:%M',
128 '%b %dnd %Y %I:%M',
129 '%b %dth %Y %I:%M',
130 '%Y %m %d',
131 '%Y-%m-%d',
132 '%Y/%m/%d',
81c13222 133 '%Y/%m/%d %H:%M',
46f59e89
S
134 '%Y/%m/%d %H:%M:%S',
135 '%Y-%m-%d %H:%M:%S',
136 '%Y-%m-%d %H:%M:%S.%f',
137 '%d.%m.%Y %H:%M',
138 '%d.%m.%Y %H.%M',
139 '%Y-%m-%dT%H:%M:%SZ',
140 '%Y-%m-%dT%H:%M:%S.%fZ',
141 '%Y-%m-%dT%H:%M:%S.%f0Z',
142 '%Y-%m-%dT%H:%M:%S',
143 '%Y-%m-%dT%H:%M:%S.%f',
144 '%Y-%m-%dT%H:%M',
c6eed6b8
S
145 '%b %d %Y at %H:%M',
146 '%b %d %Y at %H:%M:%S',
46f59e89
S
147)
148
149DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
150DATE_FORMATS_DAY_FIRST.extend([
151 '%d-%m-%Y',
152 '%d.%m.%Y',
153 '%d.%m.%y',
154 '%d/%m/%Y',
155 '%d/%m/%y',
156 '%d/%m/%Y %H:%M:%S',
157])
158
159DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
160DATE_FORMATS_MONTH_FIRST.extend([
161 '%m-%d-%Y',
162 '%m.%d.%Y',
163 '%m/%d/%Y',
164 '%m/%d/%y',
165 '%m/%d/%Y %H:%M:%S',
166])
167
7105440c 168
d77c3dfd 169def preferredencoding():
59ae15a5 170 """Get preferred encoding.
d77c3dfd 171
59ae15a5
PH
172 Returns the best encoding scheme for the system, based on
173 locale.getpreferredencoding() and some further tweaks.
174 """
175 try:
176 pref = locale.getpreferredencoding()
28e614de 177 'TEST'.encode(pref)
70a1165b 178 except Exception:
59ae15a5 179 pref = 'UTF-8'
bae611f2 180
59ae15a5 181 return pref
d77c3dfd 182
f4bfd65f 183
181c8655 184def write_json_file(obj, fn):
1394646a 185 """ Encode obj as JSON and write it to fn, atomically if possible """
181c8655 186
92120217 187 fn = encodeFilename(fn)
61ee5aeb 188 if sys.version_info < (3, 0) and sys.platform != 'win32':
ec5f6016
JMF
189 encoding = get_filesystem_encoding()
190 # os.path.basename returns a bytes object, but NamedTemporaryFile
191 # will fail if the filename contains non ascii characters unless we
192 # use a unicode object
193 path_basename = lambda f: os.path.basename(fn).decode(encoding)
194 # the same for os.path.dirname
195 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
196 else:
197 path_basename = os.path.basename
198 path_dirname = os.path.dirname
199
73159f99
S
200 args = {
201 'suffix': '.tmp',
ec5f6016
JMF
202 'prefix': path_basename(fn) + '.',
203 'dir': path_dirname(fn),
73159f99
S
204 'delete': False,
205 }
206
181c8655
PH
207 # In Python 2.x, json.dump expects a bytestream.
208 # In Python 3.x, it writes to a character stream
209 if sys.version_info < (3, 0):
73159f99 210 args['mode'] = 'wb'
181c8655 211 else:
73159f99
S
212 args.update({
213 'mode': 'w',
214 'encoding': 'utf-8',
215 })
216
c86b6142 217 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
181c8655
PH
218
219 try:
220 with tf:
221 json.dump(obj, tf)
1394646a
IK
222 if sys.platform == 'win32':
223 # Need to remove existing file on Windows, else os.rename raises
224 # WindowsError or FileExistsError.
225 try:
226 os.unlink(fn)
227 except OSError:
228 pass
181c8655 229 os.rename(tf.name, fn)
70a1165b 230 except Exception:
181c8655
PH
231 try:
232 os.remove(tf.name)
233 except OSError:
234 pass
235 raise
236
237
238if sys.version_info >= (2, 7):
ee114368 239 def find_xpath_attr(node, xpath, key, val=None):
59ae56fa 240 """ Find the xpath xpath[@key=val] """
5d2354f1 241 assert re.match(r'^[a-zA-Z_-]+$', key)
ee114368 242 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
59ae56fa
PH
243 return node.find(expr)
244else:
ee114368 245 def find_xpath_attr(node, xpath, key, val=None):
810c10ba 246 for f in node.findall(compat_xpath(xpath)):
ee114368
S
247 if key not in f.attrib:
248 continue
249 if val is None or f.attrib.get(key) == val:
59ae56fa
PH
250 return f
251 return None
252
d7e66d39
JMF
253# On python2.6 the xml.etree.ElementTree.Element methods don't support
254# the namespace parameter
5f6a1245
JW
255
256
d7e66d39
JMF
257def xpath_with_ns(path, ns_map):
258 components = [c.split(':') for c in path.split('/')]
259 replaced = []
260 for c in components:
261 if len(c) == 1:
262 replaced.append(c[0])
263 else:
264 ns, tag = c
265 replaced.append('{%s}%s' % (ns_map[ns], tag))
266 return '/'.join(replaced)
267
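# Illustrative usage of xpath_with_ns (a sketch; the namespace URI is only an example):
#   ns_map = {'media': 'http://search.yahoo.com/mrss/'}
#   xpath_with_ns('media:content/media:thumbnail', ns_map)
#   == '{http://search.yahoo.com/mrss/}content/{http://search.yahoo.com/mrss/}thumbnail'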
d77c3dfd 268
a41fb80c 269def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
578c0745 270 def _find_xpath(xpath):
810c10ba 271 return node.find(compat_xpath(xpath))
578c0745
S
272
273 if isinstance(xpath, (str, compat_str)):
274 n = _find_xpath(xpath)
275 else:
276 for xp in xpath:
277 n = _find_xpath(xp)
278 if n is not None:
279 break
d74bebd5 280
8e636da4 281 if n is None:
bf42a990
S
282 if default is not NO_DEFAULT:
283 return default
284 elif fatal:
bf0ff932
PH
285 name = xpath if name is None else name
286 raise ExtractorError('Could not find XML element %s' % name)
287 else:
288 return None
a41fb80c
S
289 return n
290
291
292def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
8e636da4
S
293 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
294 if n is None or n == default:
295 return n
296 if n.text is None:
297 if default is not NO_DEFAULT:
298 return default
299 elif fatal:
300 name = xpath if name is None else name
301 raise ExtractorError('Could not find XML element\'s text %s' % name)
302 else:
303 return None
304 return n.text
a41fb80c
S
305
306
307def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
308 n = find_xpath_attr(node, xpath, key)
309 if n is None:
310 if default is not NO_DEFAULT:
311 return default
312 elif fatal:
313 name = '%s[@%s]' % (xpath, key) if name is None else name
314 raise ExtractorError('Could not find XML attribute %s' % name)
315 else:
316 return None
317 return n.attrib[key]
bf0ff932
PH
318
319
9e6dd238 320def get_element_by_id(id, html):
43e8fafd 321 """Return the content of the tag with the specified ID in the passed HTML document"""
611c1dd9 322 return get_element_by_attribute('id', id, html)
43e8fafd 323
12ea2f30 324
84c237fb
YCH
325def get_element_by_class(class_name, html):
326 return get_element_by_attribute(
327 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
328 html, escape_value=False)
329
330
331def get_element_by_attribute(attribute, value, html, escape_value=True):
43e8fafd 332 """Return the content of the tag with the specified attribute in the passed HTML document"""
9e6dd238 333
84c237fb
YCH
334 value = re.escape(value) if escape_value else value
335
38285056
PH
336 m = re.search(r'''(?xs)
337 <([a-zA-Z0-9:._-]+)
abc97b5e 338 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
38285056 339 \s+%s=['"]?%s['"]?
abc97b5e 340 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
38285056
PH
341 \s*>
342 (?P<content>.*?)
343 </\1>
84c237fb 344 ''' % (re.escape(attribute), value), html)
38285056
PH
345
346 if not m:
347 return None
348 res = m.group('content')
349
350 if res.startswith('"') or res.startswith("'"):
351 res = res[1:-1]
a921f407 352
38285056 353 return unescapeHTML(res)
a921f407 354
c5229f39 355
8bb56eee
BF
356class HTMLAttributeParser(compat_HTMLParser):
357 """Trivial HTML parser to gather the attributes for a single element"""
358 def __init__(self):
c5229f39 359 self.attrs = {}
8bb56eee
BF
360 compat_HTMLParser.__init__(self)
361
362 def handle_starttag(self, tag, attrs):
363 self.attrs = dict(attrs)
364
c5229f39 365
8bb56eee
BF
366def extract_attributes(html_element):
367 """Given a string for an HTML element such as
368 <el
369 a="foo" B="bar" c="&98;az" d=boz
370 empty= noval entity="&amp;"
371 sq='"' dq="'"
372 >
373 Decode and return a dictionary of attributes.
374 {
375 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
376 'empty': '', 'noval': None, 'entity': '&',
377 'sq': '"', 'dq': '\''
378 }.
379 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
380 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
381 """
382 parser = HTMLAttributeParser()
383 parser.feed(html_element)
384 parser.close()
385 return parser.attrs
9e6dd238 386
c5229f39 387
9e6dd238 388def clean_html(html):
59ae15a5 389 """Clean an HTML snippet into a readable string"""
dd622d7c
PH
390
391 if html is None: # Convenience for sanitizing descriptions etc.
392 return html
393
59ae15a5
PH
394 # Newline vs <br />
395 html = html.replace('\n', ' ')
6b3aef80
FV
396 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
397 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
59ae15a5
PH
398 # Strip html tags
399 html = re.sub('<.*?>', '', html)
400 # Replace html entities
401 html = unescapeHTML(html)
7decf895 402 return html.strip()
9e6dd238
FV
403
404
d77c3dfd 405def sanitize_open(filename, open_mode):
59ae15a5
PH
406 """Try to open the given filename, and slightly tweak it if this fails.
407
408 Attempts to open the given filename. If this fails, it tries to change
409 the filename slightly, step by step, until it's either able to open it
410 or it fails and raises a final exception, like the standard open()
411 function.
412
413 It returns the tuple (stream, definitive_file_name).
414 """
415 try:
28e614de 416 if filename == '-':
59ae15a5
PH
417 if sys.platform == 'win32':
418 import msvcrt
419 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
898280a0 420 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
59ae15a5
PH
421 stream = open(encodeFilename(filename), open_mode)
422 return (stream, filename)
423 except (IOError, OSError) as err:
f45c185f
PH
424 if err.errno in (errno.EACCES,):
425 raise
59ae15a5 426
f45c185f 427 # In case of error, try to remove win32 forbidden chars
d55de57b 428 alt_filename = sanitize_path(filename)
f45c185f
PH
429 if alt_filename == filename:
430 raise
431 else:
432 # An exception here should be caught in the caller
d55de57b 433 stream = open(encodeFilename(alt_filename), open_mode)
f45c185f 434 return (stream, alt_filename)
d77c3dfd
FV
435
436
437def timeconvert(timestr):
59ae15a5
PH
438 """Convert RFC 2822 defined time string into system timestamp"""
439 timestamp = None
440 timetuple = email.utils.parsedate_tz(timestr)
441 if timetuple is not None:
442 timestamp = email.utils.mktime_tz(timetuple)
443 return timestamp
1c469a94 444
5f6a1245 445
796173d0 446def sanitize_filename(s, restricted=False, is_id=False):
59ae15a5
PH
447 """Sanitizes a string so it could be used as part of a filename.
448 If restricted is set, use a stricter subset of allowed characters.
796173d0 449 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
59ae15a5
PH
450 """
451 def replace_insane(char):
c587cbb7
AT
452 if restricted and char in ACCENT_CHARS:
453 return ACCENT_CHARS[char]
59ae15a5
PH
454 if char == '?' or ord(char) < 32 or ord(char) == 127:
455 return ''
456 elif char == '"':
457 return '' if restricted else '\''
458 elif char == ':':
459 return '_-' if restricted else ' -'
460 elif char in '\\/|*<>':
461 return '_'
627dcfff 462 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
59ae15a5
PH
463 return '_'
464 if restricted and ord(char) > 127:
465 return '_'
466 return char
467
2aeb06d6
PH
468 # Handle timestamps
469 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
28e614de 470 result = ''.join(map(replace_insane, s))
796173d0
PH
471 if not is_id:
472 while '__' in result:
473 result = result.replace('__', '_')
474 result = result.strip('_')
475 # Common case of "Foreign band name - English song title"
476 if restricted and result.startswith('-_'):
477 result = result[2:]
5a42414b
PH
478 if result.startswith('-'):
479 result = '_' + result[len('-'):]
a7440261 480 result = result.lstrip('.')
796173d0
PH
481 if not result:
482 result = '_'
59ae15a5 483 return result
d77c3dfd 484
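# Rough sketch of sanitize_filename behaviour (hand-traced from the rules above; inputs are hypothetical):
#   sanitize_filename('Héllo: wörld?', restricted=True)  == 'Hello_-_world'
#   sanitize_filename('AT&T', restricted=True)           == 'AT_T'
#   sanitize_filename('a:b')                              == 'a -b'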
5f6a1245 485
a2aaf4db
S
486def sanitize_path(s):
487 """Sanitizes and normalizes path on Windows"""
488 if sys.platform != 'win32':
489 return s
be531ef1
S
490 drive_or_unc, _ = os.path.splitdrive(s)
491 if sys.version_info < (2, 7) and not drive_or_unc:
492 drive_or_unc, _ = os.path.splitunc(s)
493 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
494 if drive_or_unc:
a2aaf4db
S
495 norm_path.pop(0)
496 sanitized_path = [
c90d16cf 497 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
a2aaf4db 498 for path_part in norm_path]
be531ef1
S
499 if drive_or_unc:
500 sanitized_path.insert(0, drive_or_unc + os.path.sep)
a2aaf4db
S
501 return os.path.join(*sanitized_path)
502
503
67dda517
S
504# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
505# unwanted failures due to missing protocol
17bcc626
S
506def sanitize_url(url):
507 return 'http:%s' % url if url.startswith('//') else url
508
509
67dda517 510def sanitized_Request(url, *args, **kwargs):
17bcc626 511 return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
67dda517
S
512
513
d77c3dfd 514def orderedSet(iterable):
59ae15a5
PH
515 """ Remove all duplicates from the input iterable """
516 res = []
517 for el in iterable:
518 if el not in res:
519 res.append(el)
520 return res
d77c3dfd 521
912b38b4 522
55b2f099 523def _htmlentity_transform(entity_with_semicolon):
4e408e47 524 """Transforms an HTML entity to a character."""
55b2f099
YCH
525 entity = entity_with_semicolon[:-1]
526
4e408e47
PH
527 # Known non-numeric HTML entity
528 if entity in compat_html_entities.name2codepoint:
529 return compat_chr(compat_html_entities.name2codepoint[entity])
530
55b2f099
YCH
531 # TODO: HTML5 allows entities without a semicolon. For example,
532 # '&Eacuteric' should be decoded as 'Éric'.
533 if entity_with_semicolon in compat_html_entities_html5:
534 return compat_html_entities_html5[entity_with_semicolon]
535
91757b0f 536 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
4e408e47
PH
537 if mobj is not None:
538 numstr = mobj.group(1)
28e614de 539 if numstr.startswith('x'):
4e408e47 540 base = 16
28e614de 541 numstr = '0%s' % numstr
4e408e47
PH
542 else:
543 base = 10
7aefc49c
S
544 # See https://github.com/rg3/youtube-dl/issues/7518
545 try:
546 return compat_chr(int(numstr, base))
547 except ValueError:
548 pass
4e408e47
PH
549
550 # Unknown entity in name, return its literal representation
7a3f0c00 551 return '&%s;' % entity
4e408e47
PH
552
553
d77c3dfd 554def unescapeHTML(s):
912b38b4
PH
555 if s is None:
556 return None
557 assert type(s) == compat_str
d77c3dfd 558
4e408e47 559 return re.sub(
55b2f099 560 r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 561
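# Illustrative usage of unescapeHTML (a sketch):
#   unescapeHTML('%20;')     == '%20;'   # no '&', left untouched
#   unescapeHTML('&#x2F;')   == '/'
#   unescapeHTML('&period;') == '.'      # HTML5 named entity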
8bf48f23 562
aa49acd1
S
563def get_subprocess_encoding():
564 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
565 # For subprocess calls, encode with locale encoding
566 # Refer to http://stackoverflow.com/a/9951851/35070
567 encoding = preferredencoding()
568 else:
569 encoding = sys.getfilesystemencoding()
570 if encoding is None:
571 encoding = 'utf-8'
572 return encoding
573
574
8bf48f23 575def encodeFilename(s, for_subprocess=False):
59ae15a5
PH
576 """
577 @param s The name of the file
578 """
d77c3dfd 579
8bf48f23 580 assert type(s) == compat_str
d77c3dfd 581
59ae15a5
PH
582 # Python 3 has a Unicode API
583 if sys.version_info >= (3, 0):
584 return s
0f00efed 585
aa49acd1
S
586 # Pass '' directly to use Unicode APIs on Windows 2000 and up
587 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
588 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
589 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
590 return s
591
8ee239e9
YCH
592 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
593 if sys.platform.startswith('java'):
594 return s
595
aa49acd1
S
596 return s.encode(get_subprocess_encoding(), 'ignore')
597
598
599def decodeFilename(b, for_subprocess=False):
600
601 if sys.version_info >= (3, 0):
602 return b
603
604 if not isinstance(b, bytes):
605 return b
606
607 return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 608
f07b74fc
PH
609
610def encodeArgument(s):
611 if not isinstance(s, compat_str):
612 # Legacy code that uses byte strings
613 # Uncomment the following line after fixing all post processors
7af808a5 614 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
f07b74fc
PH
615 s = s.decode('ascii')
616 return encodeFilename(s, True)
617
618
aa49acd1
S
619def decodeArgument(b):
620 return decodeFilename(b, True)
621
622
8271226a
PH
623def decodeOption(optval):
624 if optval is None:
625 return optval
626 if isinstance(optval, bytes):
627 optval = optval.decode(preferredencoding())
628
629 assert isinstance(optval, compat_str)
630 return optval
1c256f70 631
5f6a1245 632
4539dd30
PH
633def formatSeconds(secs):
634 if secs > 3600:
635 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
636 elif secs > 60:
637 return '%d:%02d' % (secs // 60, secs % 60)
638 else:
639 return '%d' % secs
640
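# Illustrative output of formatSeconds (a sketch):
#   formatSeconds(3661) == '1:01:01'
#   formatSeconds(65)   == '1:05'
#   formatSeconds(10)   == '10'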
a0ddb8a2 641
be4a824d
PH
642def make_HTTPS_handler(params, **kwargs):
643 opts_no_check_certificate = params.get('nocheckcertificate', False)
0db261ba 644 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
be5f2c19 645 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
0db261ba 646 if opts_no_check_certificate:
be5f2c19 647 context.check_hostname = False
0db261ba 648 context.verify_mode = ssl.CERT_NONE
a2366922 649 try:
be4a824d 650 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
a2366922
PH
651 except TypeError:
652 # Python 2.7.8
653 # (create_default_context present but HTTPSHandler has no context=)
654 pass
655
656 if sys.version_info < (3, 2):
d7932313 657 return YoutubeDLHTTPSHandler(params, **kwargs)
aa37e3d4 658 else: # Python < 3.4
d7932313 659 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
ea6d901e 660 context.verify_mode = (ssl.CERT_NONE
dca08720 661 if opts_no_check_certificate
ea6d901e 662 else ssl.CERT_REQUIRED)
303b479e 663 context.set_default_verify_paths()
be4a824d 664 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 665
732ea2f0 666
08f2a92c
JMF
667def bug_reports_message():
668 if ytdl_is_updateable():
669 update_cmd = 'type youtube-dl -U to update'
670 else:
671 update_cmd = 'see https://yt-dl.org/update on how to update'
672 msg = '; please report this issue on https://yt-dl.org/bug .'
673 msg += ' Make sure you are using the latest version; %s.' % update_cmd
674 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
675 return msg
676
677
1c256f70
PH
678class ExtractorError(Exception):
679 """Error during info extraction."""
5f6a1245 680
d11271dd 681 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
9a82b238
PH
682 """ tb, if given, is the original traceback (so that it can be printed out).
683 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
684 """
685
686 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
687 expected = True
d11271dd
PH
688 if video_id is not None:
689 msg = video_id + ': ' + msg
410f3e73 690 if cause:
28e614de 691 msg += ' (caused by %r)' % cause
9a82b238 692 if not expected:
08f2a92c 693 msg += bug_reports_message()
1c256f70 694 super(ExtractorError, self).__init__(msg)
d5979c5d 695
1c256f70 696 self.traceback = tb
8cc83b8d 697 self.exc_info = sys.exc_info() # preserve original exception
2eabb802 698 self.cause = cause
d11271dd 699 self.video_id = video_id
1c256f70 700
01951dda
PH
701 def format_traceback(self):
702 if self.traceback is None:
703 return None
28e614de 704 return ''.join(traceback.format_tb(self.traceback))
01951dda 705
1c256f70 706
416c7fcb
PH
707class UnsupportedError(ExtractorError):
708 def __init__(self, url):
709 super(UnsupportedError, self).__init__(
710 'Unsupported URL: %s' % url, expected=True)
711 self.url = url
712
713
55b3e45b
JMF
714class RegexNotFoundError(ExtractorError):
715 """Error when a regex didn't match"""
716 pass
717
718
d77c3dfd 719class DownloadError(Exception):
59ae15a5 720 """Download Error exception.
d77c3dfd 721
59ae15a5
PH
722 This exception may be thrown by FileDownloader objects if they are not
723 configured to continue on errors. They will contain the appropriate
724 error message.
725 """
5f6a1245 726
8cc83b8d
FV
727 def __init__(self, msg, exc_info=None):
728 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
729 super(DownloadError, self).__init__(msg)
730 self.exc_info = exc_info
d77c3dfd
FV
731
732
733class SameFileError(Exception):
59ae15a5 734 """Same File exception.
d77c3dfd 735
59ae15a5
PH
736 This exception will be thrown by FileDownloader objects if they detect
737 multiple files would have to be downloaded to the same file on disk.
738 """
739 pass
d77c3dfd
FV
740
741
742class PostProcessingError(Exception):
59ae15a5 743 """Post Processing exception.
d77c3dfd 744
59ae15a5
PH
745 This exception may be raised by PostProcessor's .run() method to
746 indicate an error in the postprocessing task.
747 """
5f6a1245 748
7851b379
PH
749 def __init__(self, msg):
750 self.msg = msg
d77c3dfd 751
5f6a1245 752
d77c3dfd 753class MaxDownloadsReached(Exception):
59ae15a5
PH
754 """ --max-downloads limit has been reached. """
755 pass
d77c3dfd
FV
756
757
758class UnavailableVideoError(Exception):
59ae15a5 759 """Unavailable Format exception.
d77c3dfd 760
59ae15a5
PH
761 This exception will be thrown when a video is requested
762 in a format that is not available for that video.
763 """
764 pass
d77c3dfd
FV
765
766
767class ContentTooShortError(Exception):
59ae15a5 768 """Content Too Short exception.
d77c3dfd 769
59ae15a5
PH
770 This exception may be raised by FileDownloader objects when a file they
771 download is too small for what the server announced first, indicating
772 the connection was probably interrupted.
773 """
d77c3dfd 774
59ae15a5 775 def __init__(self, downloaded, expected):
2c7ed247 776 # Both in bytes
59ae15a5
PH
777 self.downloaded = downloaded
778 self.expected = expected
d77c3dfd 779
5f6a1245 780
class XAttrMetadataError(Exception):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg  # referenced below when classifying the error

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(Exception):
    pass
798
799
c5a59d93 800def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
e5e78797
S
801 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
802 # expected HTTP responses to meet HTTP/1.0 or later (see also
803 # https://github.com/rg3/youtube-dl/issues/6727)
804 if sys.version_info < (3, 0):
5a1a2e94 805 kwargs[b'strict'] = True
be4a824d
PH
806 hc = http_class(*args, **kwargs)
807 source_address = ydl_handler._params.get('source_address')
808 if source_address is not None:
809 sa = (source_address, 0)
810 if hasattr(hc, 'source_address'): # Python 2.7+
811 hc.source_address = sa
812 else: # Python 2.6
813 def _hc_connect(self, *args, **kwargs):
814 sock = compat_socket_create_connection(
815 (self.host, self.port), self.timeout, sa)
816 if is_https:
d7932313
PH
817 self.sock = ssl.wrap_socket(
818 sock, self.key_file, self.cert_file,
819 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
820 else:
821 self.sock = sock
822 hc.connect = functools.partial(_hc_connect, hc)
823
824 return hc
825
826
87f0e62d 827def handle_youtubedl_headers(headers):
992fc9d6
YCH
828 filtered_headers = headers
829
830 if 'Youtubedl-no-compression' in filtered_headers:
831 filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
87f0e62d 832 del filtered_headers['Youtubedl-no-compression']
87f0e62d 833
992fc9d6 834 return filtered_headers
87f0e62d
YCH
835
836
acebc9cd 837class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
59ae15a5
PH
838 """Handler for HTTP requests and responses.
839
840 This class, when installed with an OpenerDirector, automatically adds
841 the standard headers to every HTTP request and handles gzipped and
842 deflated responses from web servers. If compression is to be avoided in
843 a particular request, the original request in the program code only has
0424ec30 844 to include the HTTP header "Youtubedl-no-compression", which will be
59ae15a5
PH
845 removed before making the real request.
846
847 Part of this code was copied from:
848
849 http://techknack.net/python-urllib2-handlers/
850
851 Andrew Rowls, the author of that code, agreed to release it to the
852 public domain.
853 """
854
be4a824d
PH
855 def __init__(self, params, *args, **kwargs):
856 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
857 self._params = params
858
859 def http_open(self, req):
71aff188
YCH
860 conn_class = compat_http_client.HTTPConnection
861
862 socks_proxy = req.headers.get('Ytdl-socks-proxy')
863 if socks_proxy:
864 conn_class = make_socks_conn_class(conn_class, socks_proxy)
865 del req.headers['Ytdl-socks-proxy']
866
be4a824d 867 return self.do_open(functools.partial(
71aff188 868 _create_http_connection, self, conn_class, False),
be4a824d
PH
869 req)
870
59ae15a5
PH
871 @staticmethod
872 def deflate(data):
873 try:
874 return zlib.decompress(data, -zlib.MAX_WBITS)
875 except zlib.error:
876 return zlib.decompress(data)
877
878 @staticmethod
879 def addinfourl_wrapper(stream, headers, url, code):
880 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
881 return compat_urllib_request.addinfourl(stream, headers, url, code)
882 ret = compat_urllib_request.addinfourl(stream, headers, url)
883 ret.code = code
884 return ret
885
acebc9cd 886 def http_request(self, req):
51f267d9
S
887 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
888 # always respected by websites, some tend to give out URLs with non percent-encoded
889 # non-ASCII characters (see telemb.py, ard.py [#3412])
890 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
891 # To work around aforementioned issue we will replace request's original URL with
892 # percent-encoded one
893 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
894 # the code of this workaround has been moved here from YoutubeDL.urlopen()
895 url = req.get_full_url()
896 url_escaped = escape_url(url)
897
898 # Substitute URL if any change after escaping
899 if url != url_escaped:
15d260eb 900 req = update_Request(req, url=url_escaped)
51f267d9 901
33ac271b 902 for h, v in std_headers.items():
3d5f7a39
JK
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # urllib capitalizes the dict keys because of this bug
905 if h.capitalize() not in req.headers:
33ac271b 906 req.add_header(h, v)
87f0e62d
YCH
907
908 req.headers = handle_youtubedl_headers(req.headers)
989b4b2b
PH
909
910 if sys.version_info < (2, 7) and '#' in req.get_full_url():
911 # Python 2.6 is brain-dead when it comes to fragments
912 req._Request__original = req._Request__original.partition('#')[0]
913 req._Request__r_type = req._Request__r_type.partition('#')[0]
914
59ae15a5
PH
915 return req
916
acebc9cd 917 def http_response(self, req, resp):
59ae15a5
PH
918 old_resp = resp
919 # gzip
920 if resp.headers.get('Content-encoding', '') == 'gzip':
aa3e9507
PH
921 content = resp.read()
922 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
923 try:
924 uncompressed = io.BytesIO(gz.read())
925 except IOError as original_ioerror:
                # There may be junk at the end of the file
927 # See http://stackoverflow.com/q/4928560/35070 for details
928 for i in range(1, 1024):
929 try:
930 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
931 uncompressed = io.BytesIO(gz.read())
932 except IOError:
933 continue
934 break
935 else:
936 raise original_ioerror
937 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 938 resp.msg = old_resp.msg
c047270c 939 del resp.headers['Content-encoding']
59ae15a5
PH
940 # deflate
941 if resp.headers.get('Content-encoding', '') == 'deflate':
942 gz = io.BytesIO(self.deflate(resp.read()))
943 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
944 resp.msg = old_resp.msg
c047270c 945 del resp.headers['Content-encoding']
ad729172
S
946 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
947 # https://github.com/rg3/youtube-dl/issues/6457).
5a4d9ddb
S
948 if 300 <= resp.code < 400:
949 location = resp.headers.get('Location')
950 if location:
951 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
952 if sys.version_info >= (3, 0):
953 location = location.encode('iso-8859-1').decode('utf-8')
0ea59007
YCH
954 else:
955 location = location.decode('utf-8')
5a4d9ddb
S
956 location_escaped = escape_url(location)
957 if location != location_escaped:
958 del resp.headers['Location']
9a4aec8b
YCH
959 if sys.version_info < (3, 0):
960 location_escaped = location_escaped.encode('utf-8')
5a4d9ddb 961 resp.headers['Location'] = location_escaped
59ae15a5 962 return resp
0f8d03f8 963
acebc9cd
PH
964 https_request = http_request
965 https_response = http_response
bf50b038 966
5de90176 967
71aff188
YCH
968def make_socks_conn_class(base_class, socks_proxy):
969 assert issubclass(base_class, (
970 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
971
972 url_components = compat_urlparse.urlparse(socks_proxy)
973 if url_components.scheme.lower() == 'socks5':
974 socks_type = ProxyType.SOCKS5
975 elif url_components.scheme.lower() in ('socks', 'socks4'):
976 socks_type = ProxyType.SOCKS4
51fb4995
YCH
977 elif url_components.scheme.lower() == 'socks4a':
978 socks_type = ProxyType.SOCKS4A
71aff188 979
cdd94c2e
YCH
980 def unquote_if_non_empty(s):
981 if not s:
982 return s
983 return compat_urllib_parse_unquote_plus(s)
984
71aff188
YCH
985 proxy_args = (
986 socks_type,
987 url_components.hostname, url_components.port or 1080,
988 True, # Remote DNS
cdd94c2e
YCH
989 unquote_if_non_empty(url_components.username),
990 unquote_if_non_empty(url_components.password),
71aff188
YCH
991 )
992
993 class SocksConnection(base_class):
994 def connect(self):
995 self.sock = sockssocket()
996 self.sock.setproxy(*proxy_args)
997 if type(self.timeout) in (int, float):
998 self.sock.settimeout(self.timeout)
999 self.sock.connect((self.host, self.port))
1000
1001 if isinstance(self, compat_http_client.HTTPSConnection):
1002 if hasattr(self, '_context'): # Python > 2.6
1003 self.sock = self._context.wrap_socket(
1004 self.sock, server_hostname=self.host)
1005 else:
1006 self.sock = ssl.wrap_socket(self.sock)
1007
1008 return SocksConnection
1009
1010
be4a824d
PH
1011class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1012 def __init__(self, params, https_conn_class=None, *args, **kwargs):
1013 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1014 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1015 self._params = params
1016
1017 def https_open(self, req):
4f264c02 1018 kwargs = {}
71aff188
YCH
1019 conn_class = self._https_conn_class
1020
4f264c02
JMF
1021 if hasattr(self, '_context'): # python > 2.6
1022 kwargs['context'] = self._context
1023 if hasattr(self, '_check_hostname'): # python 3.x
1024 kwargs['check_hostname'] = self._check_hostname
71aff188
YCH
1025
1026 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1027 if socks_proxy:
1028 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1029 del req.headers['Ytdl-socks-proxy']
1030
be4a824d 1031 return self.do_open(functools.partial(
71aff188 1032 _create_http_connection, self, conn_class, True),
4f264c02 1033 req, **kwargs)
be4a824d
PH
1034
1035
a6420bf5
S
1036class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1037 def __init__(self, cookiejar=None):
1038 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1039
1040 def http_response(self, request, response):
        # Python 2 will choke on the next HTTP request if there are non-ASCII
        # characters in the Set-Cookie HTTP header of the last response (see
1043 # https://github.com/rg3/youtube-dl/issues/6769).
1044 # In order to at least prevent crashing we will percent encode Set-Cookie
1045 # header before HTTPCookieProcessor starts processing it.
e28034c5
S
1046 # if sys.version_info < (3, 0) and response.headers:
1047 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1048 # set_cookie = response.headers.get(set_cookie_header)
1049 # if set_cookie:
1050 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1051 # if set_cookie != set_cookie_escaped:
1052 # del response.headers[set_cookie_header]
1053 # response.headers[set_cookie_header] = set_cookie_escaped
a6420bf5
S
1054 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1055
1056 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1057 https_response = http_response
1058
1059
46f59e89
S
1060def extract_timezone(date_str):
1061 m = re.search(
1062 r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
1063 date_str)
1064 if not m:
1065 timezone = datetime.timedelta()
1066 else:
1067 date_str = date_str[:-len(m.group('tz'))]
1068 if not m.group('sign'):
1069 timezone = datetime.timedelta()
1070 else:
1071 sign = 1 if m.group('sign') == '+' else -1
1072 timezone = datetime.timedelta(
1073 hours=sign * int(m.group('hours')),
1074 minutes=sign * int(m.group('minutes')))
1075 return timezone, date_str
1076
1077
08b38d54 1078def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1079 """ Return a UNIX timestamp from the given date """
1080
1081 if date_str is None:
1082 return None
1083
52c3a6e4
S
1084 date_str = re.sub(r'\.[0-9]+', '', date_str)
1085
08b38d54 1086 if timezone is None:
46f59e89
S
1087 timezone, date_str = extract_timezone(date_str)
1088
52c3a6e4
S
1089 try:
1090 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1091 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1092 return calendar.timegm(dt.timetuple())
1093 except ValueError:
1094 pass
912b38b4
PH
1095
1096
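# Illustrative usage of parse_iso8601 (a sketch):
#   parse_iso8601('2014-01-01T00:00:00Z')      == 1388534400
#   parse_iso8601('2014-01-01T01:00:00+01:00') == 1388534400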
46f59e89
S
1097def date_formats(day_first=True):
1098 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1099
1100
42bdd9d0 1101def unified_strdate(date_str, day_first=True):
bf50b038 1102 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1103
1104 if date_str is None:
1105 return None
bf50b038 1106 upload_date = None
5f6a1245 1107 # Replace commas
026fcc04 1108 date_str = date_str.replace(',', ' ')
42bdd9d0 1109 # Remove AM/PM + timezone
9bb8e0a3 1110 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1111 _, date_str = extract_timezone(date_str)
42bdd9d0 1112
46f59e89 1113 for expression in date_formats(day_first):
bf50b038
JMF
1114 try:
1115 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
5de90176 1116 except ValueError:
bf50b038 1117 pass
42393ce2
PH
1118 if upload_date is None:
1119 timetuple = email.utils.parsedate_tz(date_str)
1120 if timetuple:
c6b9cf05
S
1121 try:
1122 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1123 except ValueError:
1124 pass
6a750402
JMF
1125 if upload_date is not None:
1126 return compat_str(upload_date)
bf50b038 1127
5f6a1245 1128
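# Illustrative usage of unified_strdate (a sketch):
#   unified_strdate('December 21, 2010')         == '20101221'
#   unified_strdate('1968 12 10')                == '19681210'
#   unified_strdate('8/7/2009', day_first=False) == '20090807'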
46f59e89
S
1129def unified_timestamp(date_str, day_first=True):
1130 if date_str is None:
1131 return None
1132
1133 date_str = date_str.replace(',', ' ')
1134
7dc2a74e 1135 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1136 timezone, date_str = extract_timezone(date_str)
1137
1138 # Remove AM/PM + timezone
1139 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1140
1141 for expression in date_formats(day_first):
1142 try:
7dc2a74e 1143 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89
S
1144 return calendar.timegm(dt.timetuple())
1145 except ValueError:
1146 pass
1147 timetuple = email.utils.parsedate_tz(date_str)
1148 if timetuple:
7dc2a74e 1149 return calendar.timegm(timetuple) + pm_delta * 3600
46f59e89
S
1150
1151
28e614de 1152def determine_ext(url, default_ext='unknown_video'):
f4776371
S
1153 if url is None:
1154 return default_ext
9cb9a5df 1155 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1156 if re.match(r'^[A-Za-z0-9]+$', guess):
1157 return guess
a7aaa398
S
1158 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1159 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1160 return guess.rstrip('/')
73e79f2a 1161 else:
cbdbb766 1162 return default_ext
73e79f2a 1163
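# Illustrative usage of determine_ext (a sketch; URLs are hypothetical):
#   determine_ext('http://example.com/foo/bar.mp4/?download')   == 'mp4'
#   determine_ext('http://example.com/play', default_ext='mp4') == 'mp4'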
5f6a1245 1164
d4051a8e 1165def subtitles_filename(filename, sub_lang, sub_format):
28e614de 1166 return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
d4051a8e 1167
5f6a1245 1168
bd558525 1169def date_from_str(date_str):
37254abc
JMF
1170 """
1171 Return a datetime object from a string in the format YYYYMMDD or
1172 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1173 today = datetime.date.today()
f8795e10 1174 if date_str in ('now', 'today'):
37254abc 1175 return today
f8795e10
PH
1176 if date_str == 'yesterday':
1177 return today - datetime.timedelta(days=1)
37254abc
JMF
1178 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1179 if match is not None:
1180 sign = match.group('sign')
1181 time = int(match.group('time'))
1182 if sign == '-':
1183 time = -time
1184 unit = match.group('unit')
dfb1b146 1185 # A bad approximation?
37254abc
JMF
1186 if unit == 'month':
1187 unit = 'day'
1188 time *= 30
1189 elif unit == 'year':
1190 unit = 'day'
1191 time *= 365
1192 unit += 's'
1193 delta = datetime.timedelta(**{unit: time})
1194 return today + delta
611c1dd9 1195 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
5f6a1245
JW
1196
1197
e63fc1be 1198def hyphenate_date(date_str):
1199 """
1200 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1201 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1202 if match is not None:
1203 return '-'.join(match.groups())
1204 else:
1205 return date_str
1206
5f6a1245 1207
bd558525
JMF
1208class DateRange(object):
1209 """Represents a time interval between two dates"""
5f6a1245 1210
bd558525
JMF
1211 def __init__(self, start=None, end=None):
1212 """start and end must be strings in the format accepted by date"""
1213 if start is not None:
1214 self.start = date_from_str(start)
1215 else:
1216 self.start = datetime.datetime.min.date()
1217 if end is not None:
1218 self.end = date_from_str(end)
1219 else:
1220 self.end = datetime.datetime.max.date()
37254abc 1221 if self.start > self.end:
bd558525 1222 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
5f6a1245 1223
bd558525
JMF
1224 @classmethod
1225 def day(cls, day):
1226 """Returns a range that only contains the given day"""
5f6a1245
JW
1227 return cls(day, day)
1228
bd558525
JMF
1229 def __contains__(self, date):
1230 """Check if the date is in the range"""
37254abc
JMF
1231 if not isinstance(date, datetime.date):
1232 date = date_from_str(date)
1233 return self.start <= date <= self.end
5f6a1245 1234
bd558525 1235 def __str__(self):
5f6a1245 1236 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
1237
1238
1239def platform_name():
1240 """ Returns the platform name as a compat_str """
1241 res = platform.platform()
1242 if isinstance(res, bytes):
1243 res = res.decode(preferredencoding())
1244
1245 assert isinstance(res, compat_str)
1246 return res
c257baff
PH
1247
1248
b58ddb32
PH
1249def _windows_write_string(s, out):
1250 """ Returns True if the string was written using special methods,
1251 False if it has yet to be written out."""
1252 # Adapted from http://stackoverflow.com/a/3259271/35070
1253
1254 import ctypes
1255 import ctypes.wintypes
1256
1257 WIN_OUTPUT_IDS = {
1258 1: -11,
1259 2: -12,
1260 }
1261
a383a98a
PH
1262 try:
1263 fileno = out.fileno()
1264 except AttributeError:
1265 # If the output stream doesn't have a fileno, it's virtual
1266 return False
aa42e873
PH
1267 except io.UnsupportedOperation:
1268 # Some strange Windows pseudo files?
1269 return False
b58ddb32
PH
1270 if fileno not in WIN_OUTPUT_IDS:
1271 return False
1272
e2f89ec7 1273 GetStdHandle = ctypes.WINFUNCTYPE(
b58ddb32 1274 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
611c1dd9 1275 (b'GetStdHandle', ctypes.windll.kernel32))
b58ddb32
PH
1276 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1277
e2f89ec7 1278 WriteConsoleW = ctypes.WINFUNCTYPE(
b58ddb32
PH
1279 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1280 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
611c1dd9 1281 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
b58ddb32
PH
1282 written = ctypes.wintypes.DWORD(0)
1283
611c1dd9 1284 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
b58ddb32
PH
1285 FILE_TYPE_CHAR = 0x0002
1286 FILE_TYPE_REMOTE = 0x8000
e2f89ec7 1287 GetConsoleMode = ctypes.WINFUNCTYPE(
b58ddb32
PH
1288 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1289 ctypes.POINTER(ctypes.wintypes.DWORD))(
611c1dd9 1290 (b'GetConsoleMode', ctypes.windll.kernel32))
b58ddb32
PH
1291 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1292
1293 def not_a_console(handle):
1294 if handle == INVALID_HANDLE_VALUE or handle is None:
1295 return True
8fb3ac36
PH
1296 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1297 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
b58ddb32
PH
1298
1299 if not_a_console(h):
1300 return False
1301
d1b9c912
PH
1302 def next_nonbmp_pos(s):
1303 try:
1304 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1305 except StopIteration:
1306 return len(s)
1307
1308 while s:
1309 count = min(next_nonbmp_pos(s), 1024)
1310
b58ddb32 1311 ret = WriteConsoleW(
d1b9c912 1312 h, s, count if count else 2, ctypes.byref(written), None)
b58ddb32
PH
1313 if ret == 0:
1314 raise OSError('Failed to write string')
d1b9c912
PH
1315 if not count: # We just wrote a non-BMP character
1316 assert written.value == 2
1317 s = s[1:]
1318 else:
1319 assert written.value > 0
1320 s = s[written.value:]
b58ddb32
PH
1321 return True
1322
1323
734f90bb 1324def write_string(s, out=None, encoding=None):
7459e3a2
PH
1325 if out is None:
1326 out = sys.stderr
8bf48f23 1327 assert type(s) == compat_str
7459e3a2 1328
b58ddb32
PH
1329 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1330 if _windows_write_string(s, out):
1331 return
1332
7459e3a2
PH
1333 if ('b' in getattr(out, 'mode', '') or
1334 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
104aa738
PH
1335 byt = s.encode(encoding or preferredencoding(), 'ignore')
1336 out.write(byt)
1337 elif hasattr(out, 'buffer'):
1338 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1339 byt = s.encode(enc, 'ignore')
1340 out.buffer.write(byt)
1341 else:
8bf48f23 1342 out.write(s)
7459e3a2
PH
1343 out.flush()
1344
1345
48ea9cea
PH
1346def bytes_to_intlist(bs):
1347 if not bs:
1348 return []
1349 if isinstance(bs[0], int): # Python 3
1350 return list(bs)
1351 else:
1352 return [ord(c) for c in bs]
1353
c257baff 1354
cba892fa 1355def intlist_to_bytes(xs):
1356 if not xs:
1357 return b''
edaa23f8 1358 return compat_struct_pack('%dB' % len(xs), *xs)
c38b1e77
PH
1359
1360
c1c9a79c
PH
1361# Cross-platform file locking
1362if sys.platform == 'win32':
1363 import ctypes.wintypes
1364 import msvcrt
1365
1366 class OVERLAPPED(ctypes.Structure):
1367 _fields_ = [
1368 ('Internal', ctypes.wintypes.LPVOID),
1369 ('InternalHigh', ctypes.wintypes.LPVOID),
1370 ('Offset', ctypes.wintypes.DWORD),
1371 ('OffsetHigh', ctypes.wintypes.DWORD),
1372 ('hEvent', ctypes.wintypes.HANDLE),
1373 ]
1374
1375 kernel32 = ctypes.windll.kernel32
1376 LockFileEx = kernel32.LockFileEx
1377 LockFileEx.argtypes = [
1378 ctypes.wintypes.HANDLE, # hFile
1379 ctypes.wintypes.DWORD, # dwFlags
1380 ctypes.wintypes.DWORD, # dwReserved
1381 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1382 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1383 ctypes.POINTER(OVERLAPPED) # Overlapped
1384 ]
1385 LockFileEx.restype = ctypes.wintypes.BOOL
1386 UnlockFileEx = kernel32.UnlockFileEx
1387 UnlockFileEx.argtypes = [
1388 ctypes.wintypes.HANDLE, # hFile
1389 ctypes.wintypes.DWORD, # dwReserved
1390 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1391 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1392 ctypes.POINTER(OVERLAPPED) # Overlapped
1393 ]
1394 UnlockFileEx.restype = ctypes.wintypes.BOOL
1395 whole_low = 0xffffffff
1396 whole_high = 0x7fffffff
1397
1398 def _lock_file(f, exclusive):
1399 overlapped = OVERLAPPED()
1400 overlapped.Offset = 0
1401 overlapped.OffsetHigh = 0
1402 overlapped.hEvent = 0
1403 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1404 handle = msvcrt.get_osfhandle(f.fileno())
1405 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1406 whole_low, whole_high, f._lock_file_overlapped_p):
1407 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1408
1409 def _unlock_file(f):
1410 assert f._lock_file_overlapped_p
1411 handle = msvcrt.get_osfhandle(f.fileno())
1412 if not UnlockFileEx(handle, 0,
1413 whole_low, whole_high, f._lock_file_overlapped_p):
1414 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1415
1416else:
399a76e6
YCH
    # Some platforms, such as Jython, are missing fcntl
1418 try:
1419 import fcntl
c1c9a79c 1420
399a76e6
YCH
1421 def _lock_file(f, exclusive):
1422 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
c1c9a79c 1423
399a76e6
YCH
1424 def _unlock_file(f):
1425 fcntl.flock(f, fcntl.LOCK_UN)
1426 except ImportError:
1427 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1428
1429 def _lock_file(f, exclusive):
1430 raise IOError(UNSUPPORTED_MSG)
1431
1432 def _unlock_file(f):
1433 raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
1434
1435
1436class locked_file(object):
1437 def __init__(self, filename, mode, encoding=None):
1438 assert mode in ['r', 'a', 'w']
1439 self.f = io.open(filename, mode, encoding=encoding)
1440 self.mode = mode
1441
1442 def __enter__(self):
1443 exclusive = self.mode != 'r'
1444 try:
1445 _lock_file(self.f, exclusive)
1446 except IOError:
1447 self.f.close()
1448 raise
1449 return self
1450
1451 def __exit__(self, etype, value, traceback):
1452 try:
1453 _unlock_file(self.f)
1454 finally:
1455 self.f.close()
1456
1457 def __iter__(self):
1458 return iter(self.f)
1459
1460 def write(self, *args):
1461 return self.f.write(*args)
1462
1463 def read(self, *args):
1464 return self.f.read(*args)
4eb7f1d1
JMF
1465
1466
4644ac55
S
1467def get_filesystem_encoding():
1468 encoding = sys.getfilesystemencoding()
1469 return encoding if encoding is not None else 'utf-8'
1470
1471
4eb7f1d1 1472def shell_quote(args):
a6a173c2 1473 quoted_args = []
4644ac55 1474 encoding = get_filesystem_encoding()
a6a173c2
JMF
1475 for a in args:
1476 if isinstance(a, bytes):
1477 # We may get a filename encoded with 'encodeFilename'
1478 a = a.decode(encoding)
1479 quoted_args.append(pipes.quote(a))
28e614de 1480 return ' '.join(quoted_args)
9d4660ca
PH
1481
1482
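# Illustrative usage of shell_quote (a sketch):
#   shell_quote(['echo', 'a b']) == "echo 'a b'"
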
1483def smuggle_url(url, data):
1484 """ Pass additional data in a URL for internal use. """
1485
81953d1a
RA
1486 url, idata = unsmuggle_url(url, {})
1487 data.update(idata)
15707c7e 1488 sdata = compat_urllib_parse_urlencode(
28e614de
PH
1489 {'__youtubedl_smuggle': json.dumps(data)})
1490 return url + '#' + sdata
9d4660ca
PH
1491
1492
79f82953 1493def unsmuggle_url(smug_url, default=None):
83e865a3 1494 if '#__youtubedl_smuggle' not in smug_url:
79f82953 1495 return smug_url, default
28e614de
PH
1496 url, _, sdata = smug_url.rpartition('#')
1497 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
1498 data = json.loads(jsond)
1499 return url, data
02dbf93f
PH
1500
1501
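# Illustrative round trip (a sketch; the URL and payload are hypothetical):
#   smuggled = smuggle_url('http://example.com/video', {'force_videoid': '123'})
#   url, data = unsmuggle_url(smuggled)
#   # url == 'http://example.com/video' and data == {'force_videoid': '123'}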
02dbf93f
PH
1502def format_bytes(bytes):
1503 if bytes is None:
28e614de 1504 return 'N/A'
02dbf93f
PH
1505 if type(bytes) is str:
1506 bytes = float(bytes)
1507 if bytes == 0.0:
1508 exponent = 0
1509 else:
1510 exponent = int(math.log(bytes, 1024.0))
28e614de 1511 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
02dbf93f 1512 converted = float(bytes) / float(1024 ** exponent)
28e614de 1513 return '%.2f%s' % (converted, suffix)
f53c966a 1514
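# Illustrative output of format_bytes (a sketch):
#   format_bytes(1024) == '1.00KiB'
#   format_bytes(1536) == '1.50KiB'
#   format_bytes(500)  == '500.00B'
#   format_bytes(None) == 'N/A'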
1c088fa8 1515
fb47597b
S
1516def lookup_unit_table(unit_table, s):
1517 units_re = '|'.join(re.escape(u) for u in unit_table)
1518 m = re.match(
782b1b5b 1519 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
fb47597b
S
1520 if not m:
1521 return None
1522 num_str = m.group('num').replace(',', '.')
1523 mult = unit_table[m.group('unit')]
1524 return int(float(num_str) * mult)
1525
1526
be64b5b0
PH
1527def parse_filesize(s):
1528 if s is None:
1529 return None
1530
dfb1b146 1531 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
1532 # but we support those too
1533 _UNIT_TABLE = {
1534 'B': 1,
1535 'b': 1,
70852b47 1536 'bytes': 1,
be64b5b0
PH
1537 'KiB': 1024,
1538 'KB': 1000,
1539 'kB': 1024,
1540 'Kb': 1000,
13585d76 1541 'kb': 1000,
70852b47
YCH
1542 'kilobytes': 1000,
1543 'kibibytes': 1024,
be64b5b0
PH
1544 'MiB': 1024 ** 2,
1545 'MB': 1000 ** 2,
1546 'mB': 1024 ** 2,
1547 'Mb': 1000 ** 2,
13585d76 1548 'mb': 1000 ** 2,
70852b47
YCH
1549 'megabytes': 1000 ** 2,
1550 'mebibytes': 1024 ** 2,
be64b5b0
PH
1551 'GiB': 1024 ** 3,
1552 'GB': 1000 ** 3,
1553 'gB': 1024 ** 3,
1554 'Gb': 1000 ** 3,
13585d76 1555 'gb': 1000 ** 3,
70852b47
YCH
1556 'gigabytes': 1000 ** 3,
1557 'gibibytes': 1024 ** 3,
be64b5b0
PH
1558 'TiB': 1024 ** 4,
1559 'TB': 1000 ** 4,
1560 'tB': 1024 ** 4,
1561 'Tb': 1000 ** 4,
13585d76 1562 'tb': 1000 ** 4,
70852b47
YCH
1563 'terabytes': 1000 ** 4,
1564 'tebibytes': 1024 ** 4,
be64b5b0
PH
1565 'PiB': 1024 ** 5,
1566 'PB': 1000 ** 5,
1567 'pB': 1024 ** 5,
1568 'Pb': 1000 ** 5,
13585d76 1569 'pb': 1000 ** 5,
70852b47
YCH
1570 'petabytes': 1000 ** 5,
1571 'pebibytes': 1024 ** 5,
be64b5b0
PH
1572 'EiB': 1024 ** 6,
1573 'EB': 1000 ** 6,
1574 'eB': 1024 ** 6,
1575 'Eb': 1000 ** 6,
13585d76 1576 'eb': 1000 ** 6,
70852b47
YCH
1577 'exabytes': 1000 ** 6,
1578 'exbibytes': 1024 ** 6,
be64b5b0
PH
1579 'ZiB': 1024 ** 7,
1580 'ZB': 1000 ** 7,
1581 'zB': 1024 ** 7,
1582 'Zb': 1000 ** 7,
13585d76 1583 'zb': 1000 ** 7,
70852b47
YCH
1584 'zettabytes': 1000 ** 7,
1585 'zebibytes': 1024 ** 7,
be64b5b0
PH
1586 'YiB': 1024 ** 8,
1587 'YB': 1000 ** 8,
1588 'yB': 1024 ** 8,
1589 'Yb': 1000 ** 8,
13585d76 1590 'yb': 1000 ** 8,
70852b47
YCH
1591 'yottabytes': 1000 ** 8,
1592 'yobibytes': 1024 ** 8,
be64b5b0
PH
1593 }
1594
fb47597b
S
1595 return lookup_unit_table(_UNIT_TABLE, s)
1596
1597
1598def parse_count(s):
1599 if s is None:
be64b5b0
PH
1600 return None
1601
fb47597b
S
1602 s = s.strip()
1603
1604 if re.match(r'^[\d,.]+$', s):
1605 return str_to_int(s)
1606
1607 _UNIT_TABLE = {
1608 'k': 1000,
1609 'K': 1000,
1610 'm': 1000 ** 2,
1611 'M': 1000 ** 2,
1612 'kk': 1000 ** 2,
1613 'KK': 1000 ** 2,
1614 }
be64b5b0 1615
fb47597b 1616 return lookup_unit_table(_UNIT_TABLE, s)
be64b5b0 1617
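# Illustrative usage of parse_filesize/parse_count (a sketch):
#   parse_filesize('5 KiB') == 5120
#   parse_filesize('1.2Tb') == 1200000000000
#   parse_count('1.8M')     == 1800000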
2f7ae819 1618
a942d6cb 1619def month_by_name(name, lang='en'):
caefb1de
PH
1620 """ Return the number of a month by (locale-independently) English name """
1621
f6717dec 1622 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 1623
caefb1de 1624 try:
f6717dec 1625 return month_names.index(name) + 1
7105440c
YCH
1626 except ValueError:
1627 return None
1628
1629
1630def month_by_abbreviation(abbrev):
1631 """ Return the number of a month by (locale-independently) English
1632 abbreviations """
1633
1634 try:
1635 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
1636 except ValueError:
1637 return None
18258362
JMF
1638
1639
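# Illustrative usage (a sketch):
#   month_by_name('December')       == 12
#   month_by_name('décembre', 'fr') == 12
#   month_by_abbreviation('Apr')    == 4
#   month_by_abbreviation('apr')    is None   # abbreviations are case-sensitive
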
5aafe895 1640def fix_xml_ampersands(xml_str):
18258362 1641 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
1642 return re.sub(
1643 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 1644 '&amp;',
5aafe895 1645 xml_str)
e3946f98
PH
1646
1647
1648def setproctitle(title):
8bf48f23 1649 assert isinstance(title, compat_str)
c1c05c67
YCH
1650
1651 # ctypes in Jython is not complete
1652 # http://bugs.jython.org/issue2148
1653 if sys.platform.startswith('java'):
1654 return
1655
e3946f98 1656 try:
611c1dd9 1657 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
1658 except OSError:
1659 return
6eefe533
PH
1660 title_bytes = title.encode('utf-8')
1661 buf = ctypes.create_string_buffer(len(title_bytes))
1662 buf.value = title_bytes
e3946f98 1663 try:
6eefe533 1664 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
1665 except AttributeError:
1666 return # Strange libc, just skip this
d7dda168
PH
1667
1668
1669def remove_start(s, start):
46bc9b7d 1670 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
1671
1672
2b9faf55 1673def remove_end(s, end):
46bc9b7d 1674 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
1675
1676
31b2051e
S
1677def remove_quotes(s):
1678 if s is None or len(s) < 2:
1679 return s
1680 for quote in ('"', "'", ):
1681 if s[0] == quote and s[-1] == quote:
1682 return s[1:-1]
1683 return s
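
# Example usage (illustrative):
#     remove_start('www.example.com', 'www.') -> 'example.com'
#     remove_end('report.json', '.json')      -> 'report'
#     remove_quotes('"hello"')                -> 'hello'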
1684
1685
29eb5174 1686def url_basename(url):
9b8aaeed 1687 path = compat_urlparse.urlparse(url).path
28e614de 1688 return path.strip('/').split('/')[-1]
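
# Example (illustrative): query string and fragment are ignored:
#     url_basename('https://example.com/media/video.mp4?dl=1') -> 'video.mp4'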
aa94a6d3
PH
1689
1690
1691class HEADRequest(compat_urllib_request.Request):
1692 def get_method(self):
611c1dd9 1693 return 'HEAD'
7217e148
PH
1694
1695
95cf60e8
S
1696class PUTRequest(compat_urllib_request.Request):
1697 def get_method(self):
1698 return 'PUT'
1699
1700
9732d77e 1701def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
28746fbd
PH
1702 if get_attr:
1703 if v is not None:
1704 v = getattr(v, get_attr, None)
9572013d
PH
1705 if v == '':
1706 v = None
1812afb7
S
1707 if v is None:
1708 return default
1709 try:
1710 return int(v) * invscale // scale
1711 except ValueError:
af98f8ff 1712 return default
9732d77e 1713
9572013d 1714
40a90862
JMF
1715def str_or_none(v, default=None):
1716 return default if v is None else compat_str(v)
1717
9732d77e
PH
1718
1719def str_to_int(int_str):
48d4681e 1720 """ A more relaxed version of int_or_none """
9732d77e
PH
1721 if int_str is None:
1722 return None
28e614de 1723 int_str = re.sub(r'[,\.\+]', '', int_str)
9732d77e 1724 return int(int_str)
608d11f5
PH
1725
1726
9732d77e 1727def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
1728 if v is None:
1729 return default
1730 try:
1731 return float(v) * invscale / scale
1732 except ValueError:
1733 return default
43f775e4
PH
1734
1735
b72b4431
S
1736def strip_or_none(v):
1737 return None if v is None else v.strip()
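
# Example usage of the coercion helpers above (illustrative):
#     int_or_none('42')                    -> 42
#     int_or_none('', default=0)           -> 0
#     str_to_int('1,234')                  -> 1234
#     float_or_none('2.5', invscale=1000)  -> 2500.0
#     strip_or_none('  x ')                -> 'x'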
1738
1739
608d11f5 1740def parse_duration(s):
8f9312c3 1741 if not isinstance(s, compat_basestring):
608d11f5
PH
1742 return None
1743
ca7b3246
S
1744 s = s.strip()
1745
acaff495 1746 days, hours, mins, secs, ms = [None] * 5
1747 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
1748 if m:
1749 days, hours, mins, secs, ms = m.groups()
1750 else:
1751 m = re.match(
1752 r'''(?ix)(?:P?T)?
8f4b58d7 1753 (?:
acaff495 1754 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
8f4b58d7 1755 )?
acaff495 1756 (?:
1757 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1758 )?
1759 (?:
1760 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1761 )?
1762 (?:
1763 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1764 )?$''', s)
1765 if m:
1766 days, hours, mins, secs, ms = m.groups()
1767 else:
1768 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
1769 if m:
1770 hours, mins = m.groups()
1771 else:
1772 return None
1773
1774 duration = 0
1775 if secs:
1776 duration += float(secs)
1777 if mins:
1778 duration += float(mins) * 60
1779 if hours:
1780 duration += float(hours) * 60 * 60
1781 if days:
1782 duration += float(days) * 24 * 60 * 60
1783 if ms:
1784 duration += float(ms)
1785 return duration
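
# Example usage (illustrative):
#     parse_duration('1:30')        -> 90.0
#     parse_duration('2h 30m')      -> 9000.0
#     parse_duration('PT1M33S')     -> 93.0
#     parse_duration('01:02:03.05') -> 3723.05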
91d7d0b3
JMF
1786
1787
e65e4c88 1788def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 1789 name, real_ext = os.path.splitext(filename)
e65e4c88
S
1790 return (
1791 '{0}.{1}{2}'.format(name, ext, real_ext)
1792 if not expected_real_ext or real_ext[1:] == expected_real_ext
1793 else '{0}.{1}'.format(filename, ext))
d70ad093
PH
1794
1795
b3ed15b7
S
1796def replace_extension(filename, ext, expected_real_ext=None):
1797 name, real_ext = os.path.splitext(filename)
1798 return '{0}.{1}'.format(
1799 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1800 ext)
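
# Example usage (illustrative):
#     prepend_extension('video.mp4', 'temp')          -> 'video.temp.mp4'
#     prepend_extension('video.mp4', 'temp', 'webm')  -> 'video.mp4.temp'
#     replace_extension('video.mp4', 'jpg')           -> 'video.jpg'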
1801
1802
d70ad093
PH
1803def check_executable(exe, args=[]):
1804 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1805 args can be a list of arguments for a short output (like -version) """
1806 try:
1807 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1808 except OSError:
1809 return False
1810 return exe
b7ab0590
PH
1811
1812
95807118 1813def get_exe_version(exe, args=['--version'],
cae97f65 1814 version_re=None, unrecognized='present'):
95807118
PH
1815 """ Returns the version of the specified executable,
1816 or False if the executable is not present """
1817 try:
cae97f65 1818 out, _ = subprocess.Popen(
54116803 1819 [encodeArgument(exe)] + args,
95807118
PH
1820 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1821 except OSError:
1822 return False
cae97f65
PH
1823 if isinstance(out, bytes): # Python 2.x
1824 out = out.decode('ascii', 'ignore')
1825 return detect_exe_version(out, version_re, unrecognized)
1826
1827
1828def detect_exe_version(output, version_re=None, unrecognized='present'):
1829 assert isinstance(output, compat_str)
1830 if version_re is None:
1831 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1832 m = re.search(version_re, output)
95807118
PH
1833 if m:
1834 return m.group(1)
1835 else:
1836 return unrecognized
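
# Example (illustrative): with the default version_re the first "version ..."
# token is extracted; otherwise the 'unrecognized' placeholder is returned:
#     detect_exe_version('foo version 1.2.3 (c) 2016') -> '1.2.3'
#     detect_exe_version('unexpected output')           -> 'present'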
1837
1838
b7ab0590 1839class PagedList(object):
dd26ced1
PH
1840 def __len__(self):
1841 # This is only useful for tests
1842 return len(self.getslice())
1843
9c44d242
PH
1844
1845class OnDemandPagedList(PagedList):
b95dc034 1846 def __init__(self, pagefunc, pagesize, use_cache=False):
9c44d242
PH
1847 self._pagefunc = pagefunc
1848 self._pagesize = pagesize
b95dc034
YCH
1849 self._use_cache = use_cache
1850 if use_cache:
1851 self._cache = {}
9c44d242 1852
b7ab0590
PH
1853 def getslice(self, start=0, end=None):
1854 res = []
1855 for pagenum in itertools.count(start // self._pagesize):
1856 firstid = pagenum * self._pagesize
1857 nextfirstid = pagenum * self._pagesize + self._pagesize
1858 if start >= nextfirstid:
1859 continue
1860
b95dc034
YCH
1861 page_results = None
1862 if self._use_cache:
1863 page_results = self._cache.get(pagenum)
1864 if page_results is None:
1865 page_results = list(self._pagefunc(pagenum))
1866 if self._use_cache:
1867 self._cache[pagenum] = page_results
b7ab0590
PH
1868
1869 startv = (
1870 start % self._pagesize
1871 if firstid <= start < nextfirstid
1872 else 0)
1873
1874 endv = (
1875 ((end - 1) % self._pagesize) + 1
1876 if (end is not None and firstid <= end <= nextfirstid)
1877 else None)
1878
1879 if startv != 0 or endv is not None:
1880 page_results = page_results[startv:endv]
1881 res.extend(page_results)
1882
1883 # A little optimization - if the current page is not "full", i.e. does
1884 # not contain page_size videos, then we can assume that this page
1885 # is the last one - there are no more ids on further pages -
1886 # so there is no need to query again.
1887 if len(page_results) + startv < self._pagesize:
1888 break
1889
1890 # If we got the whole page, but the next page is not interesting,
1891 # break out early as well
1892 if end == nextfirstid:
1893 break
1894 return res
81c2f20b
PH
1895
1896
9c44d242
PH
1897class InAdvancePagedList(PagedList):
1898 def __init__(self, pagefunc, pagecount, pagesize):
1899 self._pagefunc = pagefunc
1900 self._pagecount = pagecount
1901 self._pagesize = pagesize
1902
1903 def getslice(self, start=0, end=None):
1904 res = []
1905 start_page = start // self._pagesize
1906 end_page = (
1907 self._pagecount if end is None else (end // self._pagesize + 1))
1908 skip_elems = start - start_page * self._pagesize
1909 only_more = None if end is None else end - start
1910 for pagenum in range(start_page, end_page):
1911 page = list(self._pagefunc(pagenum))
1912 if skip_elems:
1913 page = page[skip_elems:]
1914 skip_elems = None
1915 if only_more is not None:
1916 if len(page) < only_more:
1917 only_more -= len(page)
1918 else:
1919 page = page[:only_more]
1920 res.extend(page)
1921 break
1922 res.extend(page)
1923 return res
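
# Example (illustrative): pages are fetched lazily and only as far as needed;
# for this slice only pages 0 and 1 are requested:
#     OnDemandPagedList(lambda n: list(range(n * 10, (n + 1) * 10)), 10).getslice(5, 15)
#     -> [5, 6, 7, 8, 9, 10, 11, 12, 13, 14]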
1924
1925
81c2f20b 1926def uppercase_escape(s):
676eb3f2 1927 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 1928 return re.sub(
a612753d 1929 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
1930 lambda m: unicode_escape(m.group(0))[0],
1931 s)
0fe2ff78
YCH
1932
1933
1934def lowercase_escape(s):
1935 unicode_escape = codecs.getdecoder('unicode_escape')
1936 return re.sub(
1937 r'\\u[0-9a-fA-F]{4}',
1938 lambda m: unicode_escape(m.group(0))[0],
1939 s)
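
# Example usage (illustrative):
#     uppercase_escape('\\U0001F600') -> the single character U+1F600
#     lowercase_escape('caf\\u00e9')  -> 'café'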
b53466e1 1940
d05cfe06
S
1941
1942def escape_rfc3986(s):
1943 """Escape non-ASCII characters as suggested by RFC 3986"""
8f9312c3 1944 if sys.version_info < (3, 0) and isinstance(s, compat_str):
d05cfe06 1945 s = s.encode('utf-8')
ecc0c5ee 1946 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1947
1948
1949def escape_url(url):
1950 """Escape URL as suggested by RFC 3986"""
1951 url_parsed = compat_urllib_parse_urlparse(url)
1952 return url_parsed._replace(
efbed08d 1953 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
1954 path=escape_rfc3986(url_parsed.path),
1955 params=escape_rfc3986(url_parsed.params),
1956 query=escape_rfc3986(url_parsed.query),
1957 fragment=escape_rfc3986(url_parsed.fragment)
1958 ).geturl()
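
# Example (illustrative): non-ASCII path/query characters are percent-encoded
# and the hostname is IDNA-encoded:
#     escape_url('https://example.com/pâth?q=café') -> 'https://example.com/p%C3%A2th?q=caf%C3%A9'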
1959
62e609ab
PH
1960
1961def read_batch_urls(batch_fd):
1962 def fixup(url):
1963 if not isinstance(url, compat_str):
1964 url = url.decode('utf-8', 'replace')
28e614de 1965 BOM_UTF8 = '\xef\xbb\xbf'
62e609ab
PH
1966 if url.startswith(BOM_UTF8):
1967 url = url[len(BOM_UTF8):]
1968 url = url.strip()
1969 if url.startswith(('#', ';', ']')):
1970 return False
1971 return url
1972
1973 with contextlib.closing(batch_fd) as fd:
1974 return [url for url in map(fixup, fd) if url]
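
# Example (illustrative): comment lines and blank lines are dropped:
#     read_batch_urls(io.StringIO('# list\nhttps://example.com/a\n\nhttps://example.com/b'))
#     -> ['https://example.com/a', 'https://example.com/b']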
b74fa8cd
JMF
1975
1976
1977def urlencode_postdata(*args, **kargs):
15707c7e 1978 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
1979
1980
38f9ef31 1981def update_url_query(url, query):
cacd9966
YCH
1982 if not query:
1983 return url
38f9ef31 1984 parsed_url = compat_urlparse.urlparse(url)
1985 qs = compat_parse_qs(parsed_url.query)
1986 qs.update(query)
1987 return compat_urlparse.urlunparse(parsed_url._replace(
15707c7e 1988 query=compat_urllib_parse_urlencode(qs, True)))
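
# Example (illustrative; parameter order in the result may vary):
#     update_url_query('https://example.com/api?a=1&b=2', {'b': '3'})
#     -> 'https://example.com/api?a=1&b=3'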
16392824 1989
8e60dc75 1990
ed0291d1
S
1991def update_Request(req, url=None, data=None, headers={}, query={}):
1992 req_headers = req.headers.copy()
1993 req_headers.update(headers)
1994 req_data = data or req.data
1995 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
1996 req_get_method = req.get_method()
1997 if req_get_method == 'HEAD':
1998 req_type = HEADRequest
1999 elif req_get_method == 'PUT':
2000 req_type = PUTRequest
2001 else:
2002 req_type = compat_urllib_request.Request
ed0291d1
S
2003 new_req = req_type(
2004 req_url, data=req_data, headers=req_headers,
2005 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2006 if hasattr(req, 'timeout'):
2007 new_req.timeout = req.timeout
2008 return new_req
2009
2010
86296ad2 2011def dict_get(d, key_or_keys, default=None, skip_false_values=True):
cbecc9b9
S
2012 if isinstance(key_or_keys, (list, tuple)):
2013 for key in key_or_keys:
86296ad2
S
2014 if key not in d or d[key] is None or skip_false_values and not d[key]:
2015 continue
2016 return d[key]
cbecc9b9
S
2017 return default
2018 return d.get(key_or_keys, default)
2019
2020
329ca3be
S
2021def try_get(src, getter, expected_type=None):
2022 try:
2023 v = getter(src)
2024 except (AttributeError, KeyError, TypeError, IndexError):
2025 pass
2026 else:
2027 if expected_type is None or isinstance(v, expected_type):
2028 return v
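
# Example usage (illustrative):
#     dict_get({'view_count': 0, 'likes': 42}, ('view_count', 'likes'))  -> 42
#     dict_get({'view_count': 0}, 'view_count', skip_false_values=False) -> 0
#     try_get({'a': [{'b': 'c'}]}, lambda x: x['a'][0]['b'], compat_str) -> 'c'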
2029
2030
8e60dc75
S
2031def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2032 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2033
16392824 2034
a1a530b0
PH
2035US_RATINGS = {
2036 'G': 0,
2037 'PG': 10,
2038 'PG-13': 13,
2039 'R': 16,
2040 'NC': 18,
2041}
fac55558
PH
2042
2043
a8795327
S
2044TV_PARENTAL_GUIDELINES = {
2045 'TV-Y': 0,
2046 'TV-Y7': 7,
2047 'TV-G': 0,
2048 'TV-PG': 0,
2049 'TV-14': 14,
2050 'TV-MA': 17,
2051}
2052
2053
146c80e2 2054def parse_age_limit(s):
a8795327
S
2055 if type(s) == int:
2056 return s if 0 <= s <= 21 else None
2057 if not isinstance(s, compat_basestring):
d838b1bd 2058 return None
146c80e2 2059 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
2060 if m:
2061 return int(m.group('age'))
2062 if s in US_RATINGS:
2063 return US_RATINGS[s]
2064 return TV_PARENTAL_GUIDELINES.get(s)
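
# Example usage (illustrative):
#     parse_age_limit('PG-13') -> 13
#     parse_age_limit('18+')   -> 18
#     parse_age_limit('TV-MA') -> 17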
146c80e2
S
2065
2066
fac55558 2067def strip_jsonp(code):
609a61e3 2068 return re.sub(
5950cb1d 2069 r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
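
# Example (illustrative):
#     strip_jsonp('callback123({"status": "ok"});') -> '{"status": "ok"}'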
478c2c61
PH
2070
2071
e05f6939
PH
2072def js_to_json(code):
2073 def fix_kv(m):
e7b6d122
PH
2074 v = m.group(0)
2075 if v in ('true', 'false', 'null'):
2076 return v
bd1e4844 2077 elif v.startswith('/*') or v == ',':
2078 return ""
2079
2080 if v[0] in ("'", '"'):
2081 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 2082 '"': '\\"',
bd1e4844 2083 "\\'": "'",
2084 '\\\n': '',
2085 '\\x': '\\u00',
2086 }.get(m.group(0), m.group(0)), v[1:-1])
2087
89ac4a19 2088 INTEGER_TABLE = (
e4659b45
YCH
2089 (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
2090 (r'^(0+[0-7]+)\s*:?$', 8),
89ac4a19
S
2091 )
2092
2093 for regex, base in INTEGER_TABLE:
2094 im = re.match(regex, v)
2095 if im:
e4659b45 2096 i = int(im.group(1), base)
89ac4a19
S
2097 return '"%d":' % i if v.endswith(':') else '%d' % i
2098
e7b6d122 2099 return '"%s"' % v
e05f6939 2100
bd1e4844 2101 return re.sub(r'''(?sx)
2102 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2103 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
2104 /\*.*?\*/|,(?=\s*[\]}])|
2105 [a-zA-Z_][.a-zA-Z_0-9]*|
47212f7b 2106 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
bd1e4844 2107 [0-9]+(?=\s*:)
e05f6939 2108 ''', fix_kv, code)
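
# Example (illustrative): unquoted keys get quoted, single quotes become double
# quotes, hex literals are converted and trailing commas are dropped, so the
# result can be fed to json.loads():
#     js_to_json("{foo: 'bar', baz: 0x1F, qux: true,}")
#     -> '{"foo": "bar", "baz": 31, "qux": true}'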
e05f6939
PH
2109
2110
478c2c61
PH
2111def qualities(quality_ids):
2112 """ Get a numeric quality value out of a list of possible values """
2113 def q(qid):
2114 try:
2115 return quality_ids.index(qid)
2116 except ValueError:
2117 return -1
2118 return q
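
# Example usage (illustrative):
#     q = qualities(['240p', '480p', '720p'])
#     q('720p')  -> 2
#     q('1080p') -> -1 (unknown qualities sort lowest)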
2119
acd69589
PH
2120
2121DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 2122
a020a0dc
PH
2123
2124def limit_length(s, length):
2125 """ Add ellipses to overly long strings """
2126 if s is None:
2127 return None
2128 ELLIPSES = '...'
2129 if len(s) > length:
2130 return s[:length - len(ELLIPSES)] + ELLIPSES
2131 return s
48844745
PH
2132
2133
2134def version_tuple(v):
5f9b8394 2135 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
2136
2137
2138def is_outdated_version(version, limit, assume_new=True):
2139 if not version:
2140 return not assume_new
2141 try:
2142 return version_tuple(version) < version_tuple(limit)
2143 except ValueError:
2144 return not assume_new
732ea2f0
PH
2145
2146
2147def ytdl_is_updateable():
2148 """ Returns True if youtube-dl can be updated with -U """
2149 from zipimport import zipimporter
2150
2151 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
7d4111ed
PH
2152
2153
2154def args_to_str(args):
2155 # Get a short string representation for a subprocess command
702ccf2d 2156 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
2157
2158
9b9c5355 2159def error_to_compat_str(err):
fdae2358
S
2160 err_str = str(err)
2161 # On python 2 error byte string must be decoded with proper
2162 # encoding rather than ascii
2163 if sys.version_info[0] < 3:
2164 err_str = err_str.decode(preferredencoding())
2165 return err_str
2166
2167
c460bdd5 2168def mimetype2ext(mt):
eb9ee194
S
2169 if mt is None:
2170 return None
2171
765ac263
JMF
2172 ext = {
2173 'audio/mp4': 'm4a',
6c33d24b
YCH
2174 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2175 # it's the most popular one
2176 'audio/mpeg': 'mp3',
765ac263
JMF
2177 }.get(mt)
2178 if ext is not None:
2179 return ext
2180
c460bdd5 2181 _, _, res = mt.rpartition('/')
6562d34a 2182 res = res.split(';')[0].strip().lower()
c460bdd5
PH
2183
2184 return {
f6861ec9 2185 '3gpp': '3gp',
cafcf657 2186 'smptett+xml': 'tt',
2187 'srt': 'srt',
2188 'ttaf+xml': 'dfxp',
a0d8d704 2189 'ttml+xml': 'ttml',
cafcf657 2190 'vtt': 'vtt',
f6861ec9 2191 'x-flv': 'flv',
a0d8d704
YCH
2192 'x-mp4-fragmented': 'mp4',
2193 'x-ms-wmv': 'wmv',
b4173f15
RA
2194 'mpegurl': 'm3u8',
2195 'x-mpegurl': 'm3u8',
2196 'vnd.apple.mpegurl': 'm3u8',
2197 'dash+xml': 'mpd',
2198 'f4m': 'f4m',
2199 'f4m+xml': 'f4m',
f164b971 2200 'hds+xml': 'f4m',
e910fe2f 2201 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 2202 'quicktime': 'mov',
c460bdd5
PH
2203 }.get(res, res)
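
# Example usage (illustrative):
#     mimetype2ext('audio/mp4')                -> 'm4a'
#     mimetype2ext('application/x-mpegURL')    -> 'm3u8'
#     mimetype2ext('text/vtt; charset=UTF-8')  -> 'vtt'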
2204
2205
4f3c5e06 2206def parse_codecs(codecs_str):
2207 # http://tools.ietf.org/html/rfc6381
2208 if not codecs_str:
2209 return {}
2210 split_codecs = list(filter(None, map(
2211 lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
2212 vcodec, acodec = None, None
2213 for full_codec in split_codecs:
2214 codec = full_codec.split('.')[0]
2215 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
2216 if not vcodec:
2217 vcodec = full_codec
073ac122 2218 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
4f3c5e06 2219 if not acodec:
2220 acodec = full_codec
2221 else:
2222 write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
2223 if not vcodec and not acodec:
2224 if len(split_codecs) == 2:
2225 return {
2226 'vcodec': vcodec,
2227 'acodec': acodec,
2228 }
2229 elif len(split_codecs) == 1:
2230 return {
2231 'vcodec': 'none',
2232 'acodec': vcodec,
2233 }
2234 else:
2235 return {
2236 'vcodec': vcodec or 'none',
2237 'acodec': acodec or 'none',
2238 }
2239 return {}
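
# Example usage (illustrative):
#     parse_codecs('avc1.64001f, mp4a.40.2') -> {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2'}
#     parse_codecs('opus')                   -> {'vcodec': 'none', 'acodec': 'opus'}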
2240
2241
2ccd1b10 2242def urlhandle_detect_ext(url_handle):
79298173 2243 getheader = url_handle.headers.get
2ccd1b10 2244
b55ee18f
PH
2245 cd = getheader('Content-Disposition')
2246 if cd:
2247 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2248 if m:
2249 e = determine_ext(m.group('filename'), default_ext=None)
2250 if e:
2251 return e
2252
c460bdd5 2253 return mimetype2ext(getheader('Content-Type'))
05900629
PH
2254
2255
1e399778
YCH
2256def encode_data_uri(data, mime_type):
2257 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
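
# Example (illustrative):
#     encode_data_uri(b'hi', 'text/plain') -> 'data:text/plain;base64,aGk='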
2258
2259
05900629 2260def age_restricted(content_limit, age_limit):
6ec6cb4e 2261 """ Returns True iff the content should be blocked """
05900629
PH
2262
2263 if age_limit is None: # No limit set
2264 return False
2265 if content_limit is None:
2266 return False # Content available for everyone
2267 return age_limit < content_limit
61ca9a80
PH
2268
2269
2270def is_html(first_bytes):
2271 """ Detect whether a file contains HTML by examining its first bytes. """
2272
2273 BOMS = [
2274 (b'\xef\xbb\xbf', 'utf-8'),
2275 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2276 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2277 (b'\xff\xfe', 'utf-16-le'),
2278 (b'\xfe\xff', 'utf-16-be'),
2279 ]
2280 for bom, enc in BOMS:
2281 if first_bytes.startswith(bom):
2282 s = first_bytes[len(bom):].decode(enc, 'replace')
2283 break
2284 else:
2285 s = first_bytes.decode('utf-8', 'replace')
2286
2287 return re.match(r'^\s*<', s)
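
# Example (illustrative):
#     is_html(b'<!DOCTYPE html><html></html>') -> a truthy match object
#     is_html(b'\x89PNG\r\n')                  -> None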
a055469f
PH
2288
2289
2290def determine_protocol(info_dict):
2291 protocol = info_dict.get('protocol')
2292 if protocol is not None:
2293 return protocol
2294
2295 url = info_dict['url']
2296 if url.startswith('rtmp'):
2297 return 'rtmp'
2298 elif url.startswith('mms'):
2299 return 'mms'
2300 elif url.startswith('rtsp'):
2301 return 'rtsp'
2302
2303 ext = determine_ext(url)
2304 if ext == 'm3u8':
2305 return 'm3u8'
2306 elif ext == 'f4m':
2307 return 'f4m'
2308
2309 return compat_urllib_parse_urlparse(url).scheme
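
# Example usage (illustrative):
#     determine_protocol({'url': 'rtmp://host/app'})                 -> 'rtmp'
#     determine_protocol({'url': 'https://cdn.example.com/a.m3u8'})  -> 'm3u8'
#     determine_protocol({'url': 'https://cdn.example.com/v.mp4'})   -> 'https'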
cfb56d1a
PH
2310
2311
2312def render_table(header_row, data):
2313 """ Render a list of rows, each as a list of values """
2314 table = [header_row] + data
2315 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
2316 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
2317 return '\n'.join(format_str % tuple(row) for row in table)
347de493
PH
2318
2319
2320def _match_one(filter_part, dct):
2321 COMPARISON_OPERATORS = {
2322 '<': operator.lt,
2323 '<=': operator.le,
2324 '>': operator.gt,
2325 '>=': operator.ge,
2326 '=': operator.eq,
2327 '!=': operator.ne,
2328 }
2329 operator_rex = re.compile(r'''(?x)\s*
2330 (?P<key>[a-z_]+)
2331 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2332 (?:
2333 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2334 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2335 )
2336 \s*$
2337 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2338 m = operator_rex.search(filter_part)
2339 if m:
2340 op = COMPARISON_OPERATORS[m.group('op')]
2341 if m.group('strval') is not None:
2342 if m.group('op') not in ('=', '!='):
2343 raise ValueError(
2344 'Operator %s does not support string values!' % m.group('op'))
2345 comparison_value = m.group('strval')
2346 else:
2347 try:
2348 comparison_value = int(m.group('intval'))
2349 except ValueError:
2350 comparison_value = parse_filesize(m.group('intval'))
2351 if comparison_value is None:
2352 comparison_value = parse_filesize(m.group('intval') + 'B')
2353 if comparison_value is None:
2354 raise ValueError(
2355 'Invalid integer value %r in filter part %r' % (
2356 m.group('intval'), filter_part))
2357 actual_value = dct.get(m.group('key'))
2358 if actual_value is None:
2359 return m.group('none_inclusive')
2360 return op(actual_value, comparison_value)
2361
2362 UNARY_OPERATORS = {
2363 '': lambda v: v is not None,
2364 '!': lambda v: v is None,
2365 }
2366 operator_rex = re.compile(r'''(?x)\s*
2367 (?P<op>%s)\s*(?P<key>[a-z_]+)
2368 \s*$
2369 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2370 m = operator_rex.search(filter_part)
2371 if m:
2372 op = UNARY_OPERATORS[m.group('op')]
2373 actual_value = dct.get(m.group('key'))
2374 return op(actual_value)
2375
2376 raise ValueError('Invalid filter part %r' % filter_part)
2377
2378
2379def match_str(filter_str, dct):
2380 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or False """
2381
2382 return all(
2383 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2384
2385
2386def match_filter_func(filter_str):
2387 def _match_func(info_dict):
2388 if match_str(filter_str, info_dict):
2389 return None
2390 else:
2391 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2392 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2393 return _match_func
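
# Example usage (illustrative):
#     match_str('duration > 300 & like_count >= 100',
#               {'duration': 420, 'like_count': 150})  -> True
#     match_str('!is_live', {'is_live': True})          -> False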
91410c9b
PH
2394
2395
bf6427d2
YCH
2396def parse_dfxp_time_expr(time_expr):
2397 if not time_expr:
d631d5f9 2398 return
bf6427d2
YCH
2399
2400 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2401 if mobj:
2402 return float(mobj.group('time_offset'))
2403
db2fe38b 2404 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 2405 if mobj:
db2fe38b 2406 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
2407
2408
c1c924ab
YCH
2409def srt_subtitles_timecode(seconds):
2410 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
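
# Example usage (illustrative):
#     parse_dfxp_time_expr('5.2s')        -> 5.2
#     parse_dfxp_time_expr('00:01:30.5')  -> 90.5
#     srt_subtitles_timecode(90.5)        -> '00:01:30,500'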
bf6427d2
YCH
2411
2412
2413def dfxp2srt(dfxp_data):
4e335771
YCH
2414 _x = functools.partial(xpath_with_ns, ns_map={
2415 'ttml': 'http://www.w3.org/ns/ttml',
2416 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
5bf28d78 2417 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
4e335771 2418 })
bf6427d2 2419
87de7069 2420 class TTMLPElementParser(object):
2b14cb56 2421 out = ''
bf6427d2 2422
2b14cb56 2423 def start(self, tag, attrib):
2424 if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2425 self.out += '\n'
bf6427d2 2426
2b14cb56 2427 def end(self, tag):
2428 pass
bf6427d2 2429
2b14cb56 2430 def data(self, data):
2431 self.out += data
2432
2433 def close(self):
2434 return self.out.strip()
2435
2436 def parse_node(node):
2437 target = TTMLPElementParser()
2438 parser = xml.etree.ElementTree.XMLParser(target=target)
2439 parser.feed(xml.etree.ElementTree.tostring(node))
2440 return parser.close()
bf6427d2 2441
36e6f62c 2442 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
bf6427d2 2443 out = []
5bf28d78 2444 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
1b0427e6
YCH
2445
2446 if not paras:
2447 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2
YCH
2448
2449 for para, index in zip(paras, itertools.count(1)):
d631d5f9 2450 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 2451 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
2452 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2453 if begin_time is None:
2454 continue
7dff0363 2455 if not end_time:
d631d5f9
YCH
2456 if not dur:
2457 continue
2458 end_time = begin_time + dur
bf6427d2
YCH
2459 out.append('%d\n%s --> %s\n%s\n\n' % (
2460 index,
c1c924ab
YCH
2461 srt_subtitles_timecode(begin_time),
2462 srt_subtitles_timecode(end_time),
bf6427d2
YCH
2463 parse_node(para)))
2464
2465 return ''.join(out)
2466
2467
66e289ba
S
2468def cli_option(params, command_option, param):
2469 param = params.get(param)
98e698f1
RA
2470 if param:
2471 param = compat_str(param)
66e289ba
S
2472 return [command_option, param] if param is not None else []
2473
2474
2475def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2476 param = params.get(param)
2477 assert isinstance(param, bool)
2478 if separator:
2479 return [command_option + separator + (true_value if param else false_value)]
2480 return [command_option, true_value if param else false_value]
2481
2482
2483def cli_valueless_option(params, command_option, param, expected_value=True):
2484 param = params.get(param)
2485 return [command_option] if param == expected_value else []
2486
2487
2488def cli_configuration_args(params, param, default=[]):
2489 ex_args = params.get(param)
2490 if ex_args is None:
2491 return default
2492 assert isinstance(ex_args, list)
2493 return ex_args
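
# Example usage of the cli_* helpers above (illustrative):
#     cli_option({'proxy': 'socks5://127.0.0.1:1080'}, '--proxy', 'proxy')
#     -> ['--proxy', 'socks5://127.0.0.1:1080']
#     cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate')
#     -> ['--no-check-certificate', 'true']
#     cli_valueless_option({'quiet': True}, '--quiet', 'quiet') -> ['--quiet']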
2494
2495
39672624
YCH
2496class ISO639Utils(object):
2497 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2498 _lang_map = {
2499 'aa': 'aar',
2500 'ab': 'abk',
2501 'ae': 'ave',
2502 'af': 'afr',
2503 'ak': 'aka',
2504 'am': 'amh',
2505 'an': 'arg',
2506 'ar': 'ara',
2507 'as': 'asm',
2508 'av': 'ava',
2509 'ay': 'aym',
2510 'az': 'aze',
2511 'ba': 'bak',
2512 'be': 'bel',
2513 'bg': 'bul',
2514 'bh': 'bih',
2515 'bi': 'bis',
2516 'bm': 'bam',
2517 'bn': 'ben',
2518 'bo': 'bod',
2519 'br': 'bre',
2520 'bs': 'bos',
2521 'ca': 'cat',
2522 'ce': 'che',
2523 'ch': 'cha',
2524 'co': 'cos',
2525 'cr': 'cre',
2526 'cs': 'ces',
2527 'cu': 'chu',
2528 'cv': 'chv',
2529 'cy': 'cym',
2530 'da': 'dan',
2531 'de': 'deu',
2532 'dv': 'div',
2533 'dz': 'dzo',
2534 'ee': 'ewe',
2535 'el': 'ell',
2536 'en': 'eng',
2537 'eo': 'epo',
2538 'es': 'spa',
2539 'et': 'est',
2540 'eu': 'eus',
2541 'fa': 'fas',
2542 'ff': 'ful',
2543 'fi': 'fin',
2544 'fj': 'fij',
2545 'fo': 'fao',
2546 'fr': 'fra',
2547 'fy': 'fry',
2548 'ga': 'gle',
2549 'gd': 'gla',
2550 'gl': 'glg',
2551 'gn': 'grn',
2552 'gu': 'guj',
2553 'gv': 'glv',
2554 'ha': 'hau',
2555 'he': 'heb',
2556 'hi': 'hin',
2557 'ho': 'hmo',
2558 'hr': 'hrv',
2559 'ht': 'hat',
2560 'hu': 'hun',
2561 'hy': 'hye',
2562 'hz': 'her',
2563 'ia': 'ina',
2564 'id': 'ind',
2565 'ie': 'ile',
2566 'ig': 'ibo',
2567 'ii': 'iii',
2568 'ik': 'ipk',
2569 'io': 'ido',
2570 'is': 'isl',
2571 'it': 'ita',
2572 'iu': 'iku',
2573 'ja': 'jpn',
2574 'jv': 'jav',
2575 'ka': 'kat',
2576 'kg': 'kon',
2577 'ki': 'kik',
2578 'kj': 'kua',
2579 'kk': 'kaz',
2580 'kl': 'kal',
2581 'km': 'khm',
2582 'kn': 'kan',
2583 'ko': 'kor',
2584 'kr': 'kau',
2585 'ks': 'kas',
2586 'ku': 'kur',
2587 'kv': 'kom',
2588 'kw': 'cor',
2589 'ky': 'kir',
2590 'la': 'lat',
2591 'lb': 'ltz',
2592 'lg': 'lug',
2593 'li': 'lim',
2594 'ln': 'lin',
2595 'lo': 'lao',
2596 'lt': 'lit',
2597 'lu': 'lub',
2598 'lv': 'lav',
2599 'mg': 'mlg',
2600 'mh': 'mah',
2601 'mi': 'mri',
2602 'mk': 'mkd',
2603 'ml': 'mal',
2604 'mn': 'mon',
2605 'mr': 'mar',
2606 'ms': 'msa',
2607 'mt': 'mlt',
2608 'my': 'mya',
2609 'na': 'nau',
2610 'nb': 'nob',
2611 'nd': 'nde',
2612 'ne': 'nep',
2613 'ng': 'ndo',
2614 'nl': 'nld',
2615 'nn': 'nno',
2616 'no': 'nor',
2617 'nr': 'nbl',
2618 'nv': 'nav',
2619 'ny': 'nya',
2620 'oc': 'oci',
2621 'oj': 'oji',
2622 'om': 'orm',
2623 'or': 'ori',
2624 'os': 'oss',
2625 'pa': 'pan',
2626 'pi': 'pli',
2627 'pl': 'pol',
2628 'ps': 'pus',
2629 'pt': 'por',
2630 'qu': 'que',
2631 'rm': 'roh',
2632 'rn': 'run',
2633 'ro': 'ron',
2634 'ru': 'rus',
2635 'rw': 'kin',
2636 'sa': 'san',
2637 'sc': 'srd',
2638 'sd': 'snd',
2639 'se': 'sme',
2640 'sg': 'sag',
2641 'si': 'sin',
2642 'sk': 'slk',
2643 'sl': 'slv',
2644 'sm': 'smo',
2645 'sn': 'sna',
2646 'so': 'som',
2647 'sq': 'sqi',
2648 'sr': 'srp',
2649 'ss': 'ssw',
2650 'st': 'sot',
2651 'su': 'sun',
2652 'sv': 'swe',
2653 'sw': 'swa',
2654 'ta': 'tam',
2655 'te': 'tel',
2656 'tg': 'tgk',
2657 'th': 'tha',
2658 'ti': 'tir',
2659 'tk': 'tuk',
2660 'tl': 'tgl',
2661 'tn': 'tsn',
2662 'to': 'ton',
2663 'tr': 'tur',
2664 'ts': 'tso',
2665 'tt': 'tat',
2666 'tw': 'twi',
2667 'ty': 'tah',
2668 'ug': 'uig',
2669 'uk': 'ukr',
2670 'ur': 'urd',
2671 'uz': 'uzb',
2672 've': 'ven',
2673 'vi': 'vie',
2674 'vo': 'vol',
2675 'wa': 'wln',
2676 'wo': 'wol',
2677 'xh': 'xho',
2678 'yi': 'yid',
2679 'yo': 'yor',
2680 'za': 'zha',
2681 'zh': 'zho',
2682 'zu': 'zul',
2683 }
2684
2685 @classmethod
2686 def short2long(cls, code):
2687 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2688 return cls._lang_map.get(code[:2])
2689
2690 @classmethod
2691 def long2short(cls, code):
2692 """Convert language code from ISO 639-2/T to ISO 639-1"""
2693 for short_name, long_name in cls._lang_map.items():
2694 if long_name == code:
2695 return short_name
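
# Example usage (illustrative):
#     ISO639Utils.short2long('de')  -> 'deu'
#     ISO639Utils.long2short('fra') -> 'fr'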
2696
2697
4eb10f66
YCH
2698class ISO3166Utils(object):
2699 # From http://data.okfn.org/data/core/country-list
2700 _country_map = {
2701 'AF': 'Afghanistan',
2702 'AX': 'Åland Islands',
2703 'AL': 'Albania',
2704 'DZ': 'Algeria',
2705 'AS': 'American Samoa',
2706 'AD': 'Andorra',
2707 'AO': 'Angola',
2708 'AI': 'Anguilla',
2709 'AQ': 'Antarctica',
2710 'AG': 'Antigua and Barbuda',
2711 'AR': 'Argentina',
2712 'AM': 'Armenia',
2713 'AW': 'Aruba',
2714 'AU': 'Australia',
2715 'AT': 'Austria',
2716 'AZ': 'Azerbaijan',
2717 'BS': 'Bahamas',
2718 'BH': 'Bahrain',
2719 'BD': 'Bangladesh',
2720 'BB': 'Barbados',
2721 'BY': 'Belarus',
2722 'BE': 'Belgium',
2723 'BZ': 'Belize',
2724 'BJ': 'Benin',
2725 'BM': 'Bermuda',
2726 'BT': 'Bhutan',
2727 'BO': 'Bolivia, Plurinational State of',
2728 'BQ': 'Bonaire, Sint Eustatius and Saba',
2729 'BA': 'Bosnia and Herzegovina',
2730 'BW': 'Botswana',
2731 'BV': 'Bouvet Island',
2732 'BR': 'Brazil',
2733 'IO': 'British Indian Ocean Territory',
2734 'BN': 'Brunei Darussalam',
2735 'BG': 'Bulgaria',
2736 'BF': 'Burkina Faso',
2737 'BI': 'Burundi',
2738 'KH': 'Cambodia',
2739 'CM': 'Cameroon',
2740 'CA': 'Canada',
2741 'CV': 'Cape Verde',
2742 'KY': 'Cayman Islands',
2743 'CF': 'Central African Republic',
2744 'TD': 'Chad',
2745 'CL': 'Chile',
2746 'CN': 'China',
2747 'CX': 'Christmas Island',
2748 'CC': 'Cocos (Keeling) Islands',
2749 'CO': 'Colombia',
2750 'KM': 'Comoros',
2751 'CG': 'Congo',
2752 'CD': 'Congo, the Democratic Republic of the',
2753 'CK': 'Cook Islands',
2754 'CR': 'Costa Rica',
2755 'CI': 'Côte d\'Ivoire',
2756 'HR': 'Croatia',
2757 'CU': 'Cuba',
2758 'CW': 'Curaçao',
2759 'CY': 'Cyprus',
2760 'CZ': 'Czech Republic',
2761 'DK': 'Denmark',
2762 'DJ': 'Djibouti',
2763 'DM': 'Dominica',
2764 'DO': 'Dominican Republic',
2765 'EC': 'Ecuador',
2766 'EG': 'Egypt',
2767 'SV': 'El Salvador',
2768 'GQ': 'Equatorial Guinea',
2769 'ER': 'Eritrea',
2770 'EE': 'Estonia',
2771 'ET': 'Ethiopia',
2772 'FK': 'Falkland Islands (Malvinas)',
2773 'FO': 'Faroe Islands',
2774 'FJ': 'Fiji',
2775 'FI': 'Finland',
2776 'FR': 'France',
2777 'GF': 'French Guiana',
2778 'PF': 'French Polynesia',
2779 'TF': 'French Southern Territories',
2780 'GA': 'Gabon',
2781 'GM': 'Gambia',
2782 'GE': 'Georgia',
2783 'DE': 'Germany',
2784 'GH': 'Ghana',
2785 'GI': 'Gibraltar',
2786 'GR': 'Greece',
2787 'GL': 'Greenland',
2788 'GD': 'Grenada',
2789 'GP': 'Guadeloupe',
2790 'GU': 'Guam',
2791 'GT': 'Guatemala',
2792 'GG': 'Guernsey',
2793 'GN': 'Guinea',
2794 'GW': 'Guinea-Bissau',
2795 'GY': 'Guyana',
2796 'HT': 'Haiti',
2797 'HM': 'Heard Island and McDonald Islands',
2798 'VA': 'Holy See (Vatican City State)',
2799 'HN': 'Honduras',
2800 'HK': 'Hong Kong',
2801 'HU': 'Hungary',
2802 'IS': 'Iceland',
2803 'IN': 'India',
2804 'ID': 'Indonesia',
2805 'IR': 'Iran, Islamic Republic of',
2806 'IQ': 'Iraq',
2807 'IE': 'Ireland',
2808 'IM': 'Isle of Man',
2809 'IL': 'Israel',
2810 'IT': 'Italy',
2811 'JM': 'Jamaica',
2812 'JP': 'Japan',
2813 'JE': 'Jersey',
2814 'JO': 'Jordan',
2815 'KZ': 'Kazakhstan',
2816 'KE': 'Kenya',
2817 'KI': 'Kiribati',
2818 'KP': 'Korea, Democratic People\'s Republic of',
2819 'KR': 'Korea, Republic of',
2820 'KW': 'Kuwait',
2821 'KG': 'Kyrgyzstan',
2822 'LA': 'Lao People\'s Democratic Republic',
2823 'LV': 'Latvia',
2824 'LB': 'Lebanon',
2825 'LS': 'Lesotho',
2826 'LR': 'Liberia',
2827 'LY': 'Libya',
2828 'LI': 'Liechtenstein',
2829 'LT': 'Lithuania',
2830 'LU': 'Luxembourg',
2831 'MO': 'Macao',
2832 'MK': 'Macedonia, the Former Yugoslav Republic of',
2833 'MG': 'Madagascar',
2834 'MW': 'Malawi',
2835 'MY': 'Malaysia',
2836 'MV': 'Maldives',
2837 'ML': 'Mali',
2838 'MT': 'Malta',
2839 'MH': 'Marshall Islands',
2840 'MQ': 'Martinique',
2841 'MR': 'Mauritania',
2842 'MU': 'Mauritius',
2843 'YT': 'Mayotte',
2844 'MX': 'Mexico',
2845 'FM': 'Micronesia, Federated States of',
2846 'MD': 'Moldova, Republic of',
2847 'MC': 'Monaco',
2848 'MN': 'Mongolia',
2849 'ME': 'Montenegro',
2850 'MS': 'Montserrat',
2851 'MA': 'Morocco',
2852 'MZ': 'Mozambique',
2853 'MM': 'Myanmar',
2854 'NA': 'Namibia',
2855 'NR': 'Nauru',
2856 'NP': 'Nepal',
2857 'NL': 'Netherlands',
2858 'NC': 'New Caledonia',
2859 'NZ': 'New Zealand',
2860 'NI': 'Nicaragua',
2861 'NE': 'Niger',
2862 'NG': 'Nigeria',
2863 'NU': 'Niue',
2864 'NF': 'Norfolk Island',
2865 'MP': 'Northern Mariana Islands',
2866 'NO': 'Norway',
2867 'OM': 'Oman',
2868 'PK': 'Pakistan',
2869 'PW': 'Palau',
2870 'PS': 'Palestine, State of',
2871 'PA': 'Panama',
2872 'PG': 'Papua New Guinea',
2873 'PY': 'Paraguay',
2874 'PE': 'Peru',
2875 'PH': 'Philippines',
2876 'PN': 'Pitcairn',
2877 'PL': 'Poland',
2878 'PT': 'Portugal',
2879 'PR': 'Puerto Rico',
2880 'QA': 'Qatar',
2881 'RE': 'Réunion',
2882 'RO': 'Romania',
2883 'RU': 'Russian Federation',
2884 'RW': 'Rwanda',
2885 'BL': 'Saint Barthélemy',
2886 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2887 'KN': 'Saint Kitts and Nevis',
2888 'LC': 'Saint Lucia',
2889 'MF': 'Saint Martin (French part)',
2890 'PM': 'Saint Pierre and Miquelon',
2891 'VC': 'Saint Vincent and the Grenadines',
2892 'WS': 'Samoa',
2893 'SM': 'San Marino',
2894 'ST': 'Sao Tome and Principe',
2895 'SA': 'Saudi Arabia',
2896 'SN': 'Senegal',
2897 'RS': 'Serbia',
2898 'SC': 'Seychelles',
2899 'SL': 'Sierra Leone',
2900 'SG': 'Singapore',
2901 'SX': 'Sint Maarten (Dutch part)',
2902 'SK': 'Slovakia',
2903 'SI': 'Slovenia',
2904 'SB': 'Solomon Islands',
2905 'SO': 'Somalia',
2906 'ZA': 'South Africa',
2907 'GS': 'South Georgia and the South Sandwich Islands',
2908 'SS': 'South Sudan',
2909 'ES': 'Spain',
2910 'LK': 'Sri Lanka',
2911 'SD': 'Sudan',
2912 'SR': 'Suriname',
2913 'SJ': 'Svalbard and Jan Mayen',
2914 'SZ': 'Swaziland',
2915 'SE': 'Sweden',
2916 'CH': 'Switzerland',
2917 'SY': 'Syrian Arab Republic',
2918 'TW': 'Taiwan, Province of China',
2919 'TJ': 'Tajikistan',
2920 'TZ': 'Tanzania, United Republic of',
2921 'TH': 'Thailand',
2922 'TL': 'Timor-Leste',
2923 'TG': 'Togo',
2924 'TK': 'Tokelau',
2925 'TO': 'Tonga',
2926 'TT': 'Trinidad and Tobago',
2927 'TN': 'Tunisia',
2928 'TR': 'Turkey',
2929 'TM': 'Turkmenistan',
2930 'TC': 'Turks and Caicos Islands',
2931 'TV': 'Tuvalu',
2932 'UG': 'Uganda',
2933 'UA': 'Ukraine',
2934 'AE': 'United Arab Emirates',
2935 'GB': 'United Kingdom',
2936 'US': 'United States',
2937 'UM': 'United States Minor Outlying Islands',
2938 'UY': 'Uruguay',
2939 'UZ': 'Uzbekistan',
2940 'VU': 'Vanuatu',
2941 'VE': 'Venezuela, Bolivarian Republic of',
2942 'VN': 'Viet Nam',
2943 'VG': 'Virgin Islands, British',
2944 'VI': 'Virgin Islands, U.S.',
2945 'WF': 'Wallis and Futuna',
2946 'EH': 'Western Sahara',
2947 'YE': 'Yemen',
2948 'ZM': 'Zambia',
2949 'ZW': 'Zimbabwe',
2950 }
2951
2952 @classmethod
2953 def short2full(cls, code):
2954 """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name"""
2955 return cls._country_map.get(code.upper())
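
# Example usage (illustrative):
#     ISO3166Utils.short2full('DE') -> 'Germany'
#     ISO3166Utils.short2full('gb') -> 'United Kingdom'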
2956
2957
91410c9b 2958class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2461f79d
PH
2959 def __init__(self, proxies=None):
2960 # Set default handlers
2961 for type in ('http', 'https'):
2962 setattr(self, '%s_open' % type,
2963 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
2964 meth(r, proxy, type))
2965 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2966
91410c9b 2967 def proxy_open(self, req, proxy, type):
2461f79d 2968 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
2969 if req_proxy is not None:
2970 proxy = req_proxy
2461f79d
PH
2971 del req.headers['Ytdl-request-proxy']
2972
2973 if proxy == '__noproxy__':
2974 return None # No Proxy
51fb4995 2975 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188
YCH
2976 req.add_header('Ytdl-socks-proxy', proxy)
2977 # youtube-dl's http/https handlers take care of wrapping the socket with SOCKS
2978 return None
91410c9b
PH
2979 return compat_urllib_request.ProxyHandler.proxy_open(
2980 self, req, proxy, type)
5bc880b9
YCH
2981
2982
2983def ohdave_rsa_encrypt(data, exponent, modulus):
2984 '''
2985 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
2986
2987 Input:
2988 data: data to encrypt, bytes-like object
2989 exponent, modulus: parameters e and N of the RSA algorithm, both integers
2990 Output: hex string of encrypted data
2991
2992 Limitation: supports one block encryption only
2993 '''
2994
2995 payload = int(binascii.hexlify(data[::-1]), 16)
2996 encrypted = pow(payload, exponent, modulus)
2997 return '%x' % encrypted
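
# Example (illustrative toy parameters only; real keys use a large modulus):
#     ohdave_rsa_encrypt(b'\x02', 3, 15) -> '8'  (2 ** 3 mod 15 == 8)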
81bdc8fd
YCH
2998
2999
5eb6bdce 3000def encode_base_n(num, n, table=None):
59f898b7 3001 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
59f898b7
YCH
3002 if not table:
3003 table = FULL_TABLE[:n]
3004
5eb6bdce
YCH
3005 if n > len(table):
3006 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
3007
3008 if num == 0:
3009 return table[0]
3010
81bdc8fd
YCH
3011 ret = ''
3012 while num:
3013 ret = table[num % n] + ret
3014 num = num // n
3015 return ret
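
# Example usage (illustrative):
#     encode_base_n(255, 16) -> 'ff'
#     encode_base_n(11, 2)   -> '1011'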
f52354a8
YCH
3016
3017
3018def decode_packed_codes(code):
3019 mobj = re.search(
680079be 3020 r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
f52354a8
YCH
3021 code)
3022 obfuscated_code, base, count, symbols = mobj.groups()
3023 base = int(base)
3024 count = int(count)
3025 symbols = symbols.split('|')
3026 symbol_table = {}
3027
3028 while count:
3029 count -= 1
5eb6bdce 3030 base_n_count = encode_base_n(count, base)
f52354a8
YCH
3031 symbol_table[base_n_count] = symbols[count] or base_n_count
3032
3033 return re.sub(
3034 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
3035 obfuscated_code)
e154c651 3036
3037
3038def parse_m3u8_attributes(attrib):
3039 info = {}
3040 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
3041 if val.startswith('"'):
3042 val = val[1:-1]
3043 info[key] = val
3044 return info
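
# Example (illustrative): quoted values may contain commas:
#     parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="avc1.42c01e,mp4a.40.2"')
#     -> {'BANDWIDTH': '1280000', 'CODECS': 'avc1.42c01e,mp4a.40.2'}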
1143535d
YCH
3045
3046
3047def urshift(val, n):
3048 return val >> n if val >= 0 else (val + 0x100000000) >> n
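
# Example (illustrative): emulates JavaScript's unsigned right shift (>>>) on 32-bit values:
#     urshift(-1, 28) -> 15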
d3f8e038
YCH
3049
3050
3051# Based on png2str() written by @gdkchan and improved by @yokrysty
3052# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
3053def decode_png(png_data):
3054 # Reference: https://www.w3.org/TR/PNG/
3055 header = png_data[8:]
3056
3057 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
3058 raise IOError('Not a valid PNG file.')
3059
3060 int_map = {1: '>B', 2: '>H', 4: '>I'}
3061 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
3062
3063 chunks = []
3064
3065 while header:
3066 length = unpack_integer(header[:4])
3067 header = header[4:]
3068
3069 chunk_type = header[:4]
3070 header = header[4:]
3071
3072 chunk_data = header[:length]
3073 header = header[length:]
3074
3075 header = header[4:] # Skip CRC
3076
3077 chunks.append({
3078 'type': chunk_type,
3079 'length': length,
3080 'data': chunk_data
3081 })
3082
3083 ihdr = chunks[0]['data']
3084
3085 width = unpack_integer(ihdr[:4])
3086 height = unpack_integer(ihdr[4:8])
3087
3088 idat = b''
3089
3090 for chunk in chunks:
3091 if chunk['type'] == b'IDAT':
3092 idat += chunk['data']
3093
3094 if not idat:
3095 raise IOError('Unable to read PNG data.')
3096
3097 decompressed_data = bytearray(zlib.decompress(idat))
3098
3099 stride = width * 3
3100 pixels = []
3101
3102 def _get_pixel(idx):
3103 x = idx % stride
3104 y = idx // stride
3105 return pixels[y][x]
3106
3107 for y in range(height):
3108 basePos = y * (1 + stride)
3109 filter_type = decompressed_data[basePos]
3110
3111 current_row = []
3112
3113 pixels.append(current_row)
3114
3115 for x in range(stride):
3116 color = decompressed_data[1 + basePos + x]
3117 basex = y * stride + x
3118 left = 0
3119 up = 0
3120
3121 if x > 2:
3122 left = _get_pixel(basex - 3)
3123 if y > 0:
3124 up = _get_pixel(basex - stride)
3125
3126 if filter_type == 1: # Sub
3127 color = (color + left) & 0xff
3128 elif filter_type == 2: # Up
3129 color = (color + up) & 0xff
3130 elif filter_type == 3: # Average
3131 color = (color + ((left + up) >> 1)) & 0xff
3132 elif filter_type == 4: # Paeth
3133 a = left
3134 b = up
3135 c = 0
3136
3137 if x > 2 and y > 0:
3138 c = _get_pixel(basex - stride - 3)
3139
3140 p = a + b - c
3141
3142 pa = abs(p - a)
3143 pb = abs(p - b)
3144 pc = abs(p - c)
3145
3146 if pa <= pb and pa <= pc:
3147 color = (color + a) & 0xff
3148 elif pb <= pc:
3149 color = (color + b) & 0xff
3150 else:
3151 color = (color + c) & 0xff
3152
3153 current_row.append(color)
3154
3155 return width, height, pixels
efa97bdc
YCH
3156
3157
3158def write_xattr(path, key, value):
3159 # This mess below finds the best xattr tool for the job
3160 try:
3161 # try the pyxattr module...
3162 import xattr
3163
3164 # Unicode arguments are not supported in python-pyxattr until
3165 # version 0.5.0
3166 # See https://github.com/rg3/youtube-dl/issues/5498
3167 pyxattr_required_version = '0.5.0'
3168 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
3169 # TODO: fallback to CLI tools
3170 raise XAttrUnavailableError(
3171 'python-pyxattr is detected but is too old. '
3172 'youtube-dl requires %s or above while your version is %s. '
3173 'Falling back to other xattr implementations' % (
3174 pyxattr_required_version, xattr.__version__))
3175
3176 try:
3177 xattr.set(path, key, value)
3178 except EnvironmentError as e:
3179 raise XAttrMetadataError(e.errno, e.strerror)
3180
3181 except ImportError:
3182 if compat_os_name == 'nt':
3183 # Write xattrs to NTFS Alternate Data Streams:
3184 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
3185 assert ':' not in key
3186 assert os.path.exists(path)
3187
3188 ads_fn = path + ':' + key
3189 try:
3190 with open(ads_fn, 'wb') as f:
3191 f.write(value)
3192 except EnvironmentError as e:
3193 raise XAttrMetadataError(e.errno, e.strerror)
3194 else:
3195 user_has_setfattr = check_executable('setfattr', ['--version'])
3196 user_has_xattr = check_executable('xattr', ['-h'])
3197
3198 if user_has_setfattr or user_has_xattr:
3199
3200 value = value.decode('utf-8')
3201 if user_has_setfattr:
3202 executable = 'setfattr'
3203 opts = ['-n', key, '-v', value]
3204 elif user_has_xattr:
3205 executable = 'xattr'
3206 opts = ['-w', key, value]
3207
3208 cmd = ([encodeFilename(executable, True)] +
3209 [encodeArgument(o) for o in opts] +
3210 [encodeFilename(path, True)])
3211
3212 try:
3213 p = subprocess.Popen(
3214 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
3215 except EnvironmentError as e:
3216 raise XAttrMetadataError(e.errno, e.strerror)
3217 stdout, stderr = p.communicate()
3218 stderr = stderr.decode('utf-8', 'replace')
3219 if p.returncode != 0:
3220 raise XAttrMetadataError(p.returncode, stderr)
3221
3222 else:
3223 # On Unix, but we can't find pyxattr, setfattr, or xattr.
3224 if sys.platform.startswith('linux'):
3225 raise XAttrUnavailableError(
3226 "Couldn't find a tool to set the xattrs. "
3227 "Install either the python 'pyxattr' or 'xattr' "
3228 "modules, or the GNU 'attr' package "
3229 "(which contains the 'setfattr' tool).")
3230 else:
3231 raise XAttrUnavailableError(
3232 "Couldn't find a tool to set the xattrs. "
3233 "Install either the python 'xattr' module, "
3234 "or the 'xattr' binary.")
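
# Example usage (illustrative; the value must be a bytes object):
#     write_xattr('video.mp4', 'user.xdg.referrer.url', b'https://example.com/')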