#!/usr/bin/env python
# coding: utf-8

from __future__ import unicode_literals

import base64
import binascii
import calendar
import codecs
import contextlib
import ctypes
import datetime
import email.utils
import email.header
import errno
import functools
import gzip
import io
import itertools
import json
import locale
import math
import operator
import os
import pipes
import platform
import random
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import traceback
import xml.etree.ElementTree
import zlib

from .compat import (
    compat_HTMLParseError,
    compat_HTMLParser,
    compat_basestring,
    compat_chr,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_kwargs,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_quote,
    compat_socket_create_connection,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
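

# Illustrative usage sketch (not part of the original module): write_json_file()
# serialises to a temp file created next to the target, so a crash mid-write
# cannot leave a half-written target behind. The path below is hypothetical.
def _example_write_json_file():
    state = {'version': '2017.06.23', 'downloads': 3}
    write_json_file(state, '.youtube-dl-state.json')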


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]
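

# Illustrative usage sketch (not part of the original module): the xpath_*
# helpers return `default` when one is given, raise ExtractorError when
# fatal=True, and otherwise return None for missing nodes.
def _example_xpath_helpers():
    doc = compat_etree_fromstring('<root><title lang="en">Foo</title></root>')
    title = xpath_text(doc, './title', 'title', fatal=True)   # 'Foo'
    lang = xpath_attr(doc, './title', 'lang', default=None)   # 'en'
    missing = xpath_text(doc, './missing', default='n/a')     # 'n/a'
    return title, lang, missing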


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of all tags with the specified attribute in the passed HTML document as a list"""

    value = re.escape(value) if escape_value else value

    retlist = []
    for m in re.finditer(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html):
        res = m.group('content')

        if res.startswith('"') or res.startswith("'"):
            res = res[1:-1]

        retlist.append(unescapeHTML(res))

    return retlist
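

# Illustrative usage sketch (not part of the original module): the element
# scrapers work on raw HTML text rather than a parsed DOM, which is why a
# class lookup is just an attribute lookup with a \b-delimited regex.
def _example_get_elements():
    html = '<div class="note first">a</div><div class="note">b</div>'
    assert get_elements_by_class('note', html) == ['a', 'b']
    assert get_element_by_class('first', html) == 'a'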


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    try:
        parser.feed(html_element)
        parser.close()
    # Older Python may throw HTMLParseError in case of malformed HTML
    except compat_HTMLParseError:
        pass
    return parser.attrs
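

# Illustrative usage sketch (not part of the original module): extract_attributes()
# only ever sees a single opening tag, so handle_starttag() overwriting
# self.attrs on every call is safe here.
def _example_extract_attributes():
    attrs = extract_attributes('<video src="/v.mp4" controls data-id=\'42\'>')
    # -> {'src': '/v.mp4', 'controls': None, 'data-id': '42'}
    return attrs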


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
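

# Illustrative usage sketch (not part of the original module): the two modes of
# sanitize_filename() differ mainly in how separators, spaces and punctuation
# are treated.
def _example_sanitize_filename():
    assert sanitize_filename('A/B: C?') == 'A_B - C'
    assert sanitize_filename('A/B: C?', restricted=True) == 'A_B_-_C'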


def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)


# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitize_url(url):
    return 'http:%s' % url if url.startswith('//') else url


def sanitized_Request(url, *args, **kwargs):
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
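

# Illustrative usage sketch (not part of the original module): named, decimal
# and hexadecimal references are all routed through _htmlentity_transform(),
# and unknown entities are left literal rather than dropped.
def _example_unescapeHTML():
    assert unescapeHTML('&amp; &#38; &#x26;') == '& & &'
    assert unescapeHTML('&bogus123;') == '&bogus123;'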


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


def formatSeconds(secs):
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs


def make_HTTPS_handler(params, **kwargs):
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message():
    if ytdl_is_updateable():
        update_cmd = 'type  youtube-dl -U  to update'
    else:
        update_cmd = 'see  https://yt-dl.org/update  on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    pass


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
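

# Illustrative usage sketch (not part of the original module): extractors raise
# ExtractorError with expected=True for "normal" failures (geo blocks, removed
# videos, ...) so that bug_reports_message() is not appended to the output.
def _example_extractor_error(video_id):
    raise ExtractorError('This video is unavailable', expected=True, video_id=video_id)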


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """
    def __init__(self, msg, countries=None):
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        self.msg = msg
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg


class MaxDownloadsReached(YoutubeDLError):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers
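

# Illustrative usage sketch (not part of the original module): the internal
# Youtubedl-no-compression marker is translated into "no Accept-Encoding"
# before the request goes on the wire.
def _example_handle_youtubedl_headers():
    headers = {'Accept-Encoding': 'gzip, deflate', 'Youtubedl-no-compression': 'True'}
    assert handle_youtubedl_headers(headers) == {}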


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


def extract_timezone(date_str):
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        pass
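

# Illustrative usage sketch (not part of the original module): parse_iso8601()
# folds an explicit UTC offset into the returned UNIX timestamp, while a
# missing or 'Z' offset is treated as UTC.
def _example_parse_iso8601():
    assert parse_iso8601('2017-06-23T12:00:00Z') == 1498219200
    assert parse_iso8601('2017-06-23T14:00:00+02:00') == 1498219200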


def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)


def unified_timestamp(date_str, day_first=True):
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
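

# Illustrative usage sketch (not part of the original module): unified_strdate()
# normalises the many site-specific date spellings to YYYYMMDD, and
# unified_timestamp() does the same while keeping the time of day.
def _example_unified_dates():
    assert unified_strdate('23 June 2017') == '20170623'
    assert unified_strdate('June 23, 2017') == '20170623'
    assert unified_timestamp('2017-06-23 12:00:00') == 1498219200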


def determine_ext(url, default_ext='unknown_video'):
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
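

# Illustrative usage sketch (not part of the original module): the query string
# is dropped before guessing, and a trailing slash is tolerated only for
# known extensions.
def _example_determine_ext():
    assert determine_ext('http://example.com/video.mp4?dl=1') == 'mp4'
    assert determine_ext('http://example.com/watch') == 'unknown_video'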


def subtitles_filename(filename, sub_lang, sub_format):
    return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format


def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str


class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
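

# Illustrative usage sketch (not part of the original module): DateRange backs
# --datebefore/--dateafter; membership accepts either a datetime.date or any
# string that date_from_str() understands.
def _example_daterange():
    year_2017 = DateRange(start='20170101', end='20171231')
    assert '20170623' in year_2017
    assert '20180101' not in year_2017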


def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res


def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True


def write_string(s, out=None, encoding=None):
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()


def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)


# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)


class locked_file(object):
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)


def get_filesystem_encoding():
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'


def shell_quote(args):
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)


def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
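

# Illustrative usage sketch (not part of the original module): smuggle_url()
# piggybacks extractor-to-extractor data on the URL fragment, and
# unsmuggle_url() is its exact inverse.
def _example_smuggle_roundtrip():
    url = smuggle_url('http://example.com/video', {'referrer': 'http://example.com/'})
    plain_url, data = unsmuggle_url(url)
    assert plain_url == 'http://example.com/video'
    assert data == {'referrer': 'http://example.com/'}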
02dbf93f
PH
1559
1560
02dbf93f
PH
1561def format_bytes(bytes):
1562 if bytes is None:
28e614de 1563 return 'N/A'
02dbf93f
PH
1564 if type(bytes) is str:
1565 bytes = float(bytes)
1566 if bytes == 0.0:
1567 exponent = 0
1568 else:
1569 exponent = int(math.log(bytes, 1024.0))
28e614de 1570 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
02dbf93f 1571 converted = float(bytes) / float(1024 ** exponent)
28e614de 1572 return '%.2f%s' % (converted, suffix)
f53c966a 1573
1c088fa8 1574
fb47597b
S
1575def lookup_unit_table(unit_table, s):
1576 units_re = '|'.join(re.escape(u) for u in unit_table)
1577 m = re.match(
782b1b5b 1578 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
fb47597b
S
1579 if not m:
1580 return None
1581 num_str = m.group('num').replace(',', '.')
1582 mult = unit_table[m.group('unit')]
1583 return int(float(num_str) * mult)
1584
1585
be64b5b0
PH
1586def parse_filesize(s):
1587 if s is None:
1588 return None
1589
dfb1b146 1590 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
1591 # but we support those too
1592 _UNIT_TABLE = {
1593 'B': 1,
1594 'b': 1,
70852b47 1595 'bytes': 1,
be64b5b0
PH
1596 'KiB': 1024,
1597 'KB': 1000,
1598 'kB': 1024,
1599 'Kb': 1000,
13585d76 1600 'kb': 1000,
70852b47
YCH
1601 'kilobytes': 1000,
1602 'kibibytes': 1024,
be64b5b0
PH
1603 'MiB': 1024 ** 2,
1604 'MB': 1000 ** 2,
1605 'mB': 1024 ** 2,
1606 'Mb': 1000 ** 2,
13585d76 1607 'mb': 1000 ** 2,
70852b47
YCH
1608 'megabytes': 1000 ** 2,
1609 'mebibytes': 1024 ** 2,
be64b5b0
PH
1610 'GiB': 1024 ** 3,
1611 'GB': 1000 ** 3,
1612 'gB': 1024 ** 3,
1613 'Gb': 1000 ** 3,
13585d76 1614 'gb': 1000 ** 3,
70852b47
YCH
1615 'gigabytes': 1000 ** 3,
1616 'gibibytes': 1024 ** 3,
be64b5b0
PH
1617 'TiB': 1024 ** 4,
1618 'TB': 1000 ** 4,
1619 'tB': 1024 ** 4,
1620 'Tb': 1000 ** 4,
13585d76 1621 'tb': 1000 ** 4,
70852b47
YCH
1622 'terabytes': 1000 ** 4,
1623 'tebibytes': 1024 ** 4,
be64b5b0
PH
1624 'PiB': 1024 ** 5,
1625 'PB': 1000 ** 5,
1626 'pB': 1024 ** 5,
1627 'Pb': 1000 ** 5,
13585d76 1628 'pb': 1000 ** 5,
70852b47
YCH
1629 'petabytes': 1000 ** 5,
1630 'pebibytes': 1024 ** 5,
be64b5b0
PH
1631 'EiB': 1024 ** 6,
1632 'EB': 1000 ** 6,
1633 'eB': 1024 ** 6,
1634 'Eb': 1000 ** 6,
13585d76 1635 'eb': 1000 ** 6,
70852b47
YCH
1636 'exabytes': 1000 ** 6,
1637 'exbibytes': 1024 ** 6,
be64b5b0
PH
1638 'ZiB': 1024 ** 7,
1639 'ZB': 1000 ** 7,
1640 'zB': 1024 ** 7,
1641 'Zb': 1000 ** 7,
13585d76 1642 'zb': 1000 ** 7,
70852b47
YCH
1643 'zettabytes': 1000 ** 7,
1644 'zebibytes': 1024 ** 7,
be64b5b0
PH
1645 'YiB': 1024 ** 8,
1646 'YB': 1000 ** 8,
1647 'yB': 1024 ** 8,
1648 'Yb': 1000 ** 8,
13585d76 1649 'yb': 1000 ** 8,
70852b47
YCH
1650 'yottabytes': 1000 ** 8,
1651 'yobibytes': 1024 ** 8,
be64b5b0
PH
1652 }
1653
fb47597b
S
1654 return lookup_unit_table(_UNIT_TABLE, s)
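
# Illustrative usage sketch (added for review, not part of the original
# module); the _example_* helper name is hypothetical.
def _example_parse_filesize_usage():
    # Decimal and binary prefixes resolve through _UNIT_TABLE above; a comma
    # is accepted as decimal separator and the result is an int of bytes.
    assert parse_filesize('1.5MB') == 1500000
    assert parse_filesize('1.5MiB') == 1572864
    assert parse_filesize('1,5 GB') == 1500000000
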
1655
1656
1657def parse_count(s):
1658 if s is None:
be64b5b0
PH
1659 return None
1660
fb47597b
S
1661 s = s.strip()
1662
1663 if re.match(r'^[\d,.]+$', s):
1664 return str_to_int(s)
1665
1666 _UNIT_TABLE = {
1667 'k': 1000,
1668 'K': 1000,
1669 'm': 1000 ** 2,
1670 'M': 1000 ** 2,
1671 'kk': 1000 ** 2,
1672 'KK': 1000 ** 2,
1673 }
be64b5b0 1674
fb47597b 1675 return lookup_unit_table(_UNIT_TABLE, s)
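
# Illustrative usage sketch (added for review, not part of the original
# module); the _example_* helper name is hypothetical.
def _example_parse_count_usage():
    # View-count style strings: plain digits with separators, or k/M suffixes.
    assert parse_count('1,234') == 1234
    assert parse_count('1.5k') == 1500
    assert parse_count('10M') == 10000000
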
be64b5b0 1676
2f7ae819 1677
a942d6cb 1678def month_by_name(name, lang='en'):
caefb1de
PH
1679 """ Return the number of a month by (locale-independently) English name """
1680
f6717dec 1681 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 1682
caefb1de 1683 try:
f6717dec 1684 return month_names.index(name) + 1
7105440c
YCH
1685 except ValueError:
1686 return None
1687
1688
1689def month_by_abbreviation(abbrev):
1690 """ Return the number of a month by (locale-independently) English
1691 abbreviations """
1692
1693 try:
1694 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
1695 except ValueError:
1696 return None
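
# Illustrative usage sketch (added for review, not part of the original
# module); the _example_* helper name is hypothetical.
def _example_month_lookup_usage():
    # Lookups are exact and case-sensitive; unknown names give None.
    assert month_by_name('December') == 12
    assert month_by_name('décembre', lang='fr') == 12
    assert month_by_name('decembre') is None
    assert month_by_abbreviation('Mar') == 3
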
18258362
JMF
1697
1698
5aafe895 1699def fix_xml_ampersands(xml_str):
18258362 1700 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
1701 return re.sub(
1702 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 1703 '&amp;',
5aafe895 1704 xml_str)
e3946f98
PH
1705
1706
1707def setproctitle(title):
8bf48f23 1708 assert isinstance(title, compat_str)
c1c05c67
YCH
1709
1710 # ctypes in Jython is not complete
1711 # http://bugs.jython.org/issue2148
1712 if sys.platform.startswith('java'):
1713 return
1714
e3946f98 1715 try:
611c1dd9 1716 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
1717 except OSError:
1718 return
2f49bcd6
RC
1719 except TypeError:
1720 # LoadLibrary in Windows Python 2.7.13 only expects
1721 # a bytestring, but since unicode_literals turns
1722 # every string into a unicode string, it fails.
1723 return
6eefe533
PH
1724 title_bytes = title.encode('utf-8')
1725 buf = ctypes.create_string_buffer(len(title_bytes))
1726 buf.value = title_bytes
e3946f98 1727 try:
6eefe533 1728 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
1729 except AttributeError:
1730 return # Strange libc, just skip this
d7dda168
PH
1731
1732
1733def remove_start(s, start):
46bc9b7d 1734 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
1735
1736
2b9faf55 1737def remove_end(s, end):
46bc9b7d 1738 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
1739
1740
31b2051e
S
1741def remove_quotes(s):
1742 if s is None or len(s) < 2:
1743 return s
1744 for quote in ('"', "'", ):
1745 if s[0] == quote and s[-1] == quote:
1746 return s[1:-1]
1747 return s
1748
1749
29eb5174 1750def url_basename(url):
9b8aaeed 1751 path = compat_urlparse.urlparse(url).path
28e614de 1752 return path.strip('/').split('/')[-1]
aa94a6d3
PH
1753
1754
02dc0a36
S
1755def base_url(url):
1756 return re.match(r'https?://[^?#&]+/', url).group()
1757
1758
e34c3361 1759def urljoin(base, path):
4b5de77b
S
1760 if isinstance(path, bytes):
1761 path = path.decode('utf-8')
e34c3361
S
1762 if not isinstance(path, compat_str) or not path:
1763 return None
b0c65c67 1764 if re.match(r'^(?:https?:)?//', path):
e34c3361 1765 return path
4b5de77b
S
1766 if isinstance(base, bytes):
1767 base = base.decode('utf-8')
1768 if not isinstance(base, compat_str) or not re.match(
1769 r'^(?:https?:)?//', base):
e34c3361
S
1770 return None
1771 return compat_urlparse.urljoin(base, path)
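
# Illustrative usage sketch (added for review, not part of the original
# module); the _example_* helper name is hypothetical.
def _example_url_helpers_usage():
    # url_basename keeps the last path component, base_url keeps everything
    # up to the last slash, and urljoin refuses non-http(s) bases.
    assert url_basename('http://example.com/a/b/c.mp4?x=1') == 'c.mp4'
    assert base_url('http://example.com/a/b/c.mp4?x=1') == 'http://example.com/a/b/'
    assert urljoin('http://example.com/a/', 'b/c.mp4') == 'http://example.com/a/b/c.mp4'
    assert urljoin('ftp://example.com/', 'c.mp4') is None
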
1772
1773
aa94a6d3
PH
1774class HEADRequest(compat_urllib_request.Request):
1775 def get_method(self):
611c1dd9 1776 return 'HEAD'
7217e148
PH
1777
1778
95cf60e8
S
1779class PUTRequest(compat_urllib_request.Request):
1780 def get_method(self):
1781 return 'PUT'
1782
1783
9732d77e 1784def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
28746fbd
PH
1785 if get_attr:
1786 if v is not None:
1787 v = getattr(v, get_attr, None)
9572013d
PH
1788 if v == '':
1789 v = None
1812afb7
S
1790 if v is None:
1791 return default
1792 try:
1793 return int(v) * invscale // scale
1794 except ValueError:
af98f8ff 1795 return default
9732d77e 1796
9572013d 1797
40a90862
JMF
1798def str_or_none(v, default=None):
1799 return default if v is None else compat_str(v)
1800
9732d77e
PH
1801
1802def str_to_int(int_str):
48d4681e 1803 """ A more relaxed version of int_or_none """
9732d77e
PH
1804 if int_str is None:
1805 return None
28e614de 1806 int_str = re.sub(r'[,\.\+]', '', int_str)
9732d77e 1807 return int(int_str)
608d11f5
PH
1808
1809
9732d77e 1810def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
1811 if v is None:
1812 return default
1813 try:
1814 return float(v) * invscale / scale
1815 except ValueError:
1816 return default
43f775e4
PH
1817
1818
b72b4431
S
1819def strip_or_none(v):
1820 return None if v is None else v.strip()
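
# Illustrative usage sketch (added for review, not part of the original
# module); the _example_* helper name is hypothetical.
def _example_numeric_coercion_usage():
    # These helpers swallow bad input and return a default instead of raising.
    assert int_or_none('42') == 42
    assert int_or_none('oops', default=-1) == -1
    assert int_or_none(90, scale=60) == 1  # e.g. seconds -> minutes
    assert str_to_int('1,000,000') == 1000000
    assert float_or_none('2.5', invscale=1000) == 2500.0
    assert strip_or_none('  text  ') == 'text'
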
1821
1822
608d11f5 1823def parse_duration(s):
8f9312c3 1824 if not isinstance(s, compat_basestring):
608d11f5
PH
1825 return None
1826
ca7b3246
S
1827 s = s.strip()
1828
acaff495 1829 days, hours, mins, secs, ms = [None] * 5
15846398 1830 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
acaff495 1831 if m:
1832 days, hours, mins, secs, ms = m.groups()
1833 else:
1834 m = re.match(
1835 r'''(?ix)(?:P?T)?
8f4b58d7 1836 (?:
acaff495 1837 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
8f4b58d7 1838 )?
acaff495 1839 (?:
1840 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1841 )?
1842 (?:
1843 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1844 )?
1845 (?:
1846 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 1847 )?Z?$''', s)
acaff495 1848 if m:
1849 days, hours, mins, secs, ms = m.groups()
1850 else:
15846398 1851 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 1852 if m:
1853 hours, mins = m.groups()
1854 else:
1855 return None
1856
1857 duration = 0
1858 if secs:
1859 duration += float(secs)
1860 if mins:
1861 duration += float(mins) * 60
1862 if hours:
1863 duration += float(hours) * 60 * 60
1864 if days:
1865 duration += float(days) * 24 * 60 * 60
1866 if ms:
1867 duration += float(ms)
1868 return duration
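
# Illustrative usage sketch (added for review, not part of the original
# module); the _example_* helper name is hypothetical.
def _example_parse_duration_usage():
    # Clock-style, verbose and ISO-8601-like forms all yield seconds.
    assert parse_duration('1:02:03') == 3723
    assert parse_duration('PT1H30M') == 5400
    assert parse_duration('5 min') == 300
    assert parse_duration('3.5 s') == 3.5
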
91d7d0b3
JMF
1869
1870
e65e4c88 1871def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 1872 name, real_ext = os.path.splitext(filename)
e65e4c88
S
1873 return (
1874 '{0}.{1}{2}'.format(name, ext, real_ext)
1875 if not expected_real_ext or real_ext[1:] == expected_real_ext
1876 else '{0}.{1}'.format(filename, ext))
d70ad093
PH
1877
1878
b3ed15b7
S
1879def replace_extension(filename, ext, expected_real_ext=None):
1880 name, real_ext = os.path.splitext(filename)
1881 return '{0}.{1}'.format(
1882 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1883 ext)
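
# Illustrative usage sketch (added for review, not part of the original
# module); the _example_* helper name is hypothetical.
def _example_extension_helpers_usage():
    # prepend_extension inserts before the real extension, replace_extension
    # swaps it; expected_real_ext guards against surprising extensions.
    assert prepend_extension('video.mp4', 'temp') == 'video.temp.mp4'
    assert prepend_extension('video.unknown_video', 'temp', 'mp4') == 'video.unknown_video.temp'
    assert replace_extension('video.mp4', 'mkv') == 'video.mkv'
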
1884
1885
d70ad093
PH
1886def check_executable(exe, args=[]):
1887 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1888 args can be a list of arguments for a short output (like -version) """
1889 try:
1890 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1891 except OSError:
1892 return False
1893 return exe
b7ab0590
PH
1894
1895
95807118 1896def get_exe_version(exe, args=['--version'],
cae97f65 1897 version_re=None, unrecognized='present'):
95807118
PH
1898 """ Returns the version of the specified executable,
1899 or False if the executable is not present """
1900 try:
b64d04c1
YCH
1901 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1902 # SIGTTOU if youtube-dl is run in the background.
1903 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
cae97f65 1904 out, _ = subprocess.Popen(
54116803 1905 [encodeArgument(exe)] + args,
00ca7552 1906 stdin=subprocess.PIPE,
95807118
PH
1907 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1908 except OSError:
1909 return False
cae97f65
PH
1910 if isinstance(out, bytes): # Python 2.x
1911 out = out.decode('ascii', 'ignore')
1912 return detect_exe_version(out, version_re, unrecognized)
1913
1914
1915def detect_exe_version(output, version_re=None, unrecognized='present'):
1916 assert isinstance(output, compat_str)
1917 if version_re is None:
1918 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1919 m = re.search(version_re, output)
95807118
PH
1920 if m:
1921 return m.group(1)
1922 else:
1923 return unrecognized
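
# Illustrative usage sketch (added for review, not part of the original
# module); the _example_* helper name is hypothetical and the sample output
# string is made up.
def _example_detect_exe_version_usage():
    # The default pattern picks up 'version <something>' from the output.
    assert detect_exe_version('ffmpeg version 3.3.2 Copyright (c) 2000-2017') == '3.3.2'
    assert detect_exe_version('unparseable output') == 'present'
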
1924
1925
b7ab0590 1926class PagedList(object):
dd26ced1
PH
1927 def __len__(self):
1928 # This is only useful for tests
1929 return len(self.getslice())
1930
9c44d242
PH
1931
1932class OnDemandPagedList(PagedList):
b95dc034 1933 def __init__(self, pagefunc, pagesize, use_cache=False):
9c44d242
PH
1934 self._pagefunc = pagefunc
1935 self._pagesize = pagesize
b95dc034
YCH
1936 self._use_cache = use_cache
1937 if use_cache:
1938 self._cache = {}
9c44d242 1939
b7ab0590
PH
1940 def getslice(self, start=0, end=None):
1941 res = []
1942 for pagenum in itertools.count(start // self._pagesize):
1943 firstid = pagenum * self._pagesize
1944 nextfirstid = pagenum * self._pagesize + self._pagesize
1945 if start >= nextfirstid:
1946 continue
1947
b95dc034
YCH
1948 page_results = None
1949 if self._use_cache:
1950 page_results = self._cache.get(pagenum)
1951 if page_results is None:
1952 page_results = list(self._pagefunc(pagenum))
1953 if self._use_cache:
1954 self._cache[pagenum] = page_results
b7ab0590
PH
1955
1956 startv = (
1957 start % self._pagesize
1958 if firstid <= start < nextfirstid
1959 else 0)
1960
1961 endv = (
1962 ((end - 1) % self._pagesize) + 1
1963 if (end is not None and firstid <= end <= nextfirstid)
1964 else None)
1965
1966 if startv != 0 or endv is not None:
1967 page_results = page_results[startv:endv]
1968 res.extend(page_results)
1969
1970 # A little optimization: if the current page is not "full", i.e. does
1971 # not contain page_size videos, then we can assume that this page
1972 # is the last one - there are no more ids on further pages -
1973 # i.e. no need to query again.
1974 if len(page_results) + startv < self._pagesize:
1975 break
1976
1977 # If we got the whole page, but the next page is not interesting,
1978 # break out early as well
1979 if end == nextfirstid:
1980 break
1981 return res
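
# Illustrative usage sketch (added for review, not part of the original
# module); the _example_* helper and its page function are hypothetical.
def _example_on_demand_paged_list_usage():
    # Pages are fetched lazily: only the pages overlapping the requested
    # slice are materialised.
    def page(n):
        return ['item-%d-%d' % (n, i) for i in range(3)]
    pl = OnDemandPagedList(page, 3)
    assert pl.getslice(2, 5) == ['item-0-2', 'item-1-0', 'item-1-1']
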
81c2f20b
PH
1982
1983
9c44d242
PH
1984class InAdvancePagedList(PagedList):
1985 def __init__(self, pagefunc, pagecount, pagesize):
1986 self._pagefunc = pagefunc
1987 self._pagecount = pagecount
1988 self._pagesize = pagesize
1989
1990 def getslice(self, start=0, end=None):
1991 res = []
1992 start_page = start // self._pagesize
1993 end_page = (
1994 self._pagecount if end is None else (end // self._pagesize + 1))
1995 skip_elems = start - start_page * self._pagesize
1996 only_more = None if end is None else end - start
1997 for pagenum in range(start_page, end_page):
1998 page = list(self._pagefunc(pagenum))
1999 if skip_elems:
2000 page = page[skip_elems:]
2001 skip_elems = None
2002 if only_more is not None:
2003 if len(page) < only_more:
2004 only_more -= len(page)
2005 else:
2006 page = page[:only_more]
2007 res.extend(page)
2008 break
2009 res.extend(page)
2010 return res
2011
2012
81c2f20b 2013def uppercase_escape(s):
676eb3f2 2014 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 2015 return re.sub(
a612753d 2016 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
2017 lambda m: unicode_escape(m.group(0))[0],
2018 s)
0fe2ff78
YCH
2019
2020
2021def lowercase_escape(s):
2022 unicode_escape = codecs.getdecoder('unicode_escape')
2023 return re.sub(
2024 r'\\u[0-9a-fA-F]{4}',
2025 lambda m: unicode_escape(m.group(0))[0],
2026 s)
b53466e1 2027
d05cfe06
S
2028
2029def escape_rfc3986(s):
2030 """Escape non-ASCII characters as suggested by RFC 3986"""
8f9312c3 2031 if sys.version_info < (3, 0) and isinstance(s, compat_str):
d05cfe06 2032 s = s.encode('utf-8')
ecc0c5ee 2033 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2034
2035
2036def escape_url(url):
2037 """Escape URL as suggested by RFC 3986"""
2038 url_parsed = compat_urllib_parse_urlparse(url)
2039 return url_parsed._replace(
efbed08d 2040 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
2041 path=escape_rfc3986(url_parsed.path),
2042 params=escape_rfc3986(url_parsed.params),
2043 query=escape_rfc3986(url_parsed.query),
2044 fragment=escape_rfc3986(url_parsed.fragment)
2045 ).geturl()
2046
62e609ab
PH
2047
2048def read_batch_urls(batch_fd):
2049 def fixup(url):
2050 if not isinstance(url, compat_str):
2051 url = url.decode('utf-8', 'replace')
28e614de 2052 BOM_UTF8 = '\xef\xbb\xbf'
62e609ab
PH
2053 if url.startswith(BOM_UTF8):
2054 url = url[len(BOM_UTF8):]
2055 url = url.strip()
2056 if url.startswith(('#', ';', ']')):
2057 return False
2058 return url
2059
2060 with contextlib.closing(batch_fd) as fd:
2061 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2062
2063
2064def urlencode_postdata(*args, **kargs):
15707c7e 2065 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
2066
2067
38f9ef31 2068def update_url_query(url, query):
cacd9966
YCH
2069 if not query:
2070 return url
38f9ef31 2071 parsed_url = compat_urlparse.urlparse(url)
2072 qs = compat_parse_qs(parsed_url.query)
2073 qs.update(query)
2074 return compat_urlparse.urlunparse(parsed_url._replace(
15707c7e 2075 query=compat_urllib_parse_urlencode(qs, True)))
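
# Illustrative usage sketch (added for review, not part of the original
# module); the _example_* helper name is hypothetical.
def _example_update_url_query_usage():
    # Existing parameters with the same name are replaced in place.
    assert update_url_query('http://example.com/path?quality=low', {'quality': 'high'}) == \
        'http://example.com/path?quality=high'
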
16392824 2076
8e60dc75 2077
ed0291d1
S
2078def update_Request(req, url=None, data=None, headers={}, query={}):
2079 req_headers = req.headers.copy()
2080 req_headers.update(headers)
2081 req_data = data or req.data
2082 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
2083 req_get_method = req.get_method()
2084 if req_get_method == 'HEAD':
2085 req_type = HEADRequest
2086 elif req_get_method == 'PUT':
2087 req_type = PUTRequest
2088 else:
2089 req_type = compat_urllib_request.Request
ed0291d1
S
2090 new_req = req_type(
2091 req_url, data=req_data, headers=req_headers,
2092 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2093 if hasattr(req, 'timeout'):
2094 new_req.timeout = req.timeout
2095 return new_req
2096
2097
10c87c15 2098def _multipart_encode_impl(data, boundary):
0c265486
YCH
2099 content_type = 'multipart/form-data; boundary=%s' % boundary
2100
2101 out = b''
2102 for k, v in data.items():
2103 out += b'--' + boundary.encode('ascii') + b'\r\n'
2104 if isinstance(k, compat_str):
2105 k = k.encode('utf-8')
2106 if isinstance(v, compat_str):
2107 v = v.encode('utf-8')
2108 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2109 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 2110 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
2111 if boundary.encode('ascii') in content:
2112 raise ValueError('Boundary overlaps with data')
2113 out += content
2114
2115 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2116
2117 return out, content_type
2118
2119
2120def multipart_encode(data, boundary=None):
2121 '''
2122 Encode a dict to RFC 7578-compliant form-data
2123
2124 data:
2125 A dict where keys and values can be either Unicode or bytes-like
2126 objects.
2127 boundary:
2128 If specified, a Unicode object to be used as the boundary. Otherwise
2129 a random boundary is generated.
2130
2131 Reference: https://tools.ietf.org/html/rfc7578
2132 '''
2133 has_specified_boundary = boundary is not None
2134
2135 while True:
2136 if boundary is None:
2137 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2138
2139 try:
10c87c15 2140 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
2141 break
2142 except ValueError:
2143 if has_specified_boundary:
2144 raise
2145 boundary = None
2146
2147 return out, content_type
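
# Illustrative usage sketch (added for review, not part of the original
# module); the _example_* helper name is hypothetical.
def _example_multipart_encode_usage():
    # With an explicit boundary the encoded body is deterministic.
    body, content_type = multipart_encode({b'field': b'value'}, boundary='AAA')
    assert content_type == 'multipart/form-data; boundary=AAA'
    assert body == (b'--AAA\r\n'
                    b'Content-Disposition: form-data; name="field"\r\n\r\n'
                    b'value\r\n'
                    b'--AAA--\r\n')
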
2148
2149
86296ad2 2150def dict_get(d, key_or_keys, default=None, skip_false_values=True):
cbecc9b9
S
2151 if isinstance(key_or_keys, (list, tuple)):
2152 for key in key_or_keys:
86296ad2
S
2153 if key not in d or d[key] is None or skip_false_values and not d[key]:
2154 continue
2155 return d[key]
cbecc9b9
S
2156 return default
2157 return d.get(key_or_keys, default)
2158
2159
329ca3be 2160def try_get(src, getter, expected_type=None):
a32a9a7e
S
2161 if not isinstance(getter, (list, tuple)):
2162 getter = [getter]
2163 for get in getter:
2164 try:
2165 v = get(src)
2166 except (AttributeError, KeyError, TypeError, IndexError):
2167 pass
2168 else:
2169 if expected_type is None or isinstance(v, expected_type):
2170 return v
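
# Illustrative usage sketch (added for review, not part of the original
# module); the _example_* helper and the sample metadata are hypothetical.
def _example_safe_lookup_usage():
    # dict_get skips falsy values by default; try_get swallows missing keys.
    meta = {'title': '', 'fulltitle': 'A video', 'entries': [{'id': '42'}]}
    assert dict_get(meta, ('title', 'fulltitle')) == 'A video'
    assert dict_get(meta, ('title', 'fulltitle'), skip_false_values=False) == ''
    assert try_get(meta, lambda x: x['entries'][0]['id'], compat_str) == '42'
    assert try_get(meta, lambda x: x['entries'][5]['id']) is None
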
329ca3be
S
2171
2172
8e60dc75
S
2173def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2174 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2175
16392824 2176
a1a530b0
PH
2177US_RATINGS = {
2178 'G': 0,
2179 'PG': 10,
2180 'PG-13': 13,
2181 'R': 16,
2182 'NC': 18,
2183}
fac55558
PH
2184
2185
a8795327
S
2186TV_PARENTAL_GUIDELINES = {
2187 'TV-Y': 0,
2188 'TV-Y7': 7,
2189 'TV-G': 0,
2190 'TV-PG': 0,
2191 'TV-14': 14,
2192 'TV-MA': 17,
2193}
2194
2195
146c80e2 2196def parse_age_limit(s):
a8795327
S
2197 if type(s) == int:
2198 return s if 0 <= s <= 21 else None
2199 if not isinstance(s, compat_basestring):
d838b1bd 2200 return None
146c80e2 2201 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
2202 if m:
2203 return int(m.group('age'))
2204 if s in US_RATINGS:
2205 return US_RATINGS[s]
2206 return TV_PARENTAL_GUIDELINES.get(s)
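
# Illustrative usage sketch (added for review, not part of the original
# module); the _example_* helper name is hypothetical.
def _example_parse_age_limit_usage():
    # Plain ages, MPAA ratings and US TV parental guidelines are accepted.
    assert parse_age_limit('18+') == 18
    assert parse_age_limit('PG-13') == 13
    assert parse_age_limit('TV-MA') == 17
    assert parse_age_limit('unknown') is None
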
146c80e2
S
2207
2208
fac55558 2209def strip_jsonp(code):
609a61e3 2210 return re.sub(
5552c9eb
YCH
2211 r'''(?sx)^
2212 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
2213 (?:\s*&&\s*(?P=func_name))?
2214 \s*\(\s*(?P<callback_data>.*)\);?
2215 \s*?(?://[^\n]*)*$''',
2216 r'\g<callback_data>', code)
478c2c61
PH
2217
2218
e05f6939 2219def js_to_json(code):
4195096e
S
2220 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
2221 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
2222 INTEGER_TABLE = (
2223 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
2224 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
2225 )
2226
e05f6939 2227 def fix_kv(m):
e7b6d122
PH
2228 v = m.group(0)
2229 if v in ('true', 'false', 'null'):
2230 return v
b3ee552e 2231 elif v.startswith('/*') or v.startswith('//') or v == ',':
bd1e4844 2232 return ""
2233
2234 if v[0] in ("'", '"'):
2235 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 2236 '"': '\\"',
bd1e4844 2237 "\\'": "'",
2238 '\\\n': '',
2239 '\\x': '\\u00',
2240 }.get(m.group(0), m.group(0)), v[1:-1])
2241
89ac4a19
S
2242 for regex, base in INTEGER_TABLE:
2243 im = re.match(regex, v)
2244 if im:
e4659b45 2245 i = int(im.group(1), base)
89ac4a19
S
2246 return '"%d":' % i if v.endswith(':') else '%d' % i
2247
e7b6d122 2248 return '"%s"' % v
e05f6939 2249
bd1e4844 2250 return re.sub(r'''(?sx)
2251 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2252 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4195096e 2253 {comment}|,(?={skip}[\]}}])|
bd1e4844 2254 [a-zA-Z_][.a-zA-Z_0-9]*|
4195096e
S
2255 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
2256 [0-9]+(?={skip}:)
2257 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
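
# Illustrative usage sketch (added for review, not part of the original
# module); the _example_* helper name is hypothetical.
def _example_js_to_json_usage():
    # Bare keys, single quotes, hex literals and trailing commas are fixed up
    # into strict JSON.
    assert js_to_json("{key: 'value',}") == '{"key": "value"}'
    assert js_to_json('{val: 0x40}') == '{"val": 64}'
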
e05f6939
PH
2258
2259
478c2c61
PH
2260def qualities(quality_ids):
2261 """ Get a numeric quality value out of a list of possible values """
2262 def q(qid):
2263 try:
2264 return quality_ids.index(qid)
2265 except ValueError:
2266 return -1
2267 return q
2268
acd69589
PH
2269
2270DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 2271
a020a0dc
PH
2272
2273def limit_length(s, length):
2274 """ Add ellipses to overly long strings """
2275 if s is None:
2276 return None
2277 ELLIPSES = '...'
2278 if len(s) > length:
2279 return s[:length - len(ELLIPSES)] + ELLIPSES
2280 return s
48844745
PH
2281
2282
2283def version_tuple(v):
5f9b8394 2284 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
2285
2286
2287def is_outdated_version(version, limit, assume_new=True):
2288 if not version:
2289 return not assume_new
2290 try:
2291 return version_tuple(version) < version_tuple(limit)
2292 except ValueError:
2293 return not assume_new
732ea2f0
PH
2294
2295
2296def ytdl_is_updateable():
2297 """ Returns if youtube-dl can be updated with -U """
2298 from zipimport import zipimporter
2299
2300 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
7d4111ed
PH
2301
2302
2303def args_to_str(args):
2304 # Get a short string representation for a subprocess command
702ccf2d 2305 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
2306
2307
9b9c5355 2308def error_to_compat_str(err):
fdae2358
S
2309 err_str = str(err)
2310 # On Python 2 the error byte string must be decoded with a proper
2311 # encoding rather than ascii
2312 if sys.version_info[0] < 3:
2313 err_str = err_str.decode(preferredencoding())
2314 return err_str
2315
2316
c460bdd5 2317def mimetype2ext(mt):
eb9ee194
S
2318 if mt is None:
2319 return None
2320
765ac263
JMF
2321 ext = {
2322 'audio/mp4': 'm4a',
6c33d24b
YCH
2323 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2324 # it's the most popular one
2325 'audio/mpeg': 'mp3',
765ac263
JMF
2326 }.get(mt)
2327 if ext is not None:
2328 return ext
2329
c460bdd5 2330 _, _, res = mt.rpartition('/')
6562d34a 2331 res = res.split(';')[0].strip().lower()
c460bdd5
PH
2332
2333 return {
f6861ec9 2334 '3gpp': '3gp',
cafcf657 2335 'smptett+xml': 'tt',
cafcf657 2336 'ttaf+xml': 'dfxp',
a0d8d704 2337 'ttml+xml': 'ttml',
f6861ec9 2338 'x-flv': 'flv',
a0d8d704
YCH
2339 'x-mp4-fragmented': 'mp4',
2340 'x-ms-wmv': 'wmv',
b4173f15
RA
2341 'mpegurl': 'm3u8',
2342 'x-mpegurl': 'm3u8',
2343 'vnd.apple.mpegurl': 'm3u8',
2344 'dash+xml': 'mpd',
b4173f15 2345 'f4m+xml': 'f4m',
f164b971 2346 'hds+xml': 'f4m',
e910fe2f 2347 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 2348 'quicktime': 'mov',
98ce1a3f 2349 'mp2t': 'ts',
c460bdd5
PH
2350 }.get(res, res)
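
# Illustrative usage sketch (added for review, not part of the original
# module); the _example_* helper name is hypothetical.
def _example_mimetype2ext_usage():
    # MIME parameters are ignored and a few vendor types are special-cased.
    assert mimetype2ext('audio/mp4') == 'm4a'
    assert mimetype2ext('application/x-mpegURL; charset=UTF-8') == 'm3u8'
    assert mimetype2ext('video/webm') == 'webm'
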
2351
2352
4f3c5e06 2353def parse_codecs(codecs_str):
2354 # http://tools.ietf.org/html/rfc6381
2355 if not codecs_str:
2356 return {}
2357 split_codecs = list(filter(None, map(
2358 lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
2359 vcodec, acodec = None, None
2360 for full_codec in split_codecs:
2361 codec = full_codec.split('.')[0]
2362 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
2363 if not vcodec:
2364 vcodec = full_codec
60f5c9fb 2365 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
4f3c5e06 2366 if not acodec:
2367 acodec = full_codec
2368 else:
60f5c9fb 2369 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
4f3c5e06 2370 if not vcodec and not acodec:
2371 if len(split_codecs) == 2:
2372 return {
2373 'vcodec': vcodec,
2374 'acodec': acodec,
2375 }
2376 elif len(split_codecs) == 1:
2377 return {
2378 'vcodec': 'none',
2379 'acodec': vcodec,
2380 }
2381 else:
2382 return {
2383 'vcodec': vcodec or 'none',
2384 'acodec': acodec or 'none',
2385 }
2386 return {}
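
# Illustrative usage sketch (added for review, not part of the original
# module); the _example_* helper name is hypothetical.
def _example_parse_codecs_usage():
    # Typical DASH/HLS CODECS attributes split into a video and an audio codec.
    assert parse_codecs('avc1.4d401e, mp4a.40.2') == {'vcodec': 'avc1.4d401e', 'acodec': 'mp4a.40.2'}
    assert parse_codecs('mp4a.40.2') == {'vcodec': 'none', 'acodec': 'mp4a.40.2'}
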
2387
2388
2ccd1b10 2389def urlhandle_detect_ext(url_handle):
79298173 2390 getheader = url_handle.headers.get
2ccd1b10 2391
b55ee18f
PH
2392 cd = getheader('Content-Disposition')
2393 if cd:
2394 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2395 if m:
2396 e = determine_ext(m.group('filename'), default_ext=None)
2397 if e:
2398 return e
2399
c460bdd5 2400 return mimetype2ext(getheader('Content-Type'))
05900629
PH
2401
2402
1e399778
YCH
2403def encode_data_uri(data, mime_type):
2404 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
2405
2406
05900629 2407def age_restricted(content_limit, age_limit):
6ec6cb4e 2408 """ Returns True iff the content should be blocked """
05900629
PH
2409
2410 if age_limit is None: # No limit set
2411 return False
2412 if content_limit is None:
2413 return False # Content available for everyone
2414 return age_limit < content_limit
61ca9a80
PH
2415
2416
2417def is_html(first_bytes):
2418 """ Detect whether a file contains HTML by examining its first bytes. """
2419
2420 BOMS = [
2421 (b'\xef\xbb\xbf', 'utf-8'),
2422 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2423 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2424 (b'\xff\xfe', 'utf-16-le'),
2425 (b'\xfe\xff', 'utf-16-be'),
2426 ]
2427 for bom, enc in BOMS:
2428 if first_bytes.startswith(bom):
2429 s = first_bytes[len(bom):].decode(enc, 'replace')
2430 break
2431 else:
2432 s = first_bytes.decode('utf-8', 'replace')
2433
2434 return re.match(r'^\s*<', s)
a055469f
PH
2435
2436
2437def determine_protocol(info_dict):
2438 protocol = info_dict.get('protocol')
2439 if protocol is not None:
2440 return protocol
2441
2442 url = info_dict['url']
2443 if url.startswith('rtmp'):
2444 return 'rtmp'
2445 elif url.startswith('mms'):
2446 return 'mms'
2447 elif url.startswith('rtsp'):
2448 return 'rtsp'
2449
2450 ext = determine_ext(url)
2451 if ext == 'm3u8':
2452 return 'm3u8'
2453 elif ext == 'f4m':
2454 return 'f4m'
2455
2456 return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2457
2458
2459def render_table(header_row, data):
2460 """ Render a list of rows, each as a list of values """
2461 table = [header_row] + data
2462 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
2463 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
2464 return '\n'.join(format_str % tuple(row) for row in table)
347de493
PH
2465
2466
2467def _match_one(filter_part, dct):
2468 COMPARISON_OPERATORS = {
2469 '<': operator.lt,
2470 '<=': operator.le,
2471 '>': operator.gt,
2472 '>=': operator.ge,
2473 '=': operator.eq,
2474 '!=': operator.ne,
2475 }
2476 operator_rex = re.compile(r'''(?x)\s*
2477 (?P<key>[a-z_]+)
2478 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2479 (?:
2480 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
db13c16e 2481 (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
347de493
PH
2482 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2483 )
2484 \s*$
2485 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2486 m = operator_rex.search(filter_part)
2487 if m:
2488 op = COMPARISON_OPERATORS[m.group('op')]
e5a088dc 2489 actual_value = dct.get(m.group('key'))
db13c16e
S
2490 if (m.group('quotedstrval') is not None or
2491 m.group('strval') is not None or
e5a088dc
S
2492 # If the original field is a string and the matching comparison value is
2493 # a number, we should respect the origin of the original field
2494 # and process comparison value as a string (see
2495 # https://github.com/rg3/youtube-dl/issues/11082).
2496 actual_value is not None and m.group('intval') is not None and
2497 isinstance(actual_value, compat_str)):
347de493
PH
2498 if m.group('op') not in ('=', '!='):
2499 raise ValueError(
2500 'Operator %s does not support string values!' % m.group('op'))
db13c16e
S
2501 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
2502 quote = m.group('quote')
2503 if quote is not None:
2504 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
347de493
PH
2505 else:
2506 try:
2507 comparison_value = int(m.group('intval'))
2508 except ValueError:
2509 comparison_value = parse_filesize(m.group('intval'))
2510 if comparison_value is None:
2511 comparison_value = parse_filesize(m.group('intval') + 'B')
2512 if comparison_value is None:
2513 raise ValueError(
2514 'Invalid integer value %r in filter part %r' % (
2515 m.group('intval'), filter_part))
347de493
PH
2516 if actual_value is None:
2517 return m.group('none_inclusive')
2518 return op(actual_value, comparison_value)
2519
2520 UNARY_OPERATORS = {
2521 '': lambda v: v is not None,
2522 '!': lambda v: v is None,
2523 }
2524 operator_rex = re.compile(r'''(?x)\s*
2525 (?P<op>%s)\s*(?P<key>[a-z_]+)
2526 \s*$
2527 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2528 m = operator_rex.search(filter_part)
2529 if m:
2530 op = UNARY_OPERATORS[m.group('op')]
2531 actual_value = dct.get(m.group('key'))
2532 return op(actual_value)
2533
2534 raise ValueError('Invalid filter part %r' % filter_part)
2535
2536
2537def match_str(filter_str, dct):
2538 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2539
2540 return all(
2541 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
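
# Illustrative usage sketch (added for review, not part of the original
# module); the _example_* helper and the sample dict are hypothetical.
def _example_match_str_usage():
    # '&'-joined clauses must all hold; a trailing '?' on the operator makes
    # a clause pass when the field is missing.
    video = {'like_count': 190, 'dislike_count': 10}
    assert match_str('like_count > 100 & dislike_count <? 50', video)
    assert not match_str('description', video)
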
2542
2543
2544def match_filter_func(filter_str):
2545 def _match_func(info_dict):
2546 if match_str(filter_str, info_dict):
2547 return None
2548 else:
2549 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2550 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2551 return _match_func
91410c9b
PH
2552
2553
bf6427d2
YCH
2554def parse_dfxp_time_expr(time_expr):
2555 if not time_expr:
d631d5f9 2556 return
bf6427d2
YCH
2557
2558 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2559 if mobj:
2560 return float(mobj.group('time_offset'))
2561
db2fe38b 2562 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 2563 if mobj:
db2fe38b 2564 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
2565
2566
c1c924ab
YCH
2567def srt_subtitles_timecode(seconds):
2568 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
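
# Illustrative usage sketch (added for review, not part of the original
# module); the _example_* helper name is hypothetical.
def _example_subtitle_time_usage():
    # DFXP time expressions become float seconds; srt_subtitles_timecode
    # renders seconds in SubRip's HH:MM:SS,mmm format.
    assert parse_dfxp_time_expr('0.1') == 0.1
    assert parse_dfxp_time_expr('00:00:01.100') == 1.1
    assert srt_subtitles_timecode(3723.5) == '01:02:03,500'
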
bf6427d2
YCH
2569
2570
2571def dfxp2srt(dfxp_data):
5b995f71
RA
2572 LEGACY_NAMESPACES = (
2573 ('http://www.w3.org/ns/ttml', [
2574 'http://www.w3.org/2004/11/ttaf1',
2575 'http://www.w3.org/2006/04/ttaf1',
2576 'http://www.w3.org/2006/10/ttaf1',
2577 ]),
2578 ('http://www.w3.org/ns/ttml#styling', [
2579 'http://www.w3.org/ns/ttml#style',
2580 ]),
2581 )
2582
2583 SUPPORTED_STYLING = [
2584 'color',
2585 'fontFamily',
2586 'fontSize',
2587 'fontStyle',
2588 'fontWeight',
2589 'textDecoration'
2590 ]
2591
4e335771
YCH
2592 _x = functools.partial(xpath_with_ns, ns_map={
2593 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 2594 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 2595 })
bf6427d2 2596
5b995f71
RA
2597 styles = {}
2598 default_style = {}
2599
87de7069 2600 class TTMLPElementParser(object):
5b995f71
RA
2601 _out = ''
2602 _unclosed_elements = []
2603 _applied_styles = []
bf6427d2 2604
2b14cb56 2605 def start(self, tag, attrib):
5b995f71
RA
2606 if tag in (_x('ttml:br'), 'br'):
2607 self._out += '\n'
2608 else:
2609 unclosed_elements = []
2610 style = {}
2611 element_style_id = attrib.get('style')
2612 if default_style:
2613 style.update(default_style)
2614 if element_style_id:
2615 style.update(styles.get(element_style_id, {}))
2616 for prop in SUPPORTED_STYLING:
2617 prop_val = attrib.get(_x('tts:' + prop))
2618 if prop_val:
2619 style[prop] = prop_val
2620 if style:
2621 font = ''
2622 for k, v in sorted(style.items()):
2623 if self._applied_styles and self._applied_styles[-1].get(k) == v:
2624 continue
2625 if k == 'color':
2626 font += ' color="%s"' % v
2627 elif k == 'fontSize':
2628 font += ' size="%s"' % v
2629 elif k == 'fontFamily':
2630 font += ' face="%s"' % v
2631 elif k == 'fontWeight' and v == 'bold':
2632 self._out += '<b>'
2633 unclosed_elements.append('b')
2634 elif k == 'fontStyle' and v == 'italic':
2635 self._out += '<i>'
2636 unclosed_elements.append('i')
2637 elif k == 'textDecoration' and v == 'underline':
2638 self._out += '<u>'
2639 unclosed_elements.append('u')
2640 if font:
2641 self._out += '<font' + font + '>'
2642 unclosed_elements.append('font')
2643 applied_style = {}
2644 if self._applied_styles:
2645 applied_style.update(self._applied_styles[-1])
2646 applied_style.update(style)
2647 self._applied_styles.append(applied_style)
2648 self._unclosed_elements.append(unclosed_elements)
bf6427d2 2649
2b14cb56 2650 def end(self, tag):
5b995f71
RA
2651 if tag not in (_x('ttml:br'), 'br'):
2652 unclosed_elements = self._unclosed_elements.pop()
2653 for element in reversed(unclosed_elements):
2654 self._out += '</%s>' % element
2655 if unclosed_elements and self._applied_styles:
2656 self._applied_styles.pop()
bf6427d2 2657
2b14cb56 2658 def data(self, data):
5b995f71 2659 self._out += data
2b14cb56 2660
2661 def close(self):
5b995f71 2662 return self._out.strip()
2b14cb56 2663
2664 def parse_node(node):
2665 target = TTMLPElementParser()
2666 parser = xml.etree.ElementTree.XMLParser(target=target)
2667 parser.feed(xml.etree.ElementTree.tostring(node))
2668 return parser.close()
bf6427d2 2669
5b995f71
RA
2670 for k, v in LEGACY_NAMESPACES:
2671 for ns in v:
2672 dfxp_data = dfxp_data.replace(ns, k)
2673
36e6f62c 2674 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
bf6427d2 2675 out = []
5b995f71 2676 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
2677
2678 if not paras:
2679 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 2680
5b995f71
RA
2681 repeat = False
2682 while True:
2683 for style in dfxp.findall(_x('.//ttml:style')):
2684 style_id = style.get('id')
2685 parent_style_id = style.get('style')
2686 if parent_style_id:
2687 if parent_style_id not in styles:
2688 repeat = True
2689 continue
2690 styles[style_id] = styles[parent_style_id].copy()
2691 for prop in SUPPORTED_STYLING:
2692 prop_val = style.get(_x('tts:' + prop))
2693 if prop_val:
2694 styles.setdefault(style_id, {})[prop] = prop_val
2695 if repeat:
2696 repeat = False
2697 else:
2698 break
2699
2700 for p in ('body', 'div'):
2701 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
2702 if ele is None:
2703 continue
2704 style = styles.get(ele.get('style'))
2705 if not style:
2706 continue
2707 default_style.update(style)
2708
bf6427d2 2709 for para, index in zip(paras, itertools.count(1)):
d631d5f9 2710 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 2711 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
2712 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2713 if begin_time is None:
2714 continue
7dff0363 2715 if not end_time:
d631d5f9
YCH
2716 if not dur:
2717 continue
2718 end_time = begin_time + dur
bf6427d2
YCH
2719 out.append('%d\n%s --> %s\n%s\n\n' % (
2720 index,
c1c924ab
YCH
2721 srt_subtitles_timecode(begin_time),
2722 srt_subtitles_timecode(end_time),
bf6427d2
YCH
2723 parse_node(para)))
2724
2725 return ''.join(out)
2726
2727
66e289ba
S
2728def cli_option(params, command_option, param):
2729 param = params.get(param)
98e698f1
RA
2730 if param:
2731 param = compat_str(param)
66e289ba
S
2732 return [command_option, param] if param is not None else []
2733
2734
2735def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2736 param = params.get(param)
2737 assert isinstance(param, bool)
2738 if separator:
2739 return [command_option + separator + (true_value if param else false_value)]
2740 return [command_option, true_value if param else false_value]
2741
2742
2743def cli_valueless_option(params, command_option, param, expected_value=True):
2744 param = params.get(param)
2745 return [command_option] if param == expected_value else []
2746
2747
2748def cli_configuration_args(params, param, default=[]):
2749 ex_args = params.get(param)
2750 if ex_args is None:
2751 return default
2752 assert isinstance(ex_args, list)
2753 return ex_args
2754
2755
39672624
YCH
2756class ISO639Utils(object):
2757 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2758 _lang_map = {
2759 'aa': 'aar',
2760 'ab': 'abk',
2761 'ae': 'ave',
2762 'af': 'afr',
2763 'ak': 'aka',
2764 'am': 'amh',
2765 'an': 'arg',
2766 'ar': 'ara',
2767 'as': 'asm',
2768 'av': 'ava',
2769 'ay': 'aym',
2770 'az': 'aze',
2771 'ba': 'bak',
2772 'be': 'bel',
2773 'bg': 'bul',
2774 'bh': 'bih',
2775 'bi': 'bis',
2776 'bm': 'bam',
2777 'bn': 'ben',
2778 'bo': 'bod',
2779 'br': 'bre',
2780 'bs': 'bos',
2781 'ca': 'cat',
2782 'ce': 'che',
2783 'ch': 'cha',
2784 'co': 'cos',
2785 'cr': 'cre',
2786 'cs': 'ces',
2787 'cu': 'chu',
2788 'cv': 'chv',
2789 'cy': 'cym',
2790 'da': 'dan',
2791 'de': 'deu',
2792 'dv': 'div',
2793 'dz': 'dzo',
2794 'ee': 'ewe',
2795 'el': 'ell',
2796 'en': 'eng',
2797 'eo': 'epo',
2798 'es': 'spa',
2799 'et': 'est',
2800 'eu': 'eus',
2801 'fa': 'fas',
2802 'ff': 'ful',
2803 'fi': 'fin',
2804 'fj': 'fij',
2805 'fo': 'fao',
2806 'fr': 'fra',
2807 'fy': 'fry',
2808 'ga': 'gle',
2809 'gd': 'gla',
2810 'gl': 'glg',
2811 'gn': 'grn',
2812 'gu': 'guj',
2813 'gv': 'glv',
2814 'ha': 'hau',
2815 'he': 'heb',
2816 'hi': 'hin',
2817 'ho': 'hmo',
2818 'hr': 'hrv',
2819 'ht': 'hat',
2820 'hu': 'hun',
2821 'hy': 'hye',
2822 'hz': 'her',
2823 'ia': 'ina',
2824 'id': 'ind',
2825 'ie': 'ile',
2826 'ig': 'ibo',
2827 'ii': 'iii',
2828 'ik': 'ipk',
2829 'io': 'ido',
2830 'is': 'isl',
2831 'it': 'ita',
2832 'iu': 'iku',
2833 'ja': 'jpn',
2834 'jv': 'jav',
2835 'ka': 'kat',
2836 'kg': 'kon',
2837 'ki': 'kik',
2838 'kj': 'kua',
2839 'kk': 'kaz',
2840 'kl': 'kal',
2841 'km': 'khm',
2842 'kn': 'kan',
2843 'ko': 'kor',
2844 'kr': 'kau',
2845 'ks': 'kas',
2846 'ku': 'kur',
2847 'kv': 'kom',
2848 'kw': 'cor',
2849 'ky': 'kir',
2850 'la': 'lat',
2851 'lb': 'ltz',
2852 'lg': 'lug',
2853 'li': 'lim',
2854 'ln': 'lin',
2855 'lo': 'lao',
2856 'lt': 'lit',
2857 'lu': 'lub',
2858 'lv': 'lav',
2859 'mg': 'mlg',
2860 'mh': 'mah',
2861 'mi': 'mri',
2862 'mk': 'mkd',
2863 'ml': 'mal',
2864 'mn': 'mon',
2865 'mr': 'mar',
2866 'ms': 'msa',
2867 'mt': 'mlt',
2868 'my': 'mya',
2869 'na': 'nau',
2870 'nb': 'nob',
2871 'nd': 'nde',
2872 'ne': 'nep',
2873 'ng': 'ndo',
2874 'nl': 'nld',
2875 'nn': 'nno',
2876 'no': 'nor',
2877 'nr': 'nbl',
2878 'nv': 'nav',
2879 'ny': 'nya',
2880 'oc': 'oci',
2881 'oj': 'oji',
2882 'om': 'orm',
2883 'or': 'ori',
2884 'os': 'oss',
2885 'pa': 'pan',
2886 'pi': 'pli',
2887 'pl': 'pol',
2888 'ps': 'pus',
2889 'pt': 'por',
2890 'qu': 'que',
2891 'rm': 'roh',
2892 'rn': 'run',
2893 'ro': 'ron',
2894 'ru': 'rus',
2895 'rw': 'kin',
2896 'sa': 'san',
2897 'sc': 'srd',
2898 'sd': 'snd',
2899 'se': 'sme',
2900 'sg': 'sag',
2901 'si': 'sin',
2902 'sk': 'slk',
2903 'sl': 'slv',
2904 'sm': 'smo',
2905 'sn': 'sna',
2906 'so': 'som',
2907 'sq': 'sqi',
2908 'sr': 'srp',
2909 'ss': 'ssw',
2910 'st': 'sot',
2911 'su': 'sun',
2912 'sv': 'swe',
2913 'sw': 'swa',
2914 'ta': 'tam',
2915 'te': 'tel',
2916 'tg': 'tgk',
2917 'th': 'tha',
2918 'ti': 'tir',
2919 'tk': 'tuk',
2920 'tl': 'tgl',
2921 'tn': 'tsn',
2922 'to': 'ton',
2923 'tr': 'tur',
2924 'ts': 'tso',
2925 'tt': 'tat',
2926 'tw': 'twi',
2927 'ty': 'tah',
2928 'ug': 'uig',
2929 'uk': 'ukr',
2930 'ur': 'urd',
2931 'uz': 'uzb',
2932 've': 'ven',
2933 'vi': 'vie',
2934 'vo': 'vol',
2935 'wa': 'wln',
2936 'wo': 'wol',
2937 'xh': 'xho',
2938 'yi': 'yid',
2939 'yo': 'yor',
2940 'za': 'zha',
2941 'zh': 'zho',
2942 'zu': 'zul',
2943 }
2944
2945 @classmethod
2946 def short2long(cls, code):
2947 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2948 return cls._lang_map.get(code[:2])
2949
2950 @classmethod
2951 def long2short(cls, code):
2952 """Convert language code from ISO 639-2/T to ISO 639-1"""
2953 for short_name, long_name in cls._lang_map.items():
2954 if long_name == code:
2955 return short_name
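
# Illustrative usage sketch (added for review, not part of the original
# module); the _example_* helper name is hypothetical.
def _example_iso639_usage():
    # Two-letter (ISO 639-1) and three-letter (ISO 639-2/T) codes convert
    # in both directions.
    assert ISO639Utils.short2long('de') == 'deu'
    assert ISO639Utils.long2short('deu') == 'de'
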
2956
2957
4eb10f66
YCH
2958class ISO3166Utils(object):
2959 # From http://data.okfn.org/data/core/country-list
2960 _country_map = {
2961 'AF': 'Afghanistan',
2962 'AX': 'Åland Islands',
2963 'AL': 'Albania',
2964 'DZ': 'Algeria',
2965 'AS': 'American Samoa',
2966 'AD': 'Andorra',
2967 'AO': 'Angola',
2968 'AI': 'Anguilla',
2969 'AQ': 'Antarctica',
2970 'AG': 'Antigua and Barbuda',
2971 'AR': 'Argentina',
2972 'AM': 'Armenia',
2973 'AW': 'Aruba',
2974 'AU': 'Australia',
2975 'AT': 'Austria',
2976 'AZ': 'Azerbaijan',
2977 'BS': 'Bahamas',
2978 'BH': 'Bahrain',
2979 'BD': 'Bangladesh',
2980 'BB': 'Barbados',
2981 'BY': 'Belarus',
2982 'BE': 'Belgium',
2983 'BZ': 'Belize',
2984 'BJ': 'Benin',
2985 'BM': 'Bermuda',
2986 'BT': 'Bhutan',
2987 'BO': 'Bolivia, Plurinational State of',
2988 'BQ': 'Bonaire, Sint Eustatius and Saba',
2989 'BA': 'Bosnia and Herzegovina',
2990 'BW': 'Botswana',
2991 'BV': 'Bouvet Island',
2992 'BR': 'Brazil',
2993 'IO': 'British Indian Ocean Territory',
2994 'BN': 'Brunei Darussalam',
2995 'BG': 'Bulgaria',
2996 'BF': 'Burkina Faso',
2997 'BI': 'Burundi',
2998 'KH': 'Cambodia',
2999 'CM': 'Cameroon',
3000 'CA': 'Canada',
3001 'CV': 'Cape Verde',
3002 'KY': 'Cayman Islands',
3003 'CF': 'Central African Republic',
3004 'TD': 'Chad',
3005 'CL': 'Chile',
3006 'CN': 'China',
3007 'CX': 'Christmas Island',
3008 'CC': 'Cocos (Keeling) Islands',
3009 'CO': 'Colombia',
3010 'KM': 'Comoros',
3011 'CG': 'Congo',
3012 'CD': 'Congo, the Democratic Republic of the',
3013 'CK': 'Cook Islands',
3014 'CR': 'Costa Rica',
3015 'CI': 'Côte d\'Ivoire',
3016 'HR': 'Croatia',
3017 'CU': 'Cuba',
3018 'CW': 'Curaçao',
3019 'CY': 'Cyprus',
3020 'CZ': 'Czech Republic',
3021 'DK': 'Denmark',
3022 'DJ': 'Djibouti',
3023 'DM': 'Dominica',
3024 'DO': 'Dominican Republic',
3025 'EC': 'Ecuador',
3026 'EG': 'Egypt',
3027 'SV': 'El Salvador',
3028 'GQ': 'Equatorial Guinea',
3029 'ER': 'Eritrea',
3030 'EE': 'Estonia',
3031 'ET': 'Ethiopia',
3032 'FK': 'Falkland Islands (Malvinas)',
3033 'FO': 'Faroe Islands',
3034 'FJ': 'Fiji',
3035 'FI': 'Finland',
3036 'FR': 'France',
3037 'GF': 'French Guiana',
3038 'PF': 'French Polynesia',
3039 'TF': 'French Southern Territories',
3040 'GA': 'Gabon',
3041 'GM': 'Gambia',
3042 'GE': 'Georgia',
3043 'DE': 'Germany',
3044 'GH': 'Ghana',
3045 'GI': 'Gibraltar',
3046 'GR': 'Greece',
3047 'GL': 'Greenland',
3048 'GD': 'Grenada',
3049 'GP': 'Guadeloupe',
3050 'GU': 'Guam',
3051 'GT': 'Guatemala',
3052 'GG': 'Guernsey',
3053 'GN': 'Guinea',
3054 'GW': 'Guinea-Bissau',
3055 'GY': 'Guyana',
3056 'HT': 'Haiti',
3057 'HM': 'Heard Island and McDonald Islands',
3058 'VA': 'Holy See (Vatican City State)',
3059 'HN': 'Honduras',
3060 'HK': 'Hong Kong',
3061 'HU': 'Hungary',
3062 'IS': 'Iceland',
3063 'IN': 'India',
3064 'ID': 'Indonesia',
3065 'IR': 'Iran, Islamic Republic of',
3066 'IQ': 'Iraq',
3067 'IE': 'Ireland',
3068 'IM': 'Isle of Man',
3069 'IL': 'Israel',
3070 'IT': 'Italy',
3071 'JM': 'Jamaica',
3072 'JP': 'Japan',
3073 'JE': 'Jersey',
3074 'JO': 'Jordan',
3075 'KZ': 'Kazakhstan',
3076 'KE': 'Kenya',
3077 'KI': 'Kiribati',
3078 'KP': 'Korea, Democratic People\'s Republic of',
3079 'KR': 'Korea, Republic of',
3080 'KW': 'Kuwait',
3081 'KG': 'Kyrgyzstan',
3082 'LA': 'Lao People\'s Democratic Republic',
3083 'LV': 'Latvia',
3084 'LB': 'Lebanon',
3085 'LS': 'Lesotho',
3086 'LR': 'Liberia',
3087 'LY': 'Libya',
3088 'LI': 'Liechtenstein',
3089 'LT': 'Lithuania',
3090 'LU': 'Luxembourg',
3091 'MO': 'Macao',
3092 'MK': 'Macedonia, the Former Yugoslav Republic of',
3093 'MG': 'Madagascar',
3094 'MW': 'Malawi',
3095 'MY': 'Malaysia',
3096 'MV': 'Maldives',
3097 'ML': 'Mali',
3098 'MT': 'Malta',
3099 'MH': 'Marshall Islands',
3100 'MQ': 'Martinique',
3101 'MR': 'Mauritania',
3102 'MU': 'Mauritius',
3103 'YT': 'Mayotte',
3104 'MX': 'Mexico',
3105 'FM': 'Micronesia, Federated States of',
3106 'MD': 'Moldova, Republic of',
3107 'MC': 'Monaco',
3108 'MN': 'Mongolia',
3109 'ME': 'Montenegro',
3110 'MS': 'Montserrat',
3111 'MA': 'Morocco',
3112 'MZ': 'Mozambique',
3113 'MM': 'Myanmar',
3114 'NA': 'Namibia',
3115 'NR': 'Nauru',
3116 'NP': 'Nepal',
3117 'NL': 'Netherlands',
3118 'NC': 'New Caledonia',
3119 'NZ': 'New Zealand',
3120 'NI': 'Nicaragua',
3121 'NE': 'Niger',
3122 'NG': 'Nigeria',
3123 'NU': 'Niue',
3124 'NF': 'Norfolk Island',
3125 'MP': 'Northern Mariana Islands',
3126 'NO': 'Norway',
3127 'OM': 'Oman',
3128 'PK': 'Pakistan',
3129 'PW': 'Palau',
3130 'PS': 'Palestine, State of',
3131 'PA': 'Panama',
3132 'PG': 'Papua New Guinea',
3133 'PY': 'Paraguay',
3134 'PE': 'Peru',
3135 'PH': 'Philippines',
3136 'PN': 'Pitcairn',
3137 'PL': 'Poland',
3138 'PT': 'Portugal',
3139 'PR': 'Puerto Rico',
3140 'QA': 'Qatar',
3141 'RE': 'Réunion',
3142 'RO': 'Romania',
3143 'RU': 'Russian Federation',
3144 'RW': 'Rwanda',
3145 'BL': 'Saint Barthélemy',
3146 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3147 'KN': 'Saint Kitts and Nevis',
3148 'LC': 'Saint Lucia',
3149 'MF': 'Saint Martin (French part)',
3150 'PM': 'Saint Pierre and Miquelon',
3151 'VC': 'Saint Vincent and the Grenadines',
3152 'WS': 'Samoa',
3153 'SM': 'San Marino',
3154 'ST': 'Sao Tome and Principe',
3155 'SA': 'Saudi Arabia',
3156 'SN': 'Senegal',
3157 'RS': 'Serbia',
3158 'SC': 'Seychelles',
3159 'SL': 'Sierra Leone',
3160 'SG': 'Singapore',
3161 'SX': 'Sint Maarten (Dutch part)',
3162 'SK': 'Slovakia',
3163 'SI': 'Slovenia',
3164 'SB': 'Solomon Islands',
3165 'SO': 'Somalia',
3166 'ZA': 'South Africa',
3167 'GS': 'South Georgia and the South Sandwich Islands',
3168 'SS': 'South Sudan',
3169 'ES': 'Spain',
3170 'LK': 'Sri Lanka',
3171 'SD': 'Sudan',
3172 'SR': 'Suriname',
3173 'SJ': 'Svalbard and Jan Mayen',
3174 'SZ': 'Swaziland',
3175 'SE': 'Sweden',
3176 'CH': 'Switzerland',
3177 'SY': 'Syrian Arab Republic',
3178 'TW': 'Taiwan, Province of China',
3179 'TJ': 'Tajikistan',
3180 'TZ': 'Tanzania, United Republic of',
3181 'TH': 'Thailand',
3182 'TL': 'Timor-Leste',
3183 'TG': 'Togo',
3184 'TK': 'Tokelau',
3185 'TO': 'Tonga',
3186 'TT': 'Trinidad and Tobago',
3187 'TN': 'Tunisia',
3188 'TR': 'Turkey',
3189 'TM': 'Turkmenistan',
3190 'TC': 'Turks and Caicos Islands',
3191 'TV': 'Tuvalu',
3192 'UG': 'Uganda',
3193 'UA': 'Ukraine',
3194 'AE': 'United Arab Emirates',
3195 'GB': 'United Kingdom',
3196 'US': 'United States',
3197 'UM': 'United States Minor Outlying Islands',
3198 'UY': 'Uruguay',
3199 'UZ': 'Uzbekistan',
3200 'VU': 'Vanuatu',
3201 'VE': 'Venezuela, Bolivarian Republic of',
3202 'VN': 'Viet Nam',
3203 'VG': 'Virgin Islands, British',
3204 'VI': 'Virgin Islands, U.S.',
3205 'WF': 'Wallis and Futuna',
3206 'EH': 'Western Sahara',
3207 'YE': 'Yemen',
3208 'ZM': 'Zambia',
3209 'ZW': 'Zimbabwe',
3210 }
3211
3212 @classmethod
3213 def short2full(cls, code):
3214 """Convert an ISO 3166-2 country code to the corresponding full name"""
3215 return cls._country_map.get(code.upper())
3216
3217
773f291d
S
3218class GeoUtils(object):
3219 # Major IPv4 address blocks per country
3220 _country_ip_map = {
3221 'AD': '85.94.160.0/19',
3222 'AE': '94.200.0.0/13',
3223 'AF': '149.54.0.0/17',
3224 'AG': '209.59.64.0/18',
3225 'AI': '204.14.248.0/21',
3226 'AL': '46.99.0.0/16',
3227 'AM': '46.70.0.0/15',
3228 'AO': '105.168.0.0/13',
3229 'AP': '159.117.192.0/21',
3230 'AR': '181.0.0.0/12',
3231 'AS': '202.70.112.0/20',
3232 'AT': '84.112.0.0/13',
3233 'AU': '1.128.0.0/11',
3234 'AW': '181.41.0.0/18',
3235 'AZ': '5.191.0.0/16',
3236 'BA': '31.176.128.0/17',
3237 'BB': '65.48.128.0/17',
3238 'BD': '114.130.0.0/16',
3239 'BE': '57.0.0.0/8',
3240 'BF': '129.45.128.0/17',
3241 'BG': '95.42.0.0/15',
3242 'BH': '37.131.0.0/17',
3243 'BI': '154.117.192.0/18',
3244 'BJ': '137.255.0.0/16',
3245 'BL': '192.131.134.0/24',
3246 'BM': '196.12.64.0/18',
3247 'BN': '156.31.0.0/16',
3248 'BO': '161.56.0.0/16',
3249 'BQ': '161.0.80.0/20',
3250 'BR': '152.240.0.0/12',
3251 'BS': '24.51.64.0/18',
3252 'BT': '119.2.96.0/19',
3253 'BW': '168.167.0.0/16',
3254 'BY': '178.120.0.0/13',
3255 'BZ': '179.42.192.0/18',
3256 'CA': '99.224.0.0/11',
3257 'CD': '41.243.0.0/16',
3258 'CF': '196.32.200.0/21',
3259 'CG': '197.214.128.0/17',
3260 'CH': '85.0.0.0/13',
3261 'CI': '154.232.0.0/14',
3262 'CK': '202.65.32.0/19',
3263 'CL': '152.172.0.0/14',
3264 'CM': '165.210.0.0/15',
3265 'CN': '36.128.0.0/10',
3266 'CO': '181.240.0.0/12',
3267 'CR': '201.192.0.0/12',
3268 'CU': '152.206.0.0/15',
3269 'CV': '165.90.96.0/19',
3270 'CW': '190.88.128.0/17',
3271 'CY': '46.198.0.0/15',
3272 'CZ': '88.100.0.0/14',
3273 'DE': '53.0.0.0/8',
3274 'DJ': '197.241.0.0/17',
3275 'DK': '87.48.0.0/12',
3276 'DM': '192.243.48.0/20',
3277 'DO': '152.166.0.0/15',
3278 'DZ': '41.96.0.0/12',
3279 'EC': '186.68.0.0/15',
3280 'EE': '90.190.0.0/15',
3281 'EG': '156.160.0.0/11',
3282 'ER': '196.200.96.0/20',
3283 'ES': '88.0.0.0/11',
3284 'ET': '196.188.0.0/14',
3285 'EU': '2.16.0.0/13',
3286 'FI': '91.152.0.0/13',
3287 'FJ': '144.120.0.0/16',
3288 'FM': '119.252.112.0/20',
3289 'FO': '88.85.32.0/19',
3290 'FR': '90.0.0.0/9',
3291 'GA': '41.158.0.0/15',
3292 'GB': '25.0.0.0/8',
3293 'GD': '74.122.88.0/21',
3294 'GE': '31.146.0.0/16',
3295 'GF': '161.22.64.0/18',
3296 'GG': '62.68.160.0/19',
3297 'GH': '45.208.0.0/14',
3298 'GI': '85.115.128.0/19',
3299 'GL': '88.83.0.0/19',
3300 'GM': '160.182.0.0/15',
3301 'GN': '197.149.192.0/18',
3302 'GP': '104.250.0.0/19',
3303 'GQ': '105.235.224.0/20',
3304 'GR': '94.64.0.0/13',
3305 'GT': '168.234.0.0/16',
3306 'GU': '168.123.0.0/16',
3307 'GW': '197.214.80.0/20',
3308 'GY': '181.41.64.0/18',
3309 'HK': '113.252.0.0/14',
3310 'HN': '181.210.0.0/16',
3311 'HR': '93.136.0.0/13',
3312 'HT': '148.102.128.0/17',
3313 'HU': '84.0.0.0/14',
3314 'ID': '39.192.0.0/10',
3315 'IE': '87.32.0.0/12',
3316 'IL': '79.176.0.0/13',
3317 'IM': '5.62.80.0/20',
3318 'IN': '117.192.0.0/10',
3319 'IO': '203.83.48.0/21',
3320 'IQ': '37.236.0.0/14',
3321 'IR': '2.176.0.0/12',
3322 'IS': '82.221.0.0/16',
3323 'IT': '79.0.0.0/10',
3324 'JE': '87.244.64.0/18',
3325 'JM': '72.27.0.0/17',
3326 'JO': '176.29.0.0/16',
3327 'JP': '126.0.0.0/8',
3328 'KE': '105.48.0.0/12',
3329 'KG': '158.181.128.0/17',
3330 'KH': '36.37.128.0/17',
3331 'KI': '103.25.140.0/22',
3332 'KM': '197.255.224.0/20',
3333 'KN': '198.32.32.0/19',
3334 'KP': '175.45.176.0/22',
3335 'KR': '175.192.0.0/10',
3336 'KW': '37.36.0.0/14',
3337 'KY': '64.96.0.0/15',
3338 'KZ': '2.72.0.0/13',
3339 'LA': '115.84.64.0/18',
3340 'LB': '178.135.0.0/16',
3341 'LC': '192.147.231.0/24',
3342 'LI': '82.117.0.0/19',
3343 'LK': '112.134.0.0/15',
3344 'LR': '41.86.0.0/19',
3345 'LS': '129.232.0.0/17',
3346 'LT': '78.56.0.0/13',
3347 'LU': '188.42.0.0/16',
3348 'LV': '46.109.0.0/16',
3349 'LY': '41.252.0.0/14',
3350 'MA': '105.128.0.0/11',
3351 'MC': '88.209.64.0/18',
3352 'MD': '37.246.0.0/16',
3353 'ME': '178.175.0.0/17',
3354 'MF': '74.112.232.0/21',
3355 'MG': '154.126.0.0/17',
3356 'MH': '117.103.88.0/21',
3357 'MK': '77.28.0.0/15',
3358 'ML': '154.118.128.0/18',
3359 'MM': '37.111.0.0/17',
3360 'MN': '49.0.128.0/17',
3361 'MO': '60.246.0.0/16',
3362 'MP': '202.88.64.0/20',
3363 'MQ': '109.203.224.0/19',
3364 'MR': '41.188.64.0/18',
3365 'MS': '208.90.112.0/22',
3366 'MT': '46.11.0.0/16',
3367 'MU': '105.16.0.0/12',
3368 'MV': '27.114.128.0/18',
3369 'MW': '105.234.0.0/16',
3370 'MX': '187.192.0.0/11',
3371 'MY': '175.136.0.0/13',
3372 'MZ': '197.218.0.0/15',
3373 'NA': '41.182.0.0/16',
3374 'NC': '101.101.0.0/18',
3375 'NE': '197.214.0.0/18',
3376 'NF': '203.17.240.0/22',
3377 'NG': '105.112.0.0/12',
3378 'NI': '186.76.0.0/15',
3379 'NL': '145.96.0.0/11',
3380 'NO': '84.208.0.0/13',
3381 'NP': '36.252.0.0/15',
3382 'NR': '203.98.224.0/19',
3383 'NU': '49.156.48.0/22',
3384 'NZ': '49.224.0.0/14',
3385 'OM': '5.36.0.0/15',
3386 'PA': '186.72.0.0/15',
3387 'PE': '186.160.0.0/14',
3388 'PF': '123.50.64.0/18',
3389 'PG': '124.240.192.0/19',
3390 'PH': '49.144.0.0/13',
3391 'PK': '39.32.0.0/11',
3392 'PL': '83.0.0.0/11',
3393 'PM': '70.36.0.0/20',
3394 'PR': '66.50.0.0/16',
3395 'PS': '188.161.0.0/16',
3396 'PT': '85.240.0.0/13',
3397 'PW': '202.124.224.0/20',
3398 'PY': '181.120.0.0/14',
3399 'QA': '37.210.0.0/15',
3400 'RE': '139.26.0.0/16',
3401 'RO': '79.112.0.0/13',
3402 'RS': '178.220.0.0/14',
3403 'RU': '5.136.0.0/13',
3404 'RW': '105.178.0.0/15',
3405 'SA': '188.48.0.0/13',
3406 'SB': '202.1.160.0/19',
3407 'SC': '154.192.0.0/11',
3408 'SD': '154.96.0.0/13',
3409 'SE': '78.64.0.0/12',
3410 'SG': '152.56.0.0/14',
3411 'SI': '188.196.0.0/14',
3412 'SK': '78.98.0.0/15',
3413 'SL': '197.215.0.0/17',
3414 'SM': '89.186.32.0/19',
3415 'SN': '41.82.0.0/15',
3416 'SO': '197.220.64.0/19',
3417 'SR': '186.179.128.0/17',
3418 'SS': '105.235.208.0/21',
3419 'ST': '197.159.160.0/19',
3420 'SV': '168.243.0.0/16',
3421 'SX': '190.102.0.0/20',
3422 'SY': '5.0.0.0/16',
3423 'SZ': '41.84.224.0/19',
3424 'TC': '65.255.48.0/20',
3425 'TD': '154.68.128.0/19',
3426 'TG': '196.168.0.0/14',
3427 'TH': '171.96.0.0/13',
3428 'TJ': '85.9.128.0/18',
3429 'TK': '27.96.24.0/21',
3430 'TL': '180.189.160.0/20',
3431 'TM': '95.85.96.0/19',
3432 'TN': '197.0.0.0/11',
3433 'TO': '175.176.144.0/21',
3434 'TR': '78.160.0.0/11',
3435 'TT': '186.44.0.0/15',
3436 'TV': '202.2.96.0/19',
3437 'TW': '120.96.0.0/11',
3438 'TZ': '156.156.0.0/14',
3439 'UA': '93.72.0.0/13',
3440 'UG': '154.224.0.0/13',
3441 'US': '3.0.0.0/8',
3442 'UY': '167.56.0.0/13',
3443 'UZ': '82.215.64.0/18',
3444 'VA': '212.77.0.0/19',
3445 'VC': '24.92.144.0/20',
3446 'VE': '186.88.0.0/13',
3447 'VG': '172.103.64.0/18',
3448 'VI': '146.226.0.0/16',
3449 'VN': '14.160.0.0/11',
3450 'VU': '202.80.32.0/20',
3451 'WF': '117.20.32.0/21',
3452 'WS': '202.4.32.0/19',
3453 'YE': '134.35.0.0/16',
3454 'YT': '41.242.116.0/22',
3455 'ZA': '41.0.0.0/11',
3456 'ZM': '165.56.0.0/13',
3457 'ZW': '41.85.192.0/19',
3458 }
3459
3460 @classmethod
3461 def random_ipv4(cls, code):
3462 block = cls._country_ip_map.get(code.upper())
3463 if not block:
3464 return None
3465 addr, preflen = block.split('/')
3466 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
3467 addr_max = addr_min | (0xffffffff >> int(preflen))
18a0defa 3468 return compat_str(socket.inet_ntoa(
4248dad9 3469 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
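
# Illustrative usage sketch (added for review, not part of the original
# module); the _example_* helper name is hypothetical.
def _example_geo_utils_usage():
    # random_ipv4 picks an address inside the country's major IPv4 block
    # (25.0.0.0/8 for 'GB' above); unknown codes give None.
    assert GeoUtils.random_ipv4('GB').startswith('25.')
    assert GeoUtils.random_ipv4('XX') is None
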
773f291d
S
3470
3471
91410c9b 3472class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2461f79d
PH
3473 def __init__(self, proxies=None):
3474 # Set default handlers
3475 for type in ('http', 'https'):
3476 setattr(self, '%s_open' % type,
3477 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
3478 meth(r, proxy, type))
3479 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
3480
91410c9b 3481 def proxy_open(self, req, proxy, type):
2461f79d 3482 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
3483 if req_proxy is not None:
3484 proxy = req_proxy
2461f79d
PH
3485 del req.headers['Ytdl-request-proxy']
3486
3487 if proxy == '__noproxy__':
3488 return None # No Proxy
51fb4995 3489 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188
YCH
3490 req.add_header('Ytdl-socks-proxy', proxy)
3491 # youtube-dl's http/https handlers do wrapping the socket with socks
3492 return None
91410c9b
PH
3493 return compat_urllib_request.ProxyHandler.proxy_open(
3494 self, req, proxy, type)
5bc880b9
YCH
3495
3496
0a5445dd
YCH
3497# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3498# released into Public Domain
3499# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3500
3501def long_to_bytes(n, blocksize=0):
3502 """long_to_bytes(n:long, blocksize:int) : string
3503 Convert a long integer to a byte string.
3504
3505 If optional blocksize is given and greater than zero, pad the front of the
3506 byte string with binary zeros so that the length is a multiple of
3507 blocksize.
3508 """
3509 # after much testing, this algorithm was deemed to be the fastest
3510 s = b''
3511 n = int(n)
3512 while n > 0:
3513 s = compat_struct_pack('>I', n & 0xffffffff) + s
3514 n = n >> 32
3515 # strip off leading zeros
3516 for i in range(len(s)):
3517 if s[i] != b'\000'[0]:
3518 break
3519 else:
3520 # only happens when n == 0
3521 s = b'\000'
3522 i = 0
3523 s = s[i:]
3524 # add back some pad bytes. this could be done more efficiently w.r.t. the
3525 # de-padding being done above, but sigh...
3526 if blocksize > 0 and len(s) % blocksize:
3527 s = (blocksize - len(s) % blocksize) * b'\000' + s
3528 return s
3529
3530
3531def bytes_to_long(s):
3532 """bytes_to_long(string) : long
3533 Convert a byte string to a long integer.
3534
3535 This is (essentially) the inverse of long_to_bytes().
3536 """
3537 acc = 0
3538 length = len(s)
3539 if length % 4:
3540 extra = (4 - length % 4)
3541 s = b'\000' * extra + s
3542 length = length + extra
3543 for i in range(0, length, 4):
3544 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
3545 return acc
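# Editorial sketch (not part of the original file): the two helpers above are
# exact inverses, mirroring PyCrypto's Crypto.Util.number module.
def _example_long_round_trip():
    n = 0xdeadbeefcafe
    packed = long_to_bytes(n, blocksize=16)  # front-padded with zero bytes
    assert len(packed) == 16
    assert bytes_to_long(packed) == n
    assert long_to_bytes(n) == b'\xde\xad\xbe\xef\xca\xfe'  # no padding requested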
3546
3547
5bc880b9
YCH
3548def ohdave_rsa_encrypt(data, exponent, modulus):
3549 '''
3550 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
3551
3552 Input:
3553 data: data to encrypt, bytes-like object
3554 exponent, modulus: parameter e and N of RSA algorithm, both integer
3555 Output: hex string of encrypted data
3556
3557 Limitation: supports one block encryption only
3558 '''
3559
3560 payload = int(binascii.hexlify(data[::-1]), 16)
3561 encrypted = pow(payload, exponent, modulus)
3562 return '%x' % encrypted
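# Editorial sketch (not part of the original file): toy parameters only -- real
# extractors receive e and N from the site. Note that the data is reversed
# (treated as little-endian) before being interpreted as an integer.
def _example_ohdave_rsa():
    e, n = 0x10001, 0xab41b791f1e5bb3f68caae0c79e7e18e7  # hypothetical public key
    return ohdave_rsa_encrypt(b'secret', e, n)  # hex string, single block only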
81bdc8fd
YCH
3563
3564
f48409c7
YCH
3565def pkcs1pad(data, length):
3566 """
3567 Padding input data with PKCS#1 scheme
3568
3569 @param {int[]} data input data
3570 @param {int} length target length
3571 @returns {int[]} padded data
3572 """
3573 if len(data) > length - 11:
3574 raise ValueError('Input data too long for PKCS#1 padding')
3575
 3576 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # PKCS#1 requires non-zero padding bytes
3577 return [0, 2] + pseudo_random + [0] + data
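# Editorial sketch (not part of the original file): unlike most crypto helpers
# this one works on lists of byte values (ints), not on bytes objects.
def _example_pkcs1pad():
    padded = pkcs1pad([0x01, 0x02, 0x03], 128)
    assert len(padded) == 128
    assert padded[:2] == [0, 2]                  # PKCS#1 type-2 block header
    assert padded[-4:] == [0, 0x01, 0x02, 0x03]  # zero separator, then the data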
3578
3579
5eb6bdce 3580def encode_base_n(num, n, table=None):
59f898b7 3581 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
59f898b7
YCH
3582 if not table:
3583 table = FULL_TABLE[:n]
3584
5eb6bdce
YCH
3585 if n > len(table):
3586 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
3587
3588 if num == 0:
3589 return table[0]
3590
81bdc8fd
YCH
3591 ret = ''
3592 while num:
3593 ret = table[num % n] + ret
3594 num = num // n
3595 return ret
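# Editorial sketch (not part of the original file): with the default table this
# is a plain base-N encoder; obfuscated players may supply their own alphabet.
def _example_encode_base_n():
    assert encode_base_n(255, 16) == 'ff'
    assert encode_base_n(0, 36) == '0'
    assert encode_base_n(5, 2, table='ab') == 'bab'  # 5 is 101 in binary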
f52354a8
YCH
3596
3597
3598def decode_packed_codes(code):
06b3fe29 3599 mobj = re.search(PACKED_CODES_RE, code)
f52354a8
YCH
 3600 obfuscated_code, base, count, symbols = mobj.groups()
3601 base = int(base)
3602 count = int(count)
3603 symbols = symbols.split('|')
3604 symbol_table = {}
3605
3606 while count:
3607 count -= 1
5eb6bdce 3608 base_n_count = encode_base_n(count, base)
f52354a8
YCH
3609 symbol_table[base_n_count] = symbols[count] or base_n_count
3610
3611 return re.sub(
3612 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
 3613 obfuscated_code)
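# Editorial sketch (not part of the original file): a hand-made sample in the
# usual P.A.C.K.E.R. shape. It assumes the module-level PACKED_CODES_RE defined
# earlier in this file matches the "}('payload',base,count,'symbols'.split('|')"
# trailer emitted by Dean Edwards' packer.
def _example_decode_packed_codes():
    packed = "eval(function(p,a,c,k,e,d){}('0 1',36,2,'hello|world'.split('|'),0,{}))"
    return decode_packed_codes(packed)  # -> 'hello world'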
e154c651 3614
3615
3616def parse_m3u8_attributes(attrib):
3617 info = {}
3618 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
3619 if val.startswith('"'):
3620 val = val[1:-1]
3621 info[key] = val
3622 return info
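# Editorial sketch (not part of the original file): parsing a typical
# EXT-X-STREAM-INF attribute list; quoted values keep their commas.
def _example_parse_m3u8_attributes():
    attrs = parse_m3u8_attributes(
        'BANDWIDTH=1280000,RESOLUTION=640x360,CODECS="avc1.42e00a,mp4a.40.2"')
    assert attrs['RESOLUTION'] == '640x360'
    assert attrs['CODECS'] == 'avc1.42e00a,mp4a.40.2'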
1143535d
YCH
3623
3624
3625def urshift(val, n):
3626 return val >> n if val >= 0 else (val + 0x100000000) >> n
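# Editorial sketch (not part of the original file): urshift() emulates
# JavaScript's unsigned right shift (>>>) on 32-bit values, which Python's >>
# does not provide for negative numbers.
def _example_urshift():
    assert -1 >> 8 == -1                 # Python keeps the sign bit
    assert urshift(-1, 8) == 0x00ffffff  # JS: -1 >>> 8 === 16777215
    assert urshift(16, 2) == 4           # non-negative values behave like >>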
d3f8e038
YCH
3627
3628
3629# Based on png2str() written by @gdkchan and improved by @yokrysty
3630# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
3631def decode_png(png_data):
3632 # Reference: https://www.w3.org/TR/PNG/
3633 header = png_data[8:]
3634
3635 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
3636 raise IOError('Not a valid PNG file.')
3637
3638 int_map = {1: '>B', 2: '>H', 4: '>I'}
3639 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
3640
3641 chunks = []
3642
3643 while header:
3644 length = unpack_integer(header[:4])
3645 header = header[4:]
3646
3647 chunk_type = header[:4]
3648 header = header[4:]
3649
3650 chunk_data = header[:length]
3651 header = header[length:]
3652
3653 header = header[4:] # Skip CRC
3654
3655 chunks.append({
3656 'type': chunk_type,
3657 'length': length,
3658 'data': chunk_data
3659 })
3660
3661 ihdr = chunks[0]['data']
3662
3663 width = unpack_integer(ihdr[:4])
3664 height = unpack_integer(ihdr[4:8])
3665
3666 idat = b''
3667
3668 for chunk in chunks:
3669 if chunk['type'] == b'IDAT':
3670 idat += chunk['data']
3671
3672 if not idat:
3673 raise IOError('Unable to read PNG data.')
3674
3675 decompressed_data = bytearray(zlib.decompress(idat))
3676
3677 stride = width * 3
3678 pixels = []
3679
3680 def _get_pixel(idx):
3681 x = idx % stride
3682 y = idx // stride
3683 return pixels[y][x]
3684
3685 for y in range(height):
3686 basePos = y * (1 + stride)
3687 filter_type = decompressed_data[basePos]
3688
3689 current_row = []
3690
3691 pixels.append(current_row)
3692
3693 for x in range(stride):
3694 color = decompressed_data[1 + basePos + x]
3695 basex = y * stride + x
3696 left = 0
3697 up = 0
3698
3699 if x > 2:
3700 left = _get_pixel(basex - 3)
3701 if y > 0:
3702 up = _get_pixel(basex - stride)
3703
3704 if filter_type == 1: # Sub
3705 color = (color + left) & 0xff
3706 elif filter_type == 2: # Up
3707 color = (color + up) & 0xff
3708 elif filter_type == 3: # Average
3709 color = (color + ((left + up) >> 1)) & 0xff
3710 elif filter_type == 4: # Paeth
3711 a = left
3712 b = up
3713 c = 0
3714
3715 if x > 2 and y > 0:
3716 c = _get_pixel(basex - stride - 3)
3717
3718 p = a + b - c
3719
3720 pa = abs(p - a)
3721 pb = abs(p - b)
3722 pc = abs(p - c)
3723
3724 if pa <= pb and pa <= pc:
3725 color = (color + a) & 0xff
3726 elif pb <= pc:
3727 color = (color + b) & 0xff
3728 else:
3729 color = (color + c) & 0xff
3730
3731 current_row.append(color)
3732
3733 return width, height, pixels
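# Editorial usage sketch (not part of the original file): the decoder above
# only handles the PNGs youtube-dl actually meets (non-interlaced, 8-bit RGB
# without alpha); each row in `pixels` is a flat list of byte values, three per
# pixel. 'frame.png' is a hypothetical input file.
def _example_decode_png():
    with open('frame.png', 'rb') as f:
        width, height, pixels = decode_png(f.read())
    x = y = 0
    r, g, b = pixels[y][3 * x:3 * x + 3]  # components of the pixel at (x, y)
    return width, height, (r, g, b)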
efa97bdc
YCH
3734
3735
3736def write_xattr(path, key, value):
3737 # This mess below finds the best xattr tool for the job
3738 try:
3739 # try the pyxattr module...
3740 import xattr
3741
53a7e3d2
YCH
3742 if hasattr(xattr, 'set'): # pyxattr
3743 # Unicode arguments are not supported in python-pyxattr until
3744 # version 0.5.0
3745 # See https://github.com/rg3/youtube-dl/issues/5498
3746 pyxattr_required_version = '0.5.0'
3747 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
3748 # TODO: fallback to CLI tools
3749 raise XAttrUnavailableError(
3750 'python-pyxattr is detected but is too old. '
3751 'youtube-dl requires %s or above while your version is %s. '
3752 'Falling back to other xattr implementations' % (
3753 pyxattr_required_version, xattr.__version__))
3754
3755 setxattr = xattr.set
3756 else: # xattr
3757 setxattr = xattr.setxattr
efa97bdc
YCH
3758
3759 try:
53a7e3d2 3760 setxattr(path, key, value)
efa97bdc
YCH
3761 except EnvironmentError as e:
3762 raise XAttrMetadataError(e.errno, e.strerror)
3763
3764 except ImportError:
3765 if compat_os_name == 'nt':
3766 # Write xattrs to NTFS Alternate Data Streams:
3767 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
3768 assert ':' not in key
3769 assert os.path.exists(path)
3770
3771 ads_fn = path + ':' + key
3772 try:
3773 with open(ads_fn, 'wb') as f:
3774 f.write(value)
3775 except EnvironmentError as e:
3776 raise XAttrMetadataError(e.errno, e.strerror)
3777 else:
3778 user_has_setfattr = check_executable('setfattr', ['--version'])
3779 user_has_xattr = check_executable('xattr', ['-h'])
3780
3781 if user_has_setfattr or user_has_xattr:
3782
3783 value = value.decode('utf-8')
3784 if user_has_setfattr:
3785 executable = 'setfattr'
3786 opts = ['-n', key, '-v', value]
3787 elif user_has_xattr:
3788 executable = 'xattr'
3789 opts = ['-w', key, value]
3790
3791 cmd = ([encodeFilename(executable, True)] +
3792 [encodeArgument(o) for o in opts] +
3793 [encodeFilename(path, True)])
3794
3795 try:
3796 p = subprocess.Popen(
3797 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
3798 except EnvironmentError as e:
3799 raise XAttrMetadataError(e.errno, e.strerror)
3800 stdout, stderr = p.communicate()
3801 stderr = stderr.decode('utf-8', 'replace')
3802 if p.returncode != 0:
3803 raise XAttrMetadataError(p.returncode, stderr)
3804
3805 else:
 3806 # On Unix, but we could not find pyxattr, setfattr or xattr.
3807 if sys.platform.startswith('linux'):
3808 raise XAttrUnavailableError(
3809 "Couldn't find a tool to set the xattrs. "
3810 "Install either the python 'pyxattr' or 'xattr' "
3811 "modules, or the GNU 'attr' package "
3812 "(which contains the 'setfattr' tool).")
3813 else:
3814 raise XAttrUnavailableError(
3815 "Couldn't find a tool to set the xattrs. "
3816 "Install either the python 'xattr' module, "
3817 "or the 'xattr' binary.")
0c265486
YCH
3818
3819
3820def random_birthday(year_field, month_field, day_field):
3821 return {
3822 year_field: str(random.randint(1950, 1995)),
3823 month_field: str(random.randint(1, 12)),
3824 day_field: str(random.randint(1, 31)),
3825 }
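# Editorial usage sketch (not part of the original file): fills age-gate forms
# whose field names differ per site; the names below are hypothetical. Note the
# helper does not validate day/month combinations (it may yield e.g. Feb 31).
def _example_random_birthday():
    payload = random_birthday('birth_year', 'birth_month', 'birth_day')
    # e.g. {'birth_year': '1987', 'birth_month': '6', 'birth_day': '14'}
    return payload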