#!/usr/bin/env python
# coding: utf-8

from __future__ import unicode_literals

import base64
import binascii
import calendar
import codecs
import contextlib
import ctypes
import datetime
import email.utils
import email.header
import errno
import functools
import gzip
import io
import itertools
import json
import locale
import math
import operator
import os
import pipes
import platform
import random
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import traceback
import xml.etree.ElementTree
import zlib

from .compat import (
    compat_HTMLParser,
    compat_basestring,
    compat_chr,
    compat_etree_fromstring,
    compat_expanduser,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_kwargs,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_quote,
    compat_socket_create_connection,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%B %dst %Y',
    '%B %dnd %Y',
    '%B %dth %Y',
    '%b %d %Y',
    '%b %dst %Y',
    '%b %dnd %Y',
    '%b %dth %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
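# Usage sketch (illustrative example, not from the original source; the prefix
# and namespace URL below are assumptions chosen for the demonstration): a
# prefixed path is expanded to Clark notation so ElementTree can match it.
#     >>> xpath_with_ns('media:thumbnail', {'media': 'http://search.yahoo.com/mrss/'})
#     '{http://search.yahoo.com/mrss/}thumbnail'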


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag with the specified class in the passed HTML document"""
    retval = get_elements_by_class(class_name, html)
    return retval[0] if retval else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    retval = get_elements_by_attribute(attribute, value, html, escape_value)
    return retval[0] if retval else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags with the specified class in the passed HTML document as a list"""
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    value = re.escape(value) if escape_value else value

    retlist = []
    for m in re.finditer(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html):
        res = m.group('content')

        if res.startswith('"') or res.startswith("'"):
            res = res[1:-1]

        retlist.append(unescapeHTML(res))

    return retlist


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    parser.feed(html_element)
    parser.close()
    return parser.attrs


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept
    if possible.
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
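# Usage sketch (illustrative examples worked out from the rules above, not
# from the original source; the input title is made up):
#     >>> sanitize_filename('Sintel: Trailer?')
#     'Sintel - Trailer'
#     >>> sanitize_filename('Sintel: Trailer?', restricted=True)
#     'Sintel_-_Trailer'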


def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)


# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitize_url(url):
    return 'http:%s' % url if url.startswith('//') else url


def sanitized_Request(url, *args, **kwargs):
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
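# Usage sketch (illustrative examples, not from the original source); both
# named and numeric entities are resolved by _htmlentity_transform() above:
#     >>> unescapeHTML('Ben &amp; Jerry')
#     'Ben & Jerry'
#     >>> unescapeHTML('caf&eacute;')
#     'café'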


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


def formatSeconds(secs):
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
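# Usage sketch (illustrative examples, not from the original source):
#     >>> formatSeconds(3661)
#     '1:01:01'
#     >>> formatSeconds(75)
#     '1:15'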


def make_HTTPS_handler(params, **kwargs):
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message():
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg


class YoutubeDLError(Exception):
    """Base exception for YoutubeDL errors."""
    pass


class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class GeoRestrictedError(ExtractorError):
    """Geographic restriction Error exception.

    This exception may be thrown when a video is not available from your
    geographic location due to geographic restrictions imposed by a website.
    """
    def __init__(self, msg, countries=None):
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        self.msg = msg
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(YoutubeDLError):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(YoutubeDLError):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg


class MaxDownloadsReached(YoutubeDLError):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(YoutubeDLError):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(YoutubeDLError):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(YoutubeDLError):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota excedded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


def extract_timezone(date_str):
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        pass
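# Usage sketch (illustrative example, not from the original source; the
# timestamp value is a worked assumption): the trailing UTC offset is folded
# into the returned POSIX timestamp.
#     >>> parse_iso8601('2017-03-01T10:00:00+0100')
#     1488358800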


def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
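# Usage sketch (illustrative examples, not from the original source): the
# day_first flag picks DATE_FORMATS_DAY_FIRST vs DATE_FORMATS_MONTH_FIRST.
#     >>> unified_strdate('8/12/2017')
#     '20171208'
#     >>> unified_strdate('8/12/2017', day_first=False)
#     '20170812'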


def unified_timestamp(date_str, day_first=True):
    if date_str is None:
        return None

    date_str = date_str.replace(',', ' ')

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600


def determine_ext(url, default_ext='unknown_video'):
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
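# Usage sketch (illustrative examples, not from the original source; the URLs
# are made up):
#     >>> determine_ext('http://example.com/video.mp4?dl=1')
#     'mp4'
#     >>> determine_ext('http://example.com/foo/bar.mp4/?download')
#     'mp4'
#     >>> determine_ext('http://example.com/play', default_ext='mp4')
#     'mp4'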


def subtitles_filename(filename, sub_lang, sub_format):
    return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format


def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
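# Usage sketch (illustrative examples, not from the original source):
#     >>> date_from_str('20170301')
#     datetime.date(2017, 3, 1)
#     >>> date_from_str('now-1week') == datetime.date.today() - datetime.timedelta(days=7)
#     True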


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str


class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())


def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res


def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True


def write_string(s, out=None, encoding=None):
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()


def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)


# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)


class locked_file(object):
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)


def get_filesystem_encoding():
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'


def shell_quote(args):
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)


def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
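# Usage sketch (illustrative round trip, not from the original source; URL and
# payload are made up): the data rides in the URL fragment and comes back out
# unchanged.
#     >>> url = smuggle_url('http://example.com/video', {'referrer': 'http://example.com/'})
#     >>> unsmuggle_url(url)
#     ('http://example.com/video', {'referrer': 'http://example.com/'})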


def format_bytes(bytes):
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
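# Usage sketch (illustrative examples, not from the original source); sizes
# are reported with binary (1024-based) suffixes:
#     >>> format_bytes(1536)
#     '1.50KiB'
#     >>> format_bytes(None)
#     'N/A'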
f53c966a 1568
1c088fa8 1569
fb47597b
S
1570def lookup_unit_table(unit_table, s):
1571 units_re = '|'.join(re.escape(u) for u in unit_table)
1572 m = re.match(
782b1b5b 1573 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
fb47597b
S
1574 if not m:
1575 return None
1576 num_str = m.group('num').replace(',', '.')
1577 mult = unit_table[m.group('unit')]
1578 return int(float(num_str) * mult)
1579
1580
be64b5b0
PH
1581def parse_filesize(s):
1582 if s is None:
1583 return None
1584
dfb1b146 1585 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
1586 # but we support those too
1587 _UNIT_TABLE = {
1588 'B': 1,
1589 'b': 1,
70852b47 1590 'bytes': 1,
be64b5b0
PH
1591 'KiB': 1024,
1592 'KB': 1000,
1593 'kB': 1024,
1594 'Kb': 1000,
13585d76 1595 'kb': 1000,
70852b47
YCH
1596 'kilobytes': 1000,
1597 'kibibytes': 1024,
be64b5b0
PH
1598 'MiB': 1024 ** 2,
1599 'MB': 1000 ** 2,
1600 'mB': 1024 ** 2,
1601 'Mb': 1000 ** 2,
13585d76 1602 'mb': 1000 ** 2,
70852b47
YCH
1603 'megabytes': 1000 ** 2,
1604 'mebibytes': 1024 ** 2,
be64b5b0
PH
1605 'GiB': 1024 ** 3,
1606 'GB': 1000 ** 3,
1607 'gB': 1024 ** 3,
1608 'Gb': 1000 ** 3,
13585d76 1609 'gb': 1000 ** 3,
70852b47
YCH
1610 'gigabytes': 1000 ** 3,
1611 'gibibytes': 1024 ** 3,
be64b5b0
PH
1612 'TiB': 1024 ** 4,
1613 'TB': 1000 ** 4,
1614 'tB': 1024 ** 4,
1615 'Tb': 1000 ** 4,
13585d76 1616 'tb': 1000 ** 4,
70852b47
YCH
1617 'terabytes': 1000 ** 4,
1618 'tebibytes': 1024 ** 4,
be64b5b0
PH
1619 'PiB': 1024 ** 5,
1620 'PB': 1000 ** 5,
1621 'pB': 1024 ** 5,
1622 'Pb': 1000 ** 5,
13585d76 1623 'pb': 1000 ** 5,
70852b47
YCH
1624 'petabytes': 1000 ** 5,
1625 'pebibytes': 1024 ** 5,
be64b5b0
PH
1626 'EiB': 1024 ** 6,
1627 'EB': 1000 ** 6,
1628 'eB': 1024 ** 6,
1629 'Eb': 1000 ** 6,
13585d76 1630 'eb': 1000 ** 6,
70852b47
YCH
1631 'exabytes': 1000 ** 6,
1632 'exbibytes': 1024 ** 6,
be64b5b0
PH
1633 'ZiB': 1024 ** 7,
1634 'ZB': 1000 ** 7,
1635 'zB': 1024 ** 7,
1636 'Zb': 1000 ** 7,
13585d76 1637 'zb': 1000 ** 7,
70852b47
YCH
1638 'zettabytes': 1000 ** 7,
1639 'zebibytes': 1024 ** 7,
be64b5b0
PH
1640 'YiB': 1024 ** 8,
1641 'YB': 1000 ** 8,
1642 'yB': 1024 ** 8,
1643 'Yb': 1000 ** 8,
13585d76 1644 'yb': 1000 ** 8,
70852b47
YCH
1645 'yottabytes': 1000 ** 8,
1646 'yobibytes': 1024 ** 8,
be64b5b0
PH
1647 }
1648
fb47597b
S
1649 return lookup_unit_table(_UNIT_TABLE, s)
1650
1651
1652def parse_count(s):
1653 if s is None:
be64b5b0
PH
1654 return None
1655
fb47597b
S
1656 s = s.strip()
1657
1658 if re.match(r'^[\d,.]+$', s):
1659 return str_to_int(s)
1660
1661 _UNIT_TABLE = {
1662 'k': 1000,
1663 'K': 1000,
1664 'm': 1000 ** 2,
1665 'M': 1000 ** 2,
1666 'kk': 1000 ** 2,
1667 'KK': 1000 ** 2,
1668 }
be64b5b0 1669
fb47597b 1670 return lookup_unit_table(_UNIT_TABLE, s)
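A few hedged examples for parse_filesize and parse_count (the input strings are arbitrary; the expected results follow from the unit tables above):

from youtube_dl.utils import parse_filesize, parse_count

print(parse_filesize('1.5GiB'))  # 1610612736
print(parse_filesize('500 KB'))  # 500000
print(parse_count('1.2M'))       # 1200000
print(parse_count('1,234'))      # 1234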
be64b5b0 1671
2f7ae819 1672
a942d6cb 1673def month_by_name(name, lang='en'):
caefb1de
PH
1674 """ Return the number of a month by (locale-independently) English name """
1675
f6717dec 1676 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 1677
caefb1de 1678 try:
f6717dec 1679 return month_names.index(name) + 1
7105440c
YCH
1680 except ValueError:
1681 return None
1682
1683
1684def month_by_abbreviation(abbrev):
1685 """ Return the number of a month by (locale-independently) English
1686 abbreviations """
1687
1688 try:
1689 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
1690 except ValueError:
1691 return None
18258362
JMF
1692
1693
5aafe895 1694def fix_xml_ampersands(xml_str):
18258362 1695 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
1696 return re.sub(
1697 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 1698 '&amp;',
5aafe895 1699 xml_str)
e3946f98
PH
1700
1701
1702def setproctitle(title):
8bf48f23 1703 assert isinstance(title, compat_str)
c1c05c67
YCH
1704
1705 # ctypes in Jython is not complete
1706 # http://bugs.jython.org/issue2148
1707 if sys.platform.startswith('java'):
1708 return
1709
e3946f98 1710 try:
611c1dd9 1711 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
1712 except OSError:
1713 return
2f49bcd6
RC
1714 except TypeError:
1715 # LoadLibrary in Windows Python 2.7.13 only expects
1716 # a bytestring, but since unicode_literals turns
1717 # every string into a unicode string, it fails.
1718 return
6eefe533
PH
1719 title_bytes = title.encode('utf-8')
1720 buf = ctypes.create_string_buffer(len(title_bytes))
1721 buf.value = title_bytes
e3946f98 1722 try:
6eefe533 1723 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
1724 except AttributeError:
1725 return # Strange libc, just skip this
d7dda168
PH
1726
1727
1728def remove_start(s, start):
46bc9b7d 1729 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
1730
1731
2b9faf55 1732def remove_end(s, end):
46bc9b7d 1733 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
1734
1735
31b2051e
S
1736def remove_quotes(s):
1737 if s is None or len(s) < 2:
1738 return s
1739 for quote in ('"', "'", ):
1740 if s[0] == quote and s[-1] == quote:
1741 return s[1:-1]
1742 return s
1743
1744
29eb5174 1745def url_basename(url):
9b8aaeed 1746 path = compat_urlparse.urlparse(url).path
28e614de 1747 return path.strip('/').split('/')[-1]
aa94a6d3
PH
1748
1749
02dc0a36
S
1750def base_url(url):
1751 return re.match(r'https?://[^?#&]+/', url).group()
1752
1753
e34c3361 1754def urljoin(base, path):
4b5de77b
S
1755 if isinstance(path, bytes):
1756 path = path.decode('utf-8')
e34c3361
S
1757 if not isinstance(path, compat_str) or not path:
1758 return None
b0c65c67 1759 if re.match(r'^(?:https?:)?//', path):
e34c3361 1760 return path
4b5de77b
S
1761 if isinstance(base, bytes):
1762 base = base.decode('utf-8')
1763 if not isinstance(base, compat_str) or not re.match(
1764 r'^(?:https?:)?//', base):
e34c3361
S
1765 return None
1766 return compat_urlparse.urljoin(base, path)
1767
1768
aa94a6d3
PH
1769class HEADRequest(compat_urllib_request.Request):
1770 def get_method(self):
611c1dd9 1771 return 'HEAD'
7217e148
PH
1772
1773
95cf60e8
S
1774class PUTRequest(compat_urllib_request.Request):
1775 def get_method(self):
1776 return 'PUT'
1777
1778
9732d77e 1779def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
28746fbd
PH
1780 if get_attr:
1781 if v is not None:
1782 v = getattr(v, get_attr, None)
9572013d
PH
1783 if v == '':
1784 v = None
1812afb7
S
1785 if v is None:
1786 return default
1787 try:
1788 return int(v) * invscale // scale
1789 except ValueError:
af98f8ff 1790 return default
9732d77e 1791
9572013d 1792
40a90862
JMF
1793def str_or_none(v, default=None):
1794 return default if v is None else compat_str(v)
1795
9732d77e
PH
1796
1797def str_to_int(int_str):
48d4681e 1798 """ A more relaxed version of int_or_none """
9732d77e
PH
1799 if int_str is None:
1800 return None
28e614de 1801 int_str = re.sub(r'[,\.\+]', '', int_str)
9732d77e 1802 return int(int_str)
608d11f5
PH
1803
1804
9732d77e 1805def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
1806 if v is None:
1807 return default
1808 try:
1809 return float(v) * invscale / scale
1810 except ValueError:
1811 return default
43f775e4
PH
1812
1813
b72b4431
S
1814def strip_or_none(v):
1815 return None if v is None else v.strip()
1816
1817
608d11f5 1818def parse_duration(s):
8f9312c3 1819 if not isinstance(s, compat_basestring):
608d11f5
PH
1820 return None
1821
ca7b3246
S
1822 s = s.strip()
1823
acaff495 1824 days, hours, mins, secs, ms = [None] * 5
15846398 1825 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
acaff495 1826 if m:
1827 days, hours, mins, secs, ms = m.groups()
1828 else:
1829 m = re.match(
1830 r'''(?ix)(?:P?T)?
8f4b58d7 1831 (?:
acaff495 1832 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
8f4b58d7 1833 )?
acaff495 1834 (?:
1835 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1836 )?
1837 (?:
1838 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1839 )?
1840 (?:
1841 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 1842 )?Z?$''', s)
acaff495 1843 if m:
1844 days, hours, mins, secs, ms = m.groups()
1845 else:
15846398 1846 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 1847 if m:
1848 hours, mins = m.groups()
1849 else:
1850 return None
1851
1852 duration = 0
1853 if secs:
1854 duration += float(secs)
1855 if mins:
1856 duration += float(mins) * 60
1857 if hours:
1858 duration += float(hours) * 60 * 60
1859 if days:
1860 duration += float(days) * 24 * 60 * 60
1861 if ms:
1862 duration += float(ms)
1863 return duration
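parse_duration accepts clock-style, free-text and ISO-8601-like inputs; an illustrative sketch (values invented, results in seconds):

from youtube_dl.utils import parse_duration

print(parse_duration('1:23:45'))     # 5025.0
print(parse_duration('3 min 20 s'))  # 200.0
print(parse_duration('PT1H30M'))     # 5400.0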
91d7d0b3
JMF
1864
1865
e65e4c88 1866def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 1867 name, real_ext = os.path.splitext(filename)
e65e4c88
S
1868 return (
1869 '{0}.{1}{2}'.format(name, ext, real_ext)
1870 if not expected_real_ext or real_ext[1:] == expected_real_ext
1871 else '{0}.{1}'.format(filename, ext))
d70ad093
PH
1872
1873
b3ed15b7
S
1874def replace_extension(filename, ext, expected_real_ext=None):
1875 name, real_ext = os.path.splitext(filename)
1876 return '{0}.{1}'.format(
1877 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1878 ext)
1879
1880
d70ad093
PH
1881def check_executable(exe, args=[]):
1882 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1883 args can be a list of arguments for a short output (like -version) """
1884 try:
1885 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1886 except OSError:
1887 return False
1888 return exe
b7ab0590
PH
1889
1890
95807118 1891def get_exe_version(exe, args=['--version'],
cae97f65 1892 version_re=None, unrecognized='present'):
95807118
PH
1893 """ Returns the version of the specified executable,
1894 or False if the executable is not present """
1895 try:
b64d04c1
YCH
1896 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1897 # SIGTTOU if youtube-dl is run in the background.
1898 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
cae97f65 1899 out, _ = subprocess.Popen(
54116803 1900 [encodeArgument(exe)] + args,
00ca7552 1901 stdin=subprocess.PIPE,
95807118
PH
1902 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1903 except OSError:
1904 return False
cae97f65
PH
1905 if isinstance(out, bytes): # Python 2.x
1906 out = out.decode('ascii', 'ignore')
1907 return detect_exe_version(out, version_re, unrecognized)
1908
1909
1910def detect_exe_version(output, version_re=None, unrecognized='present'):
1911 assert isinstance(output, compat_str)
1912 if version_re is None:
1913 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1914 m = re.search(version_re, output)
95807118
PH
1915 if m:
1916 return m.group(1)
1917 else:
1918 return unrecognized
1919
1920
b7ab0590 1921class PagedList(object):
dd26ced1
PH
1922 def __len__(self):
1923 # This is only useful for tests
1924 return len(self.getslice())
1925
9c44d242
PH
1926
1927class OnDemandPagedList(PagedList):
b95dc034 1928 def __init__(self, pagefunc, pagesize, use_cache=False):
9c44d242
PH
1929 self._pagefunc = pagefunc
1930 self._pagesize = pagesize
b95dc034
YCH
1931 self._use_cache = use_cache
1932 if use_cache:
1933 self._cache = {}
9c44d242 1934
b7ab0590
PH
1935 def getslice(self, start=0, end=None):
1936 res = []
1937 for pagenum in itertools.count(start // self._pagesize):
1938 firstid = pagenum * self._pagesize
1939 nextfirstid = pagenum * self._pagesize + self._pagesize
1940 if start >= nextfirstid:
1941 continue
1942
b95dc034
YCH
1943 page_results = None
1944 if self._use_cache:
1945 page_results = self._cache.get(pagenum)
1946 if page_results is None:
1947 page_results = list(self._pagefunc(pagenum))
1948 if self._use_cache:
1949 self._cache[pagenum] = page_results
b7ab0590
PH
1950
1951 startv = (
1952 start % self._pagesize
1953 if firstid <= start < nextfirstid
1954 else 0)
1955
1956 endv = (
1957 ((end - 1) % self._pagesize) + 1
1958 if (end is not None and firstid <= end <= nextfirstid)
1959 else None)
1960
1961 if startv != 0 or endv is not None:
1962 page_results = page_results[startv:endv]
1963 res.extend(page_results)
1964
1965 # A little optimization - if the current page is not "full", i.e. does
1966 # not contain page_size videos, then we can assume that this page
1967 # is the last one - there are no more ids on further pages -
1968 # so there is no need to query again.
1969 if len(page_results) + startv < self._pagesize:
1970 break
1971
1972 # If we got the whole page, but the next page is not interesting,
1973 # break out early as well
1974 if end == nextfirstid:
1975 break
1976 return res
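A minimal sketch of OnDemandPagedList with a hypothetical page function (three items per page); getslice only fetches the pages that overlap the requested range:

from youtube_dl.utils import OnDemandPagedList

def fetch_page(pagenum):
    # Hypothetical backend: every page holds exactly three items.
    return ['item-%d' % (pagenum * 3 + i) for i in range(3)]

pl = OnDemandPagedList(fetch_page, 3)
print(pl.getslice(2, 5))  # expected: ['item-2', 'item-3', 'item-4']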
81c2f20b
PH
1977
1978
9c44d242
PH
1979class InAdvancePagedList(PagedList):
1980 def __init__(self, pagefunc, pagecount, pagesize):
1981 self._pagefunc = pagefunc
1982 self._pagecount = pagecount
1983 self._pagesize = pagesize
1984
1985 def getslice(self, start=0, end=None):
1986 res = []
1987 start_page = start // self._pagesize
1988 end_page = (
1989 self._pagecount if end is None else (end // self._pagesize + 1))
1990 skip_elems = start - start_page * self._pagesize
1991 only_more = None if end is None else end - start
1992 for pagenum in range(start_page, end_page):
1993 page = list(self._pagefunc(pagenum))
1994 if skip_elems:
1995 page = page[skip_elems:]
1996 skip_elems = None
1997 if only_more is not None:
1998 if len(page) < only_more:
1999 only_more -= len(page)
2000 else:
2001 page = page[:only_more]
2002 res.extend(page)
2003 break
2004 res.extend(page)
2005 return res
2006
2007
81c2f20b 2008def uppercase_escape(s):
676eb3f2 2009 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 2010 return re.sub(
a612753d 2011 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
2012 lambda m: unicode_escape(m.group(0))[0],
2013 s)
0fe2ff78
YCH
2014
2015
2016def lowercase_escape(s):
2017 unicode_escape = codecs.getdecoder('unicode_escape')
2018 return re.sub(
2019 r'\\u[0-9a-fA-F]{4}',
2020 lambda m: unicode_escape(m.group(0))[0],
2021 s)
b53466e1 2022
d05cfe06
S
2023
2024def escape_rfc3986(s):
2025 """Escape non-ASCII characters as suggested by RFC 3986"""
8f9312c3 2026 if sys.version_info < (3, 0) and isinstance(s, compat_str):
d05cfe06 2027 s = s.encode('utf-8')
ecc0c5ee 2028 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2029
2030
2031def escape_url(url):
2032 """Escape URL as suggested by RFC 3986"""
2033 url_parsed = compat_urllib_parse_urlparse(url)
2034 return url_parsed._replace(
efbed08d 2035 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
2036 path=escape_rfc3986(url_parsed.path),
2037 params=escape_rfc3986(url_parsed.params),
2038 query=escape_rfc3986(url_parsed.query),
2039 fragment=escape_rfc3986(url_parsed.fragment)
2040 ).geturl()
2041
62e609ab
PH
2042
2043def read_batch_urls(batch_fd):
2044 def fixup(url):
2045 if not isinstance(url, compat_str):
2046 url = url.decode('utf-8', 'replace')
28e614de 2047 BOM_UTF8 = '\xef\xbb\xbf'
62e609ab
PH
2048 if url.startswith(BOM_UTF8):
2049 url = url[len(BOM_UTF8):]
2050 url = url.strip()
2051 if url.startswith(('#', ';', ']')):
2052 return False
2053 return url
2054
2055 with contextlib.closing(batch_fd) as fd:
2056 return [url for url in map(fixup, fd) if url]
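read_batch_urls skips comment lines, blank lines and a leading BOM; an illustrative Python 3 sketch using an in-memory file:

import io
from youtube_dl.utils import read_batch_urls

batch = io.StringIO('# a comment\nhttp://example.com/a\n\nhttp://example.com/b\n')
print(read_batch_urls(batch))
# expected: ['http://example.com/a', 'http://example.com/b']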
b74fa8cd
JMF
2057
2058
2059def urlencode_postdata(*args, **kargs):
15707c7e 2060 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
2061
2062
38f9ef31 2063def update_url_query(url, query):
cacd9966
YCH
2064 if not query:
2065 return url
38f9ef31 2066 parsed_url = compat_urlparse.urlparse(url)
2067 qs = compat_parse_qs(parsed_url.query)
2068 qs.update(query)
2069 return compat_urlparse.urlunparse(parsed_url._replace(
15707c7e 2070 query=compat_urllib_parse_urlencode(qs, True)))
16392824 2071
8e60dc75 2072
ed0291d1
S
2073def update_Request(req, url=None, data=None, headers={}, query={}):
2074 req_headers = req.headers.copy()
2075 req_headers.update(headers)
2076 req_data = data or req.data
2077 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
2078 req_get_method = req.get_method()
2079 if req_get_method == 'HEAD':
2080 req_type = HEADRequest
2081 elif req_get_method == 'PUT':
2082 req_type = PUTRequest
2083 else:
2084 req_type = compat_urllib_request.Request
ed0291d1
S
2085 new_req = req_type(
2086 req_url, data=req_data, headers=req_headers,
2087 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2088 if hasattr(req, 'timeout'):
2089 new_req.timeout = req.timeout
2090 return new_req
2091
2092
10c87c15 2093def _multipart_encode_impl(data, boundary):
0c265486
YCH
2094 content_type = 'multipart/form-data; boundary=%s' % boundary
2095
2096 out = b''
2097 for k, v in data.items():
2098 out += b'--' + boundary.encode('ascii') + b'\r\n'
2099 if isinstance(k, compat_str):
2100 k = k.encode('utf-8')
2101 if isinstance(v, compat_str):
2102 v = v.encode('utf-8')
2103 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2104 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 2105 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
2106 if boundary.encode('ascii') in content:
2107 raise ValueError('Boundary overlaps with data')
2108 out += content
2109
2110 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2111
2112 return out, content_type
2113
2114
2115def multipart_encode(data, boundary=None):
2116 '''
2117 Encode a dict to RFC 7578-compliant form-data
2118
2119 data:
2120 A dict where keys and values can be either Unicode or bytes-like
2121 objects.
2122 boundary:
2123 If specified a Unicode object, it's used as the boundary. Otherwise
2124 a random boundary is generated.
2125
2126 Reference: https://tools.ietf.org/html/rfc7578
2127 '''
2128 has_specified_boundary = boundary is not None
2129
2130 while True:
2131 if boundary is None:
2132 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2133
2134 try:
10c87c15 2135 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
2136 break
2137 except ValueError:
2138 if has_specified_boundary:
2139 raise
2140 boundary = None
2141
2142 return out, content_type
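A short multipart_encode sketch with a fixed, made-up boundary so the output is deterministic:

from youtube_dl.utils import multipart_encode

body, content_type = multipart_encode({'field': 'value'}, boundary='BOUNDARY')
print(content_type)  # multipart/form-data; boundary=BOUNDARY
print(body)
# b'--BOUNDARY\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--BOUNDARY--\r\n'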
2143
2144
86296ad2 2145def dict_get(d, key_or_keys, default=None, skip_false_values=True):
cbecc9b9
S
2146 if isinstance(key_or_keys, (list, tuple)):
2147 for key in key_or_keys:
86296ad2
S
2148 if key not in d or d[key] is None or skip_false_values and not d[key]:
2149 continue
2150 return d[key]
cbecc9b9
S
2151 return default
2152 return d.get(key_or_keys, default)
2153
2154
329ca3be 2155def try_get(src, getter, expected_type=None):
a32a9a7e
S
2156 if not isinstance(getter, (list, tuple)):
2157 getter = [getter]
2158 for get in getter:
2159 try:
2160 v = get(src)
2161 except (AttributeError, KeyError, TypeError, IndexError):
2162 pass
2163 else:
2164 if expected_type is None or isinstance(v, expected_type):
2165 return v
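Hedged examples for dict_get and try_get (the metadata dicts are invented):

from youtube_dl.utils import dict_get, try_get

meta = {'title': '', 'fulltitle': 'A video', 'uploader': None}
print(dict_get(meta, ('title', 'fulltitle')))  # 'A video' (falsy values are skipped)

data = {'entries': [{'view_count': 123}]}
print(try_get(data, lambda x: x['entries'][0]['view_count'], int))  # 123
print(try_get(data, lambda x: x['missing'][0]))                     # None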
329ca3be
S
2166
2167
8e60dc75
S
2168def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2169 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2170
16392824 2171
a1a530b0
PH
2172US_RATINGS = {
2173 'G': 0,
2174 'PG': 10,
2175 'PG-13': 13,
2176 'R': 16,
2177 'NC': 18,
2178}
fac55558
PH
2179
2180
a8795327
S
2181TV_PARENTAL_GUIDELINES = {
2182 'TV-Y': 0,
2183 'TV-Y7': 7,
2184 'TV-G': 0,
2185 'TV-PG': 0,
2186 'TV-14': 14,
2187 'TV-MA': 17,
2188}
2189
2190
146c80e2 2191def parse_age_limit(s):
a8795327
S
2192 if type(s) == int:
2193 return s if 0 <= s <= 21 else None
2194 if not isinstance(s, compat_basestring):
d838b1bd 2195 return None
146c80e2 2196 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
2197 if m:
2198 return int(m.group('age'))
2199 if s in US_RATINGS:
2200 return US_RATINGS[s]
2201 return TV_PARENTAL_GUIDELINES.get(s)
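parse_age_limit understands bare ages, US MPAA ratings and TV parental guidelines; a small illustrative check:

from youtube_dl.utils import parse_age_limit

print(parse_age_limit('PG-13'))  # 13 (MPAA rating)
print(parse_age_limit('TV-MA'))  # 17 (TV parental guideline)
print(parse_age_limit('18+'))    # 18 (bare age)
print(parse_age_limit('n/a'))    # None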
146c80e2
S
2202
2203
fac55558 2204def strip_jsonp(code):
609a61e3 2205 return re.sub(
5552c9eb
YCH
2206 r'''(?sx)^
2207 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
2208 (?:\s*&&\s*(?P=func_name))?
2209 \s*\(\s*(?P<callback_data>.*)\);?
2210 \s*?(?://[^\n]*)*$''',
2211 r'\g<callback_data>', code)
478c2c61
PH
2212
2213
e05f6939 2214def js_to_json(code):
4195096e
S
2215 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
2216 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
2217 INTEGER_TABLE = (
2218 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
2219 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
2220 )
2221
e05f6939 2222 def fix_kv(m):
e7b6d122
PH
2223 v = m.group(0)
2224 if v in ('true', 'false', 'null'):
2225 return v
b3ee552e 2226 elif v.startswith('/*') or v.startswith('//') or v == ',':
bd1e4844 2227 return ""
2228
2229 if v[0] in ("'", '"'):
2230 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 2231 '"': '\\"',
bd1e4844 2232 "\\'": "'",
2233 '\\\n': '',
2234 '\\x': '\\u00',
2235 }.get(m.group(0), m.group(0)), v[1:-1])
2236
89ac4a19
S
2237 for regex, base in INTEGER_TABLE:
2238 im = re.match(regex, v)
2239 if im:
e4659b45 2240 i = int(im.group(1), base)
89ac4a19
S
2241 return '"%d":' % i if v.endswith(':') else '%d' % i
2242
e7b6d122 2243 return '"%s"' % v
e05f6939 2244
bd1e4844 2245 return re.sub(r'''(?sx)
2246 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2247 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4195096e 2248 {comment}|,(?={skip}[\]}}])|
bd1e4844 2249 [a-zA-Z_][.a-zA-Z_0-9]*|
4195096e
S
2250 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
2251 [0-9]+(?={skip}:)
2252 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
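A rough js_to_json sketch (the JavaScript object literal is made up; the output shown is the approximate result after json.loads):

import json
from youtube_dl.utils import js_to_json

js = "{foo: 'bar', /* comment */ count: 0x1A, list: [1, 2, 3,]}"
print(json.loads(js_to_json(js)))
# expected roughly: {'foo': 'bar', 'count': 26, 'list': [1, 2, 3]}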
e05f6939
PH
2253
2254
478c2c61
PH
2255def qualities(quality_ids):
2256 """ Get a numeric quality value out of a list of possible values """
2257 def q(qid):
2258 try:
2259 return quality_ids.index(qid)
2260 except ValueError:
2261 return -1
2262 return q
2263
acd69589
PH
2264
2265DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 2266
a020a0dc
PH
2267
2268def limit_length(s, length):
2269 """ Add ellipses to overly long strings """
2270 if s is None:
2271 return None
2272 ELLIPSES = '...'
2273 if len(s) > length:
2274 return s[:length - len(ELLIPSES)] + ELLIPSES
2275 return s
48844745
PH
2276
2277
2278def version_tuple(v):
5f9b8394 2279 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
2280
2281
2282def is_outdated_version(version, limit, assume_new=True):
2283 if not version:
2284 return not assume_new
2285 try:
2286 return version_tuple(version) < version_tuple(limit)
2287 except ValueError:
2288 return not assume_new
732ea2f0
PH
2289
2290
2291def ytdl_is_updateable():
2292 """ Returns if youtube-dl can be updated with -U """
2293 from zipimport import zipimporter
2294
2295 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
7d4111ed
PH
2296
2297
2298def args_to_str(args):
2299 # Get a short string representation for a subprocess command
702ccf2d 2300 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
2301
2302
9b9c5355 2303def error_to_compat_str(err):
fdae2358
S
2304 err_str = str(err)
2305 # On Python 2 the error byte string must be decoded with the proper
2306 # encoding rather than ASCII
2307 if sys.version_info[0] < 3:
2308 err_str = err_str.decode(preferredencoding())
2309 return err_str
2310
2311
c460bdd5 2312def mimetype2ext(mt):
eb9ee194
S
2313 if mt is None:
2314 return None
2315
765ac263
JMF
2316 ext = {
2317 'audio/mp4': 'm4a',
6c33d24b
YCH
2318 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3 here
2319 # since it's the most common in practice
2320 'audio/mpeg': 'mp3',
765ac263
JMF
2321 }.get(mt)
2322 if ext is not None:
2323 return ext
2324
c460bdd5 2325 _, _, res = mt.rpartition('/')
6562d34a 2326 res = res.split(';')[0].strip().lower()
c460bdd5
PH
2327
2328 return {
f6861ec9 2329 '3gpp': '3gp',
cafcf657 2330 'smptett+xml': 'tt',
cafcf657 2331 'ttaf+xml': 'dfxp',
a0d8d704 2332 'ttml+xml': 'ttml',
f6861ec9 2333 'x-flv': 'flv',
a0d8d704
YCH
2334 'x-mp4-fragmented': 'mp4',
2335 'x-ms-wmv': 'wmv',
b4173f15
RA
2336 'mpegurl': 'm3u8',
2337 'x-mpegurl': 'm3u8',
2338 'vnd.apple.mpegurl': 'm3u8',
2339 'dash+xml': 'mpd',
b4173f15 2340 'f4m+xml': 'f4m',
f164b971 2341 'hds+xml': 'f4m',
e910fe2f 2342 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 2343 'quicktime': 'mov',
98ce1a3f 2344 'mp2t': 'ts',
c460bdd5
PH
2345 }.get(res, res)
2346
2347
4f3c5e06 2348def parse_codecs(codecs_str):
2349 # http://tools.ietf.org/html/rfc6381
2350 if not codecs_str:
2351 return {}
2352 split_codecs = list(filter(None, map(
2353 lambda s: s.strip(), codecs_str.strip().strip(',').split(','))))
2354 vcodec, acodec = None, None
2355 for full_codec in split_codecs:
2356 codec = full_codec.split('.')[0]
2357 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
2358 if not vcodec:
2359 vcodec = full_codec
60f5c9fb 2360 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
4f3c5e06 2361 if not acodec:
2362 acodec = full_codec
2363 else:
60f5c9fb 2364 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
4f3c5e06 2365 if not vcodec and not acodec:
2366 if len(split_codecs) == 2:
2367 return {
2368 'vcodec': vcodec,
2369 'acodec': acodec,
2370 }
2371 elif len(split_codecs) == 1:
2372 return {
2373 'vcodec': 'none',
2374 'acodec': vcodec,
2375 }
2376 else:
2377 return {
2378 'vcodec': vcodec or 'none',
2379 'acodec': acodec or 'none',
2380 }
2381 return {}
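Two hedged parse_codecs examples (the CODECS strings are typical made-up DASH/HLS values):

from youtube_dl.utils import parse_codecs

print(parse_codecs('avc1.64001f,mp4a.40.2'))
# {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2'}
print(parse_codecs('mp4a.40.2'))
# {'vcodec': 'none', 'acodec': 'mp4a.40.2'}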
2382
2383
2ccd1b10 2384def urlhandle_detect_ext(url_handle):
79298173 2385 getheader = url_handle.headers.get
2ccd1b10 2386
b55ee18f
PH
2387 cd = getheader('Content-Disposition')
2388 if cd:
2389 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2390 if m:
2391 e = determine_ext(m.group('filename'), default_ext=None)
2392 if e:
2393 return e
2394
c460bdd5 2395 return mimetype2ext(getheader('Content-Type'))
05900629
PH
2396
2397
1e399778
YCH
2398def encode_data_uri(data, mime_type):
2399 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
2400
2401
05900629 2402def age_restricted(content_limit, age_limit):
6ec6cb4e 2403 """ Returns True iff the content should be blocked """
05900629
PH
2404
2405 if age_limit is None: # No limit set
2406 return False
2407 if content_limit is None:
2408 return False # Content available for everyone
2409 return age_limit < content_limit
61ca9a80
PH
2410
2411
2412def is_html(first_bytes):
2413 """ Detect whether a file contains HTML by examining its first bytes. """
2414
2415 BOMS = [
2416 (b'\xef\xbb\xbf', 'utf-8'),
2417 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2418 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2419 (b'\xff\xfe', 'utf-16-le'),
2420 (b'\xfe\xff', 'utf-16-be'),
2421 ]
2422 for bom, enc in BOMS:
2423 if first_bytes.startswith(bom):
2424 s = first_bytes[len(bom):].decode(enc, 'replace')
2425 break
2426 else:
2427 s = first_bytes.decode('utf-8', 'replace')
2428
2429 return re.match(r'^\s*<', s)
a055469f
PH
2430
2431
2432def determine_protocol(info_dict):
2433 protocol = info_dict.get('protocol')
2434 if protocol is not None:
2435 return protocol
2436
2437 url = info_dict['url']
2438 if url.startswith('rtmp'):
2439 return 'rtmp'
2440 elif url.startswith('mms'):
2441 return 'mms'
2442 elif url.startswith('rtsp'):
2443 return 'rtsp'
2444
2445 ext = determine_ext(url)
2446 if ext == 'm3u8':
2447 return 'm3u8'
2448 elif ext == 'f4m':
2449 return 'f4m'
2450
2451 return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2452
2453
2454def render_table(header_row, data):
2455 """ Render a list of rows, each as a list of values """
2456 table = [header_row] + data
2457 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
2458 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
2459 return '\n'.join(format_str % tuple(row) for row in table)
347de493
PH
2460
2461
2462def _match_one(filter_part, dct):
2463 COMPARISON_OPERATORS = {
2464 '<': operator.lt,
2465 '<=': operator.le,
2466 '>': operator.gt,
2467 '>=': operator.ge,
2468 '=': operator.eq,
2469 '!=': operator.ne,
2470 }
2471 operator_rex = re.compile(r'''(?x)\s*
2472 (?P<key>[a-z_]+)
2473 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2474 (?:
2475 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
db13c16e 2476 (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
347de493
PH
2477 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2478 )
2479 \s*$
2480 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2481 m = operator_rex.search(filter_part)
2482 if m:
2483 op = COMPARISON_OPERATORS[m.group('op')]
e5a088dc 2484 actual_value = dct.get(m.group('key'))
db13c16e
S
2485 if (m.group('quotedstrval') is not None or
2486 m.group('strval') is not None or
e5a088dc
S
2487 # If the original field is a string and the matching comparison value is
2488 # a number, we should respect the origin of the original field
2489 # and process comparison value as a string (see
2490 # https://github.com/rg3/youtube-dl/issues/11082).
2491 actual_value is not None and m.group('intval') is not None and
2492 isinstance(actual_value, compat_str)):
347de493
PH
2493 if m.group('op') not in ('=', '!='):
2494 raise ValueError(
2495 'Operator %s does not support string values!' % m.group('op'))
db13c16e
S
2496 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
2497 quote = m.group('quote')
2498 if quote is not None:
2499 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
347de493
PH
2500 else:
2501 try:
2502 comparison_value = int(m.group('intval'))
2503 except ValueError:
2504 comparison_value = parse_filesize(m.group('intval'))
2505 if comparison_value is None:
2506 comparison_value = parse_filesize(m.group('intval') + 'B')
2507 if comparison_value is None:
2508 raise ValueError(
2509 'Invalid integer value %r in filter part %r' % (
2510 m.group('intval'), filter_part))
347de493
PH
2511 if actual_value is None:
2512 return m.group('none_inclusive')
2513 return op(actual_value, comparison_value)
2514
2515 UNARY_OPERATORS = {
2516 '': lambda v: v is not None,
2517 '!': lambda v: v is None,
2518 }
2519 operator_rex = re.compile(r'''(?x)\s*
2520 (?P<op>%s)\s*(?P<key>[a-z_]+)
2521 \s*$
2522 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2523 m = operator_rex.search(filter_part)
2524 if m:
2525 op = UNARY_OPERATORS[m.group('op')]
2526 actual_value = dct.get(m.group('key'))
2527 return op(actual_value)
2528
2529 raise ValueError('Invalid filter part %r' % filter_part)
2530
2531
2532def match_str(filter_str, dct):
2533 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2534
2535 return all(
2536 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
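An illustrative match_str sketch combining a comparison filter and a unary filter (the video dict is invented):

from youtube_dl.utils import match_str

video = {'duration': 700, 'like_count': 150, 'is_live': None}
print(match_str('duration > 600 & like_count > 100', video))  # True
print(match_str('!is_live & duration < 600', video))          # False (duration fails)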
2537
2538
2539def match_filter_func(filter_str):
2540 def _match_func(info_dict):
2541 if match_str(filter_str, info_dict):
2542 return None
2543 else:
2544 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2545 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2546 return _match_func
91410c9b
PH
2547
2548
bf6427d2
YCH
2549def parse_dfxp_time_expr(time_expr):
2550 if not time_expr:
d631d5f9 2551 return
bf6427d2
YCH
2552
2553 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2554 if mobj:
2555 return float(mobj.group('time_offset'))
2556
db2fe38b 2557 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 2558 if mobj:
db2fe38b 2559 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
2560
2561
c1c924ab
YCH
2562def srt_subtitles_timecode(seconds):
2563 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
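A small round-trip sketch for the two time helpers above (values invented):

from youtube_dl.utils import parse_dfxp_time_expr, srt_subtitles_timecode

t = parse_dfxp_time_expr('00:01:30.5')  # 90.5 (seconds)
print(srt_subtitles_timecode(t))        # expected: 00:01:30,500
print(parse_dfxp_time_expr('12.25s'))   # 12.25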
bf6427d2
YCH
2564
2565
2566def dfxp2srt(dfxp_data):
5b995f71
RA
2567 LEGACY_NAMESPACES = (
2568 ('http://www.w3.org/ns/ttml', [
2569 'http://www.w3.org/2004/11/ttaf1',
2570 'http://www.w3.org/2006/04/ttaf1',
2571 'http://www.w3.org/2006/10/ttaf1',
2572 ]),
2573 ('http://www.w3.org/ns/ttml#styling', [
2574 'http://www.w3.org/ns/ttml#style',
2575 ]),
2576 )
2577
2578 SUPPORTED_STYLING = [
2579 'color',
2580 'fontFamily',
2581 'fontSize',
2582 'fontStyle',
2583 'fontWeight',
2584 'textDecoration'
2585 ]
2586
4e335771
YCH
2587 _x = functools.partial(xpath_with_ns, ns_map={
2588 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 2589 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 2590 })
bf6427d2 2591
5b995f71
RA
2592 styles = {}
2593 default_style = {}
2594
87de7069 2595 class TTMLPElementParser(object):
5b995f71
RA
2596 _out = ''
2597 _unclosed_elements = []
2598 _applied_styles = []
bf6427d2 2599
2b14cb56 2600 def start(self, tag, attrib):
5b995f71
RA
2601 if tag in (_x('ttml:br'), 'br'):
2602 self._out += '\n'
2603 else:
2604 unclosed_elements = []
2605 style = {}
2606 element_style_id = attrib.get('style')
2607 if default_style:
2608 style.update(default_style)
2609 if element_style_id:
2610 style.update(styles.get(element_style_id, {}))
2611 for prop in SUPPORTED_STYLING:
2612 prop_val = attrib.get(_x('tts:' + prop))
2613 if prop_val:
2614 style[prop] = prop_val
2615 if style:
2616 font = ''
2617 for k, v in sorted(style.items()):
2618 if self._applied_styles and self._applied_styles[-1].get(k) == v:
2619 continue
2620 if k == 'color':
2621 font += ' color="%s"' % v
2622 elif k == 'fontSize':
2623 font += ' size="%s"' % v
2624 elif k == 'fontFamily':
2625 font += ' face="%s"' % v
2626 elif k == 'fontWeight' and v == 'bold':
2627 self._out += '<b>'
2628 unclosed_elements.append('b')
2629 elif k == 'fontStyle' and v == 'italic':
2630 self._out += '<i>'
2631 unclosed_elements.append('i')
2632 elif k == 'textDecoration' and v == 'underline':
2633 self._out += '<u>'
2634 unclosed_elements.append('u')
2635 if font:
2636 self._out += '<font' + font + '>'
2637 unclosed_elements.append('font')
2638 applied_style = {}
2639 if self._applied_styles:
2640 applied_style.update(self._applied_styles[-1])
2641 applied_style.update(style)
2642 self._applied_styles.append(applied_style)
2643 self._unclosed_elements.append(unclosed_elements)
bf6427d2 2644
2b14cb56 2645 def end(self, tag):
5b995f71
RA
2646 if tag not in (_x('ttml:br'), 'br'):
2647 unclosed_elements = self._unclosed_elements.pop()
2648 for element in reversed(unclosed_elements):
2649 self._out += '</%s>' % element
2650 if unclosed_elements and self._applied_styles:
2651 self._applied_styles.pop()
bf6427d2 2652
2b14cb56 2653 def data(self, data):
5b995f71 2654 self._out += data
2b14cb56 2655
2656 def close(self):
5b995f71 2657 return self._out.strip()
2b14cb56 2658
2659 def parse_node(node):
2660 target = TTMLPElementParser()
2661 parser = xml.etree.ElementTree.XMLParser(target=target)
2662 parser.feed(xml.etree.ElementTree.tostring(node))
2663 return parser.close()
bf6427d2 2664
5b995f71
RA
2665 for k, v in LEGACY_NAMESPACES:
2666 for ns in v:
2667 dfxp_data = dfxp_data.replace(ns, k)
2668
36e6f62c 2669 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
bf6427d2 2670 out = []
5b995f71 2671 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
2672
2673 if not paras:
2674 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 2675
5b995f71
RA
2676 repeat = False
2677 while True:
2678 for style in dfxp.findall(_x('.//ttml:style')):
2679 style_id = style.get('id')
2680 parent_style_id = style.get('style')
2681 if parent_style_id:
2682 if parent_style_id not in styles:
2683 repeat = True
2684 continue
2685 styles[style_id] = styles[parent_style_id].copy()
2686 for prop in SUPPORTED_STYLING:
2687 prop_val = style.get(_x('tts:' + prop))
2688 if prop_val:
2689 styles.setdefault(style_id, {})[prop] = prop_val
2690 if repeat:
2691 repeat = False
2692 else:
2693 break
2694
2695 for p in ('body', 'div'):
2696 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
2697 if ele is None:
2698 continue
2699 style = styles.get(ele.get('style'))
2700 if not style:
2701 continue
2702 default_style.update(style)
2703
bf6427d2 2704 for para, index in zip(paras, itertools.count(1)):
d631d5f9 2705 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 2706 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
2707 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2708 if begin_time is None:
2709 continue
7dff0363 2710 if not end_time:
d631d5f9
YCH
2711 if not dur:
2712 continue
2713 end_time = begin_time + dur
bf6427d2
YCH
2714 out.append('%d\n%s --> %s\n%s\n\n' % (
2715 index,
c1c924ab
YCH
2716 srt_subtitles_timecode(begin_time),
2717 srt_subtitles_timecode(end_time),
bf6427d2
YCH
2718 parse_node(para)))
2719
2720 return ''.join(out)
2721
2722
66e289ba
S
2723def cli_option(params, command_option, param):
2724 param = params.get(param)
98e698f1
RA
2725 if param:
2726 param = compat_str(param)
66e289ba
S
2727 return [command_option, param] if param is not None else []
2728
2729
2730def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2731 param = params.get(param)
2732 assert isinstance(param, bool)
2733 if separator:
2734 return [command_option + separator + (true_value if param else false_value)]
2735 return [command_option, true_value if param else false_value]
2736
2737
2738def cli_valueless_option(params, command_option, param, expected_value=True):
2739 param = params.get(param)
2740 return [command_option] if param == expected_value else []
2741
2742
2743def cli_configuration_args(params, param, default=[]):
2744 ex_args = params.get(param)
2745 if ex_args is None:
2746 return default
2747 assert isinstance(ex_args, list)
2748 return ex_args
2749
2750
39672624
YCH
2751class ISO639Utils(object):
2752 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2753 _lang_map = {
2754 'aa': 'aar',
2755 'ab': 'abk',
2756 'ae': 'ave',
2757 'af': 'afr',
2758 'ak': 'aka',
2759 'am': 'amh',
2760 'an': 'arg',
2761 'ar': 'ara',
2762 'as': 'asm',
2763 'av': 'ava',
2764 'ay': 'aym',
2765 'az': 'aze',
2766 'ba': 'bak',
2767 'be': 'bel',
2768 'bg': 'bul',
2769 'bh': 'bih',
2770 'bi': 'bis',
2771 'bm': 'bam',
2772 'bn': 'ben',
2773 'bo': 'bod',
2774 'br': 'bre',
2775 'bs': 'bos',
2776 'ca': 'cat',
2777 'ce': 'che',
2778 'ch': 'cha',
2779 'co': 'cos',
2780 'cr': 'cre',
2781 'cs': 'ces',
2782 'cu': 'chu',
2783 'cv': 'chv',
2784 'cy': 'cym',
2785 'da': 'dan',
2786 'de': 'deu',
2787 'dv': 'div',
2788 'dz': 'dzo',
2789 'ee': 'ewe',
2790 'el': 'ell',
2791 'en': 'eng',
2792 'eo': 'epo',
2793 'es': 'spa',
2794 'et': 'est',
2795 'eu': 'eus',
2796 'fa': 'fas',
2797 'ff': 'ful',
2798 'fi': 'fin',
2799 'fj': 'fij',
2800 'fo': 'fao',
2801 'fr': 'fra',
2802 'fy': 'fry',
2803 'ga': 'gle',
2804 'gd': 'gla',
2805 'gl': 'glg',
2806 'gn': 'grn',
2807 'gu': 'guj',
2808 'gv': 'glv',
2809 'ha': 'hau',
2810 'he': 'heb',
2811 'hi': 'hin',
2812 'ho': 'hmo',
2813 'hr': 'hrv',
2814 'ht': 'hat',
2815 'hu': 'hun',
2816 'hy': 'hye',
2817 'hz': 'her',
2818 'ia': 'ina',
2819 'id': 'ind',
2820 'ie': 'ile',
2821 'ig': 'ibo',
2822 'ii': 'iii',
2823 'ik': 'ipk',
2824 'io': 'ido',
2825 'is': 'isl',
2826 'it': 'ita',
2827 'iu': 'iku',
2828 'ja': 'jpn',
2829 'jv': 'jav',
2830 'ka': 'kat',
2831 'kg': 'kon',
2832 'ki': 'kik',
2833 'kj': 'kua',
2834 'kk': 'kaz',
2835 'kl': 'kal',
2836 'km': 'khm',
2837 'kn': 'kan',
2838 'ko': 'kor',
2839 'kr': 'kau',
2840 'ks': 'kas',
2841 'ku': 'kur',
2842 'kv': 'kom',
2843 'kw': 'cor',
2844 'ky': 'kir',
2845 'la': 'lat',
2846 'lb': 'ltz',
2847 'lg': 'lug',
2848 'li': 'lim',
2849 'ln': 'lin',
2850 'lo': 'lao',
2851 'lt': 'lit',
2852 'lu': 'lub',
2853 'lv': 'lav',
2854 'mg': 'mlg',
2855 'mh': 'mah',
2856 'mi': 'mri',
2857 'mk': 'mkd',
2858 'ml': 'mal',
2859 'mn': 'mon',
2860 'mr': 'mar',
2861 'ms': 'msa',
2862 'mt': 'mlt',
2863 'my': 'mya',
2864 'na': 'nau',
2865 'nb': 'nob',
2866 'nd': 'nde',
2867 'ne': 'nep',
2868 'ng': 'ndo',
2869 'nl': 'nld',
2870 'nn': 'nno',
2871 'no': 'nor',
2872 'nr': 'nbl',
2873 'nv': 'nav',
2874 'ny': 'nya',
2875 'oc': 'oci',
2876 'oj': 'oji',
2877 'om': 'orm',
2878 'or': 'ori',
2879 'os': 'oss',
2880 'pa': 'pan',
2881 'pi': 'pli',
2882 'pl': 'pol',
2883 'ps': 'pus',
2884 'pt': 'por',
2885 'qu': 'que',
2886 'rm': 'roh',
2887 'rn': 'run',
2888 'ro': 'ron',
2889 'ru': 'rus',
2890 'rw': 'kin',
2891 'sa': 'san',
2892 'sc': 'srd',
2893 'sd': 'snd',
2894 'se': 'sme',
2895 'sg': 'sag',
2896 'si': 'sin',
2897 'sk': 'slk',
2898 'sl': 'slv',
2899 'sm': 'smo',
2900 'sn': 'sna',
2901 'so': 'som',
2902 'sq': 'sqi',
2903 'sr': 'srp',
2904 'ss': 'ssw',
2905 'st': 'sot',
2906 'su': 'sun',
2907 'sv': 'swe',
2908 'sw': 'swa',
2909 'ta': 'tam',
2910 'te': 'tel',
2911 'tg': 'tgk',
2912 'th': 'tha',
2913 'ti': 'tir',
2914 'tk': 'tuk',
2915 'tl': 'tgl',
2916 'tn': 'tsn',
2917 'to': 'ton',
2918 'tr': 'tur',
2919 'ts': 'tso',
2920 'tt': 'tat',
2921 'tw': 'twi',
2922 'ty': 'tah',
2923 'ug': 'uig',
2924 'uk': 'ukr',
2925 'ur': 'urd',
2926 'uz': 'uzb',
2927 've': 'ven',
2928 'vi': 'vie',
2929 'vo': 'vol',
2930 'wa': 'wln',
2931 'wo': 'wol',
2932 'xh': 'xho',
2933 'yi': 'yid',
2934 'yo': 'yor',
2935 'za': 'zha',
2936 'zh': 'zho',
2937 'zu': 'zul',
2938 }
2939
2940 @classmethod
2941 def short2long(cls, code):
2942 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2943 return cls._lang_map.get(code[:2])
2944
2945 @classmethod
2946 def long2short(cls, code):
2947 """Convert language code from ISO 639-2/T to ISO 639-1"""
2948 for short_name, long_name in cls._lang_map.items():
2949 if long_name == code:
2950 return short_name
2951
2952
4eb10f66
YCH
2953class ISO3166Utils(object):
2954 # From http://data.okfn.org/data/core/country-list
2955 _country_map = {
2956 'AF': 'Afghanistan',
2957 'AX': 'Åland Islands',
2958 'AL': 'Albania',
2959 'DZ': 'Algeria',
2960 'AS': 'American Samoa',
2961 'AD': 'Andorra',
2962 'AO': 'Angola',
2963 'AI': 'Anguilla',
2964 'AQ': 'Antarctica',
2965 'AG': 'Antigua and Barbuda',
2966 'AR': 'Argentina',
2967 'AM': 'Armenia',
2968 'AW': 'Aruba',
2969 'AU': 'Australia',
2970 'AT': 'Austria',
2971 'AZ': 'Azerbaijan',
2972 'BS': 'Bahamas',
2973 'BH': 'Bahrain',
2974 'BD': 'Bangladesh',
2975 'BB': 'Barbados',
2976 'BY': 'Belarus',
2977 'BE': 'Belgium',
2978 'BZ': 'Belize',
2979 'BJ': 'Benin',
2980 'BM': 'Bermuda',
2981 'BT': 'Bhutan',
2982 'BO': 'Bolivia, Plurinational State of',
2983 'BQ': 'Bonaire, Sint Eustatius and Saba',
2984 'BA': 'Bosnia and Herzegovina',
2985 'BW': 'Botswana',
2986 'BV': 'Bouvet Island',
2987 'BR': 'Brazil',
2988 'IO': 'British Indian Ocean Territory',
2989 'BN': 'Brunei Darussalam',
2990 'BG': 'Bulgaria',
2991 'BF': 'Burkina Faso',
2992 'BI': 'Burundi',
2993 'KH': 'Cambodia',
2994 'CM': 'Cameroon',
2995 'CA': 'Canada',
2996 'CV': 'Cape Verde',
2997 'KY': 'Cayman Islands',
2998 'CF': 'Central African Republic',
2999 'TD': 'Chad',
3000 'CL': 'Chile',
3001 'CN': 'China',
3002 'CX': 'Christmas Island',
3003 'CC': 'Cocos (Keeling) Islands',
3004 'CO': 'Colombia',
3005 'KM': 'Comoros',
3006 'CG': 'Congo',
3007 'CD': 'Congo, the Democratic Republic of the',
3008 'CK': 'Cook Islands',
3009 'CR': 'Costa Rica',
3010 'CI': 'Côte d\'Ivoire',
3011 'HR': 'Croatia',
3012 'CU': 'Cuba',
3013 'CW': 'Curaçao',
3014 'CY': 'Cyprus',
3015 'CZ': 'Czech Republic',
3016 'DK': 'Denmark',
3017 'DJ': 'Djibouti',
3018 'DM': 'Dominica',
3019 'DO': 'Dominican Republic',
3020 'EC': 'Ecuador',
3021 'EG': 'Egypt',
3022 'SV': 'El Salvador',
3023 'GQ': 'Equatorial Guinea',
3024 'ER': 'Eritrea',
3025 'EE': 'Estonia',
3026 'ET': 'Ethiopia',
3027 'FK': 'Falkland Islands (Malvinas)',
3028 'FO': 'Faroe Islands',
3029 'FJ': 'Fiji',
3030 'FI': 'Finland',
3031 'FR': 'France',
3032 'GF': 'French Guiana',
3033 'PF': 'French Polynesia',
3034 'TF': 'French Southern Territories',
3035 'GA': 'Gabon',
3036 'GM': 'Gambia',
3037 'GE': 'Georgia',
3038 'DE': 'Germany',
3039 'GH': 'Ghana',
3040 'GI': 'Gibraltar',
3041 'GR': 'Greece',
3042 'GL': 'Greenland',
3043 'GD': 'Grenada',
3044 'GP': 'Guadeloupe',
3045 'GU': 'Guam',
3046 'GT': 'Guatemala',
3047 'GG': 'Guernsey',
3048 'GN': 'Guinea',
3049 'GW': 'Guinea-Bissau',
3050 'GY': 'Guyana',
3051 'HT': 'Haiti',
3052 'HM': 'Heard Island and McDonald Islands',
3053 'VA': 'Holy See (Vatican City State)',
3054 'HN': 'Honduras',
3055 'HK': 'Hong Kong',
3056 'HU': 'Hungary',
3057 'IS': 'Iceland',
3058 'IN': 'India',
3059 'ID': 'Indonesia',
3060 'IR': 'Iran, Islamic Republic of',
3061 'IQ': 'Iraq',
3062 'IE': 'Ireland',
3063 'IM': 'Isle of Man',
3064 'IL': 'Israel',
3065 'IT': 'Italy',
3066 'JM': 'Jamaica',
3067 'JP': 'Japan',
3068 'JE': 'Jersey',
3069 'JO': 'Jordan',
3070 'KZ': 'Kazakhstan',
3071 'KE': 'Kenya',
3072 'KI': 'Kiribati',
3073 'KP': 'Korea, Democratic People\'s Republic of',
3074 'KR': 'Korea, Republic of',
3075 'KW': 'Kuwait',
3076 'KG': 'Kyrgyzstan',
3077 'LA': 'Lao People\'s Democratic Republic',
3078 'LV': 'Latvia',
3079 'LB': 'Lebanon',
3080 'LS': 'Lesotho',
3081 'LR': 'Liberia',
3082 'LY': 'Libya',
3083 'LI': 'Liechtenstein',
3084 'LT': 'Lithuania',
3085 'LU': 'Luxembourg',
3086 'MO': 'Macao',
3087 'MK': 'Macedonia, the Former Yugoslav Republic of',
3088 'MG': 'Madagascar',
3089 'MW': 'Malawi',
3090 'MY': 'Malaysia',
3091 'MV': 'Maldives',
3092 'ML': 'Mali',
3093 'MT': 'Malta',
3094 'MH': 'Marshall Islands',
3095 'MQ': 'Martinique',
3096 'MR': 'Mauritania',
3097 'MU': 'Mauritius',
3098 'YT': 'Mayotte',
3099 'MX': 'Mexico',
3100 'FM': 'Micronesia, Federated States of',
3101 'MD': 'Moldova, Republic of',
3102 'MC': 'Monaco',
3103 'MN': 'Mongolia',
3104 'ME': 'Montenegro',
3105 'MS': 'Montserrat',
3106 'MA': 'Morocco',
3107 'MZ': 'Mozambique',
3108 'MM': 'Myanmar',
3109 'NA': 'Namibia',
3110 'NR': 'Nauru',
3111 'NP': 'Nepal',
3112 'NL': 'Netherlands',
3113 'NC': 'New Caledonia',
3114 'NZ': 'New Zealand',
3115 'NI': 'Nicaragua',
3116 'NE': 'Niger',
3117 'NG': 'Nigeria',
3118 'NU': 'Niue',
3119 'NF': 'Norfolk Island',
3120 'MP': 'Northern Mariana Islands',
3121 'NO': 'Norway',
3122 'OM': 'Oman',
3123 'PK': 'Pakistan',
3124 'PW': 'Palau',
3125 'PS': 'Palestine, State of',
3126 'PA': 'Panama',
3127 'PG': 'Papua New Guinea',
3128 'PY': 'Paraguay',
3129 'PE': 'Peru',
3130 'PH': 'Philippines',
3131 'PN': 'Pitcairn',
3132 'PL': 'Poland',
3133 'PT': 'Portugal',
3134 'PR': 'Puerto Rico',
3135 'QA': 'Qatar',
3136 'RE': 'Réunion',
3137 'RO': 'Romania',
3138 'RU': 'Russian Federation',
3139 'RW': 'Rwanda',
3140 'BL': 'Saint Barthélemy',
3141 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3142 'KN': 'Saint Kitts and Nevis',
3143 'LC': 'Saint Lucia',
3144 'MF': 'Saint Martin (French part)',
3145 'PM': 'Saint Pierre and Miquelon',
3146 'VC': 'Saint Vincent and the Grenadines',
3147 'WS': 'Samoa',
3148 'SM': 'San Marino',
3149 'ST': 'Sao Tome and Principe',
3150 'SA': 'Saudi Arabia',
3151 'SN': 'Senegal',
3152 'RS': 'Serbia',
3153 'SC': 'Seychelles',
3154 'SL': 'Sierra Leone',
3155 'SG': 'Singapore',
3156 'SX': 'Sint Maarten (Dutch part)',
3157 'SK': 'Slovakia',
3158 'SI': 'Slovenia',
3159 'SB': 'Solomon Islands',
3160 'SO': 'Somalia',
3161 'ZA': 'South Africa',
3162 'GS': 'South Georgia and the South Sandwich Islands',
3163 'SS': 'South Sudan',
3164 'ES': 'Spain',
3165 'LK': 'Sri Lanka',
3166 'SD': 'Sudan',
3167 'SR': 'Suriname',
3168 'SJ': 'Svalbard and Jan Mayen',
3169 'SZ': 'Swaziland',
3170 'SE': 'Sweden',
3171 'CH': 'Switzerland',
3172 'SY': 'Syrian Arab Republic',
3173 'TW': 'Taiwan, Province of China',
3174 'TJ': 'Tajikistan',
3175 'TZ': 'Tanzania, United Republic of',
3176 'TH': 'Thailand',
3177 'TL': 'Timor-Leste',
3178 'TG': 'Togo',
3179 'TK': 'Tokelau',
3180 'TO': 'Tonga',
3181 'TT': 'Trinidad and Tobago',
3182 'TN': 'Tunisia',
3183 'TR': 'Turkey',
3184 'TM': 'Turkmenistan',
3185 'TC': 'Turks and Caicos Islands',
3186 'TV': 'Tuvalu',
3187 'UG': 'Uganda',
3188 'UA': 'Ukraine',
3189 'AE': 'United Arab Emirates',
3190 'GB': 'United Kingdom',
3191 'US': 'United States',
3192 'UM': 'United States Minor Outlying Islands',
3193 'UY': 'Uruguay',
3194 'UZ': 'Uzbekistan',
3195 'VU': 'Vanuatu',
3196 'VE': 'Venezuela, Bolivarian Republic of',
3197 'VN': 'Viet Nam',
3198 'VG': 'Virgin Islands, British',
3199 'VI': 'Virgin Islands, U.S.',
3200 'WF': 'Wallis and Futuna',
3201 'EH': 'Western Sahara',
3202 'YE': 'Yemen',
3203 'ZM': 'Zambia',
3204 'ZW': 'Zimbabwe',
3205 }
3206
3207 @classmethod
3208 def short2full(cls, code):
3209 """Convert an ISO 3166-2 country code to the corresponding full name"""
3210 return cls._country_map.get(code.upper())
3211
3212
773f291d
S
3213class GeoUtils(object):
3214 # Major IPv4 address blocks per country
3215 _country_ip_map = {
3216 'AD': '85.94.160.0/19',
3217 'AE': '94.200.0.0/13',
3218 'AF': '149.54.0.0/17',
3219 'AG': '209.59.64.0/18',
3220 'AI': '204.14.248.0/21',
3221 'AL': '46.99.0.0/16',
3222 'AM': '46.70.0.0/15',
3223 'AO': '105.168.0.0/13',
3224 'AP': '159.117.192.0/21',
3225 'AR': '181.0.0.0/12',
3226 'AS': '202.70.112.0/20',
3227 'AT': '84.112.0.0/13',
3228 'AU': '1.128.0.0/11',
3229 'AW': '181.41.0.0/18',
3230 'AZ': '5.191.0.0/16',
3231 'BA': '31.176.128.0/17',
3232 'BB': '65.48.128.0/17',
3233 'BD': '114.130.0.0/16',
3234 'BE': '57.0.0.0/8',
3235 'BF': '129.45.128.0/17',
3236 'BG': '95.42.0.0/15',
3237 'BH': '37.131.0.0/17',
3238 'BI': '154.117.192.0/18',
3239 'BJ': '137.255.0.0/16',
3240 'BL': '192.131.134.0/24',
3241 'BM': '196.12.64.0/18',
3242 'BN': '156.31.0.0/16',
3243 'BO': '161.56.0.0/16',
3244 'BQ': '161.0.80.0/20',
3245 'BR': '152.240.0.0/12',
3246 'BS': '24.51.64.0/18',
3247 'BT': '119.2.96.0/19',
3248 'BW': '168.167.0.0/16',
3249 'BY': '178.120.0.0/13',
3250 'BZ': '179.42.192.0/18',
3251 'CA': '99.224.0.0/11',
3252 'CD': '41.243.0.0/16',
3253 'CF': '196.32.200.0/21',
3254 'CG': '197.214.128.0/17',
3255 'CH': '85.0.0.0/13',
3256 'CI': '154.232.0.0/14',
3257 'CK': '202.65.32.0/19',
3258 'CL': '152.172.0.0/14',
3259 'CM': '165.210.0.0/15',
3260 'CN': '36.128.0.0/10',
3261 'CO': '181.240.0.0/12',
3262 'CR': '201.192.0.0/12',
3263 'CU': '152.206.0.0/15',
3264 'CV': '165.90.96.0/19',
3265 'CW': '190.88.128.0/17',
3266 'CY': '46.198.0.0/15',
3267 'CZ': '88.100.0.0/14',
3268 'DE': '53.0.0.0/8',
3269 'DJ': '197.241.0.0/17',
3270 'DK': '87.48.0.0/12',
3271 'DM': '192.243.48.0/20',
3272 'DO': '152.166.0.0/15',
3273 'DZ': '41.96.0.0/12',
3274 'EC': '186.68.0.0/15',
3275 'EE': '90.190.0.0/15',
3276 'EG': '156.160.0.0/11',
3277 'ER': '196.200.96.0/20',
3278 'ES': '88.0.0.0/11',
3279 'ET': '196.188.0.0/14',
3280 'EU': '2.16.0.0/13',
3281 'FI': '91.152.0.0/13',
3282 'FJ': '144.120.0.0/16',
3283 'FM': '119.252.112.0/20',
3284 'FO': '88.85.32.0/19',
3285 'FR': '90.0.0.0/9',
3286 'GA': '41.158.0.0/15',
3287 'GB': '25.0.0.0/8',
3288 'GD': '74.122.88.0/21',
3289 'GE': '31.146.0.0/16',
3290 'GF': '161.22.64.0/18',
3291 'GG': '62.68.160.0/19',
3292 'GH': '45.208.0.0/14',
3293 'GI': '85.115.128.0/19',
3294 'GL': '88.83.0.0/19',
3295 'GM': '160.182.0.0/15',
3296 'GN': '197.149.192.0/18',
3297 'GP': '104.250.0.0/19',
3298 'GQ': '105.235.224.0/20',
3299 'GR': '94.64.0.0/13',
3300 'GT': '168.234.0.0/16',
3301 'GU': '168.123.0.0/16',
3302 'GW': '197.214.80.0/20',
3303 'GY': '181.41.64.0/18',
3304 'HK': '113.252.0.0/14',
3305 'HN': '181.210.0.0/16',
3306 'HR': '93.136.0.0/13',
3307 'HT': '148.102.128.0/17',
3308 'HU': '84.0.0.0/14',
3309 'ID': '39.192.0.0/10',
3310 'IE': '87.32.0.0/12',
3311 'IL': '79.176.0.0/13',
3312 'IM': '5.62.80.0/20',
3313 'IN': '117.192.0.0/10',
3314 'IO': '203.83.48.0/21',
3315 'IQ': '37.236.0.0/14',
3316 'IR': '2.176.0.0/12',
3317 'IS': '82.221.0.0/16',
3318 'IT': '79.0.0.0/10',
3319 'JE': '87.244.64.0/18',
3320 'JM': '72.27.0.0/17',
3321 'JO': '176.29.0.0/16',
3322 'JP': '126.0.0.0/8',
3323 'KE': '105.48.0.0/12',
3324 'KG': '158.181.128.0/17',
3325 'KH': '36.37.128.0/17',
3326 'KI': '103.25.140.0/22',
3327 'KM': '197.255.224.0/20',
3328 'KN': '198.32.32.0/19',
3329 'KP': '175.45.176.0/22',
3330 'KR': '175.192.0.0/10',
3331 'KW': '37.36.0.0/14',
3332 'KY': '64.96.0.0/15',
3333 'KZ': '2.72.0.0/13',
3334 'LA': '115.84.64.0/18',
3335 'LB': '178.135.0.0/16',
3336 'LC': '192.147.231.0/24',
3337 'LI': '82.117.0.0/19',
3338 'LK': '112.134.0.0/15',
3339 'LR': '41.86.0.0/19',
3340 'LS': '129.232.0.0/17',
3341 'LT': '78.56.0.0/13',
3342 'LU': '188.42.0.0/16',
3343 'LV': '46.109.0.0/16',
3344 'LY': '41.252.0.0/14',
3345 'MA': '105.128.0.0/11',
3346 'MC': '88.209.64.0/18',
3347 'MD': '37.246.0.0/16',
3348 'ME': '178.175.0.0/17',
3349 'MF': '74.112.232.0/21',
3350 'MG': '154.126.0.0/17',
3351 'MH': '117.103.88.0/21',
3352 'MK': '77.28.0.0/15',
3353 'ML': '154.118.128.0/18',
3354 'MM': '37.111.0.0/17',
3355 'MN': '49.0.128.0/17',
3356 'MO': '60.246.0.0/16',
3357 'MP': '202.88.64.0/20',
3358 'MQ': '109.203.224.0/19',
3359 'MR': '41.188.64.0/18',
3360 'MS': '208.90.112.0/22',
3361 'MT': '46.11.0.0/16',
3362 'MU': '105.16.0.0/12',
3363 'MV': '27.114.128.0/18',
3364 'MW': '105.234.0.0/16',
3365 'MX': '187.192.0.0/11',
3366 'MY': '175.136.0.0/13',
3367 'MZ': '197.218.0.0/15',
3368 'NA': '41.182.0.0/16',
3369 'NC': '101.101.0.0/18',
3370 'NE': '197.214.0.0/18',
3371 'NF': '203.17.240.0/22',
3372 'NG': '105.112.0.0/12',
3373 'NI': '186.76.0.0/15',
3374 'NL': '145.96.0.0/11',
3375 'NO': '84.208.0.0/13',
3376 'NP': '36.252.0.0/15',
3377 'NR': '203.98.224.0/19',
3378 'NU': '49.156.48.0/22',
3379 'NZ': '49.224.0.0/14',
3380 'OM': '5.36.0.0/15',
3381 'PA': '186.72.0.0/15',
3382 'PE': '186.160.0.0/14',
3383 'PF': '123.50.64.0/18',
3384 'PG': '124.240.192.0/19',
3385 'PH': '49.144.0.0/13',
3386 'PK': '39.32.0.0/11',
3387 'PL': '83.0.0.0/11',
3388 'PM': '70.36.0.0/20',
3389 'PR': '66.50.0.0/16',
3390 'PS': '188.161.0.0/16',
3391 'PT': '85.240.0.0/13',
3392 'PW': '202.124.224.0/20',
3393 'PY': '181.120.0.0/14',
3394 'QA': '37.210.0.0/15',
3395 'RE': '139.26.0.0/16',
3396 'RO': '79.112.0.0/13',
3397 'RS': '178.220.0.0/14',
3398 'RU': '5.136.0.0/13',
3399 'RW': '105.178.0.0/15',
3400 'SA': '188.48.0.0/13',
3401 'SB': '202.1.160.0/19',
3402 'SC': '154.192.0.0/11',
3403 'SD': '154.96.0.0/13',
3404 'SE': '78.64.0.0/12',
3405 'SG': '152.56.0.0/14',
3406 'SI': '188.196.0.0/14',
3407 'SK': '78.98.0.0/15',
3408 'SL': '197.215.0.0/17',
3409 'SM': '89.186.32.0/19',
3410 'SN': '41.82.0.0/15',
3411 'SO': '197.220.64.0/19',
3412 'SR': '186.179.128.0/17',
3413 'SS': '105.235.208.0/21',
3414 'ST': '197.159.160.0/19',
3415 'SV': '168.243.0.0/16',
3416 'SX': '190.102.0.0/20',
3417 'SY': '5.0.0.0/16',
3418 'SZ': '41.84.224.0/19',
3419 'TC': '65.255.48.0/20',
3420 'TD': '154.68.128.0/19',
3421 'TG': '196.168.0.0/14',
3422 'TH': '171.96.0.0/13',
3423 'TJ': '85.9.128.0/18',
3424 'TK': '27.96.24.0/21',
3425 'TL': '180.189.160.0/20',
3426 'TM': '95.85.96.0/19',
3427 'TN': '197.0.0.0/11',
3428 'TO': '175.176.144.0/21',
3429 'TR': '78.160.0.0/11',
3430 'TT': '186.44.0.0/15',
3431 'TV': '202.2.96.0/19',
3432 'TW': '120.96.0.0/11',
3433 'TZ': '156.156.0.0/14',
3434 'UA': '93.72.0.0/13',
3435 'UG': '154.224.0.0/13',
3436 'US': '3.0.0.0/8',
3437 'UY': '167.56.0.0/13',
3438 'UZ': '82.215.64.0/18',
3439 'VA': '212.77.0.0/19',
3440 'VC': '24.92.144.0/20',
3441 'VE': '186.88.0.0/13',
3442 'VG': '172.103.64.0/18',
3443 'VI': '146.226.0.0/16',
3444 'VN': '14.160.0.0/11',
3445 'VU': '202.80.32.0/20',
3446 'WF': '117.20.32.0/21',
3447 'WS': '202.4.32.0/19',
3448 'YE': '134.35.0.0/16',
3449 'YT': '41.242.116.0/22',
3450 'ZA': '41.0.0.0/11',
3451 'ZM': '165.56.0.0/13',
3452 'ZW': '41.85.192.0/19',
3453 }
3454
3455 @classmethod
3456 def random_ipv4(cls, code):
3457 block = cls._country_ip_map.get(code.upper())
3458 if not block:
3459 return None
3460 addr, preflen = block.split('/')
3461 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
3462 addr_max = addr_min | (0xffffffff >> int(preflen))
18a0defa 3463 return compat_str(socket.inet_ntoa(
4248dad9 3464 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
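An illustrative GeoUtils.random_ipv4 call; the address is random but falls inside the block listed above for the given country code:

from youtube_dl.utils import GeoUtils

print(GeoUtils.random_ipv4('US'))  # e.g. some address inside 3.0.0.0/8
print(GeoUtils.random_ipv4('XX'))  # None for a code that is not in the table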
773f291d
S
3465
3466
91410c9b 3467class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2461f79d
PH
3468 def __init__(self, proxies=None):
3469 # Set default handlers
3470 for type in ('http', 'https'):
3471 setattr(self, '%s_open' % type,
3472 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
3473 meth(r, proxy, type))
3474 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
3475
91410c9b 3476 def proxy_open(self, req, proxy, type):
2461f79d 3477 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
3478 if req_proxy is not None:
3479 proxy = req_proxy
2461f79d
PH
3480 del req.headers['Ytdl-request-proxy']
3481
3482 if proxy == '__noproxy__':
3483 return None # No Proxy
51fb4995 3484 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188
YCH
3485 req.add_header('Ytdl-socks-proxy', proxy)
3486 # youtube-dl's http/https handlers do wrapping the socket with socks
3487 return None
91410c9b
PH
3488 return compat_urllib_request.ProxyHandler.proxy_open(
3489 self, req, proxy, type)
5bc880b9
YCH
3490
3491
0a5445dd
YCH
3492# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3493# released into Public Domain
3494# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3495
3496def long_to_bytes(n, blocksize=0):
3497 """long_to_bytes(n:long, blocksize:int) : string
3498 Convert a long integer to a byte string.
3499
3500 If optional blocksize is given and greater than zero, pad the front of the
3501 byte string with binary zeros so that the length is a multiple of
3502 blocksize.
3503 """
3504 # after much testing, this algorithm was deemed to be the fastest
3505 s = b''
3506 n = int(n)
3507 while n > 0:
3508 s = compat_struct_pack('>I', n & 0xffffffff) + s
3509 n = n >> 32
3510 # strip off leading zeros
3511 for i in range(len(s)):
3512 if s[i] != b'\000'[0]:
3513 break
3514 else:
3515 # only happens when n == 0
3516 s = b'\000'
3517 i = 0
3518 s = s[i:]
3519 # add back some pad bytes. this could be done more efficiently w.r.t. the
3520 # de-padding being done above, but sigh...
3521 if blocksize > 0 and len(s) % blocksize:
3522 s = (blocksize - len(s) % blocksize) * b'\000' + s
3523 return s
3524
3525
3526def bytes_to_long(s):
3527 """bytes_to_long(string) : long
3528 Convert a byte string to a long integer.
3529
3530 This is (essentially) the inverse of long_to_bytes().
3531 """
3532 acc = 0
3533 length = len(s)
3534 if length % 4:
3535 extra = (4 - length % 4)
3536 s = b'\000' * extra + s
3537 length = length + extra
3538 for i in range(0, length, 4):
3539 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
3540 return acc
3541
3542
5bc880b9
YCH
3543def ohdave_rsa_encrypt(data, exponent, modulus):
3544 '''
3545 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
3546
3547 Input:
3548 data: data to encrypt, bytes-like object
3549 exponent, modulus: parameters e and N of the RSA algorithm, both integers
3550 Output: hex string of encrypted data
3551
3552 Limitation: supports one block encryption only
3553 '''
3554
3555 payload = int(binascii.hexlify(data[::-1]), 16)
3556 encrypted = pow(payload, exponent, modulus)
3557 return '%x' % encrypted
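
# Editor's sketch (not part of the original module): a toy call with made-up RSA
# parameters; real callers pass the site-provided e and N. Note the input bytes
# are reversed (little-endian) before exponentiation, so this is equivalent to
# '%x' % pow(0x0102, 3, 0xffff).
def _example_ohdave_rsa_encrypt():
    return ohdave_rsa_encrypt(b'\x02\x01', 3, 0xffff)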
3558
3559
3560def pkcs1pad(data, length):
3561 """
3562 Pad the input data using the PKCS#1 scheme
3563
3564 @param {int[]} data input data
3565 @param {int} length target length
3566 @returns {int[]} padded data
3567 """
3568 if len(data) > length - 11:
3569 raise ValueError('Input data too long for PKCS#1 padding')
3570
3571 pseudo_random = [random.randint(1, 254) for _ in range(length - len(data) - 3)]  # padding octets must be nonzero (PKCS#1 v1.5)
3572 return [0, 2] + pseudo_random + [0] + data
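
# Editor's sketch (not part of the original module): the result has the PKCS#1
# v1.5 type-2 layout [0x00, 0x02, padding..., 0x00, data] and is exactly
# `length` ints long.
def _example_pkcs1pad():
    padded = pkcs1pad([1, 2, 3], 16)
    assert len(padded) == 16
    assert padded[:2] == [0, 2] and padded[-4:] == [0, 1, 2, 3]
    return padded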
3573
3574
5eb6bdce 3575def encode_base_n(num, n, table=None):
59f898b7 3576 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
3577 if not table:
3578 table = FULL_TABLE[:n]
3579
3580 if n > len(table):
3581 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
3582
3583 if num == 0:
3584 return table[0]
3585
3586 ret = ''
3587 while num:
3588 ret = table[num % n] + ret
3589 num = num // n
3590 return ret
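
# Editor's sketch (not part of the original module): encode_base_n with the
# default table behaves like a generic base converter; a custom digit table may
# be supplied for non-standard alphabets.
def _example_encode_base_n():
    assert encode_base_n(255, 16) == 'ff'
    assert encode_base_n(0, 36) == '0'
    assert encode_base_n(5, 2, table='ab') == 'bab'  # binary 101 over 'ab'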
3591
3592
3593def decode_packed_codes(code):
06b3fe29 3594 mobj = re.search(PACKED_CODES_RE, code)
3595 obfuscated_code, base, count, symbols = mobj.groups()
3596 base = int(base)
3597 count = int(count)
3598 symbols = symbols.split('|')
3599 symbol_table = {}
3600
3601 while count:
3602 count -= 1
5eb6bdce 3603 base_n_count = encode_base_n(count, base)
3604 symbol_table[base_n_count] = symbols[count] or base_n_count
3605
3606 return re.sub(
3607 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
3608 obfuscated_code)
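
# Editor's sketch (not part of the original module): unpacking a minimal Dean
# Edwards "P.A.C.K.E.R." style snippet; PACKED_CODES_RE is defined earlier in
# this module and the packed string below is a contrived example.
def _example_decode_packed_codes():
    packed = "eval(function(p,a,c,k,e,d){}('0 1',2,2,'foo|bar'.split('|'),0,{}))"
    return decode_packed_codes(packed)  # -> 'foo bar'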
e154c651 3609
3610
3611def parse_m3u8_attributes(attrib):
3612 info = {}
3613 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
3614 if val.startswith('"'):
3615 val = val[1:-1]
3616 info[key] = val
3617 return info
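
# Editor's sketch (not part of the original module): quoted attribute values may
# contain commas and are returned with the surrounding quotes stripped.
def _example_parse_m3u8_attributes():
    attrs = parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.64001f"')
    assert attrs == {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.64001f'}
    return attrs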
3618
3619
3620def urshift(val, n):
3621 return val >> n if val >= 0 else (val + 0x100000000) >> n
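
# Editor's sketch (not part of the original module): urshift mimics JavaScript's
# unsigned right shift (>>>) for values that fit in 32 bits.
def _example_urshift():
    assert urshift(-1, 8) == 0x00ffffff
    assert urshift(16, 4) == 1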
3622
3623
3624# Based on png2str() written by @gdkchan and improved by @yokrysty
3625# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
3626def decode_png(png_data):
3627 # Reference: https://www.w3.org/TR/PNG/
3628 header = png_data[8:]
3629
3630 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
3631 raise IOError('Not a valid PNG file.')
3632
3633 int_map = {1: '>B', 2: '>H', 4: '>I'}
3634 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
3635
3636 chunks = []
3637
3638 while header:
3639 length = unpack_integer(header[:4])
3640 header = header[4:]
3641
3642 chunk_type = header[:4]
3643 header = header[4:]
3644
3645 chunk_data = header[:length]
3646 header = header[length:]
3647
3648 header = header[4:] # Skip CRC
3649
3650 chunks.append({
3651 'type': chunk_type,
3652 'length': length,
3653 'data': chunk_data
3654 })
3655
3656 ihdr = chunks[0]['data']
3657
3658 width = unpack_integer(ihdr[:4])
3659 height = unpack_integer(ihdr[4:8])
3660
3661 idat = b''
3662
3663 for chunk in chunks:
3664 if chunk['type'] == b'IDAT':
3665 idat += chunk['data']
3666
3667 if not idat:
3668 raise IOError('Unable to read PNG data.')
3669
3670 decompressed_data = bytearray(zlib.decompress(idat))
3671
3672 stride = width * 3
3673 pixels = []
3674
3675 def _get_pixel(idx):
3676 x = idx % stride
3677 y = idx // stride
3678 return pixels[y][x]
3679
3680 for y in range(height):
3681 basePos = y * (1 + stride)
3682 filter_type = decompressed_data[basePos]
3683
3684 current_row = []
3685
3686 pixels.append(current_row)
3687
3688 for x in range(stride):
3689 color = decompressed_data[1 + basePos + x]
3690 basex = y * stride + x
3691 left = 0
3692 up = 0
3693
3694 if x > 2:
3695 left = _get_pixel(basex - 3)
3696 if y > 0:
3697 up = _get_pixel(basex - stride)
3698
3699 if filter_type == 1: # Sub
3700 color = (color + left) & 0xff
3701 elif filter_type == 2: # Up
3702 color = (color + up) & 0xff
3703 elif filter_type == 3: # Average
3704 color = (color + ((left + up) >> 1)) & 0xff
3705 elif filter_type == 4: # Paeth
3706 a = left
3707 b = up
3708 c = 0
3709
3710 if x > 2 and y > 0:
3711 c = _get_pixel(basex - stride - 3)
3712
3713 p = a + b - c
3714
3715 pa = abs(p - a)
3716 pb = abs(p - b)
3717 pc = abs(p - c)
3718
3719 if pa <= pb and pa <= pc:
3720 color = (color + a) & 0xff
3721 elif pb <= pc:
3722 color = (color + b) & 0xff
3723 else:
3724 color = (color + c) & 0xff
3725
3726 current_row.append(color)
3727
3728 return width, height, pixels
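
# Editor's sketch (not part of the original module): decode_png takes the raw
# bytes of an 8-bit, non-interlaced RGB PNG (the only layout the decoder above
# handles) and returns rows of per-channel byte values, 3 per pixel.
def _example_decode_png(png_bytes):
    width, height, pixels = decode_png(png_bytes)
    r, g, b = pixels[0][0], pixels[0][1], pixels[0][2]  # first pixel of row 0
    return width, height, (r, g, b)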
3729
3730
3731def write_xattr(path, key, value):
3732 # This mess below finds the best xattr tool for the job
3733 try:
3734 # try the pyxattr module...
3735 import xattr
3736
3737 if hasattr(xattr, 'set'): # pyxattr
3738 # Unicode arguments are not supported in python-pyxattr until
3739 # version 0.5.0
3740 # See https://github.com/rg3/youtube-dl/issues/5498
3741 pyxattr_required_version = '0.5.0'
3742 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
3743 # TODO: fallback to CLI tools
3744 raise XAttrUnavailableError(
3745 'python-pyxattr is detected but is too old. '
3746 'youtube-dl requires %s or above while your version is %s. '
3747 'Falling back to other xattr implementations' % (
3748 pyxattr_required_version, xattr.__version__))
3749
3750 setxattr = xattr.set
3751 else: # xattr
3752 setxattr = xattr.setxattr
3753
3754 try:
53a7e3d2 3755 setxattr(path, key, value)
3756 except EnvironmentError as e:
3757 raise XAttrMetadataError(e.errno, e.strerror)
3758
3759 except ImportError:
3760 if compat_os_name == 'nt':
3761 # Write xattrs to NTFS Alternate Data Streams:
3762 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
3763 assert ':' not in key
3764 assert os.path.exists(path)
3765
3766 ads_fn = path + ':' + key
3767 try:
3768 with open(ads_fn, 'wb') as f:
3769 f.write(value)
3770 except EnvironmentError as e:
3771 raise XAttrMetadataError(e.errno, e.strerror)
3772 else:
3773 user_has_setfattr = check_executable('setfattr', ['--version'])
3774 user_has_xattr = check_executable('xattr', ['-h'])
3775
3776 if user_has_setfattr or user_has_xattr:
3777
3778 value = value.decode('utf-8')
3779 if user_has_setfattr:
3780 executable = 'setfattr'
3781 opts = ['-n', key, '-v', value]
3782 elif user_has_xattr:
3783 executable = 'xattr'
3784 opts = ['-w', key, value]
3785
3786 cmd = ([encodeFilename(executable, True)] +
3787 [encodeArgument(o) for o in opts] +
3788 [encodeFilename(path, True)])
3789
3790 try:
3791 p = subprocess.Popen(
3792 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
3793 except EnvironmentError as e:
3794 raise XAttrMetadataError(e.errno, e.strerror)
3795 stdout, stderr = p.communicate()
3796 stderr = stderr.decode('utf-8', 'replace')
3797 if p.returncode != 0:
3798 raise XAttrMetadataError(p.returncode, stderr)
3799
3800 else:
3801 # On Unix, but we can't find pyxattr, setfattr, or xattr.
3802 if sys.platform.startswith('linux'):
3803 raise XAttrUnavailableError(
3804 "Couldn't find a tool to set the xattrs. "
3805 "Install either the python 'pyxattr' or 'xattr' "
3806 "modules, or the GNU 'attr' package "
3807 "(which contains the 'setfattr' tool).")
3808 else:
3809 raise XAttrUnavailableError(
3810 "Couldn't find a tool to set the xattrs. "
3811 "Install either the python 'xattr' module, "
3812 "or the 'xattr' binary.")
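
# Editor's sketch (not part of the original module): values must be bytes; the
# 'user.xdg.*' key follows the freedesktop.org convention and the file path is
# purely hypothetical.
def _example_write_xattr():
    write_xattr('video.mp4', 'user.xdg.referrer.url',
                'https://example.com/watch'.encode('utf-8'))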
3813
3814
3815def random_birthday(year_field, month_field, day_field):
3816 return {
3817 year_field: str(random.randint(1950, 1995)),
3818 month_field: str(random.randint(1, 12)),
3819 day_field: str(random.randint(1, 31)),
3820 }
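
# Editor's sketch (not part of the original module): the field names are
# caller-supplied form keys and all values come back as strings; note the day is
# drawn from 1-31 regardless of month.
def _example_random_birthday():
    payload = random_birthday('birth_year', 'birth_month', 'birth_day')
    assert set(payload) == {'birth_year', 'birth_month', 'birth_day'}
    return payload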