]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[toutv] fix login(closes 14614)
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd 1#!/usr/bin/env python
dcdb292f 2# coding: utf-8
d77c3dfd 3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
1e399778 6import base64
5bc880b9 7import binascii
912b38b4 8import calendar
676eb3f2 9import codecs
62e609ab 10import contextlib
e3946f98 11import ctypes
c496ca96
PH
12import datetime
13import email.utils
0c265486 14import email.header
f45c185f 15import errno
be4a824d 16import functools
d77c3dfd 17import gzip
03f9daab 18import io
79a2e94e 19import itertools
f4bfd65f 20import json
d77c3dfd 21import locale
02dbf93f 22import math
347de493 23import operator
d77c3dfd 24import os
c496ca96 25import platform
773f291d 26import random
d77c3dfd 27import re
c496ca96 28import socket
79a2e94e 29import ssl
1c088fa8 30import subprocess
d77c3dfd 31import sys
181c8655 32import tempfile
01951dda 33import traceback
bcf89ce6 34import xml.etree.ElementTree
d77c3dfd 35import zlib
d77c3dfd 36
8c25f81b 37from .compat import (
b4a3d461 38 compat_HTMLParseError,
8bb56eee 39 compat_HTMLParser,
8f9312c3 40 compat_basestring,
8c25f81b 41 compat_chr,
36e6f62c 42 compat_etree_fromstring,
51098426 43 compat_expanduser,
8c25f81b 44 compat_html_entities,
55b2f099 45 compat_html_entities_html5,
be4a824d 46 compat_http_client,
c86b6142 47 compat_kwargs,
efa97bdc 48 compat_os_name,
8c25f81b 49 compat_parse_qs,
702ccf2d 50 compat_shlex_quote,
be4a824d 51 compat_socket_create_connection,
8c25f81b 52 compat_str,
edaa23f8 53 compat_struct_pack,
d3f8e038 54 compat_struct_unpack,
8c25f81b
PH
55 compat_urllib_error,
56 compat_urllib_parse,
15707c7e 57 compat_urllib_parse_urlencode,
8c25f81b 58 compat_urllib_parse_urlparse,
7581bfc9 59 compat_urllib_parse_unquote_plus,
8c25f81b
PH
60 compat_urllib_request,
61 compat_urlparse,
810c10ba 62 compat_xpath,
8c25f81b 63)
4644ac55 64
71aff188
YCH
65from .socks import (
66 ProxyType,
67 sockssocket,
68)
69
4644ac55 70
51fb4995
YCH
71def register_socks_protocols():
72 # "Register" SOCKS protocols
d5ae6bb5
YCH
73 # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
74 # URLs with protocols not in urlparse.uses_netloc are not handled correctly
51fb4995
YCH
75 for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
76 if scheme not in compat_urlparse.uses_netloc:
77 compat_urlparse.uses_netloc.append(scheme)
78
79
468e2e92
FV
80# This is not clearly defined otherwise
81compiled_regex_type = type(re.compile(''))
82
3e669f36 83std_headers = {
15d10678 84 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
59ae15a5
PH
85 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
86 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
87 'Accept-Encoding': 'gzip, deflate',
88 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 89}
f427df17 90
5f6a1245 91
fb37eb25
S
92USER_AGENTS = {
93 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
94}
95
96
bf42a990
S
97NO_DEFAULT = object()
98
7105440c
YCH
99ENGLISH_MONTH_NAMES = [
100 'January', 'February', 'March', 'April', 'May', 'June',
101 'July', 'August', 'September', 'October', 'November', 'December']
102
f6717dec
S
103MONTH_NAMES = {
104 'en': ENGLISH_MONTH_NAMES,
105 'fr': [
3e4185c3
S
106 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
107 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
f6717dec 108}
a942d6cb 109
a7aaa398
S
110KNOWN_EXTENSIONS = (
111 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
112 'flv', 'f4v', 'f4a', 'f4b',
113 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
114 'mkv', 'mka', 'mk3d',
115 'avi', 'divx',
116 'mov',
117 'asf', 'wmv', 'wma',
118 '3gp', '3g2',
119 'mp3',
120 'flac',
121 'ape',
122 'wav',
123 'f4f', 'f4m', 'm3u8', 'smil')
124
c587cbb7 125# needed for sanitizing filenames in restricted mode
c8827027 126ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
127 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
128 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
c587cbb7 129
46f59e89
S
130DATE_FORMATS = (
131 '%d %B %Y',
132 '%d %b %Y',
133 '%B %d %Y',
cb655f34
S
134 '%B %dst %Y',
135 '%B %dnd %Y',
136 '%B %dth %Y',
46f59e89 137 '%b %d %Y',
cb655f34
S
138 '%b %dst %Y',
139 '%b %dnd %Y',
140 '%b %dth %Y',
46f59e89
S
141 '%b %dst %Y %I:%M',
142 '%b %dnd %Y %I:%M',
143 '%b %dth %Y %I:%M',
144 '%Y %m %d',
145 '%Y-%m-%d',
146 '%Y/%m/%d',
81c13222 147 '%Y/%m/%d %H:%M',
46f59e89 148 '%Y/%m/%d %H:%M:%S',
0c1c6f4b 149 '%Y-%m-%d %H:%M',
46f59e89
S
150 '%Y-%m-%d %H:%M:%S',
151 '%Y-%m-%d %H:%M:%S.%f',
152 '%d.%m.%Y %H:%M',
153 '%d.%m.%Y %H.%M',
154 '%Y-%m-%dT%H:%M:%SZ',
155 '%Y-%m-%dT%H:%M:%S.%fZ',
156 '%Y-%m-%dT%H:%M:%S.%f0Z',
157 '%Y-%m-%dT%H:%M:%S',
158 '%Y-%m-%dT%H:%M:%S.%f',
159 '%Y-%m-%dT%H:%M',
c6eed6b8
S
160 '%b %d %Y at %H:%M',
161 '%b %d %Y at %H:%M:%S',
46f59e89
S
162)
163
164DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
165DATE_FORMATS_DAY_FIRST.extend([
166 '%d-%m-%Y',
167 '%d.%m.%Y',
168 '%d.%m.%y',
169 '%d/%m/%Y',
170 '%d/%m/%y',
171 '%d/%m/%Y %H:%M:%S',
172])
173
174DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
175DATE_FORMATS_MONTH_FIRST.extend([
176 '%m-%d-%Y',
177 '%m.%d.%Y',
178 '%m/%d/%Y',
179 '%m/%d/%y',
180 '%m/%d/%Y %H:%M:%S',
181])
182
06b3fe29
S
183PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
184
7105440c 185
d77c3dfd 186def preferredencoding():
59ae15a5 187 """Get preferred encoding.
d77c3dfd 188
59ae15a5
PH
189 Returns the best encoding scheme for the system, based on
190 locale.getpreferredencoding() and some further tweaks.
191 """
192 try:
193 pref = locale.getpreferredencoding()
28e614de 194 'TEST'.encode(pref)
70a1165b 195 except Exception:
59ae15a5 196 pref = 'UTF-8'
bae611f2 197
59ae15a5 198 return pref
d77c3dfd 199
f4bfd65f 200
181c8655 201def write_json_file(obj, fn):
1394646a 202 """ Encode obj as JSON and write it to fn, atomically if possible """
181c8655 203
92120217 204 fn = encodeFilename(fn)
61ee5aeb 205 if sys.version_info < (3, 0) and sys.platform != 'win32':
ec5f6016
JMF
206 encoding = get_filesystem_encoding()
207 # os.path.basename returns a bytes object, but NamedTemporaryFile
208 # will fail if the filename contains non ascii characters unless we
209 # use a unicode object
210 path_basename = lambda f: os.path.basename(fn).decode(encoding)
211 # the same for os.path.dirname
212 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
213 else:
214 path_basename = os.path.basename
215 path_dirname = os.path.dirname
216
73159f99
S
217 args = {
218 'suffix': '.tmp',
ec5f6016
JMF
219 'prefix': path_basename(fn) + '.',
220 'dir': path_dirname(fn),
73159f99
S
221 'delete': False,
222 }
223
181c8655
PH
224 # In Python 2.x, json.dump expects a bytestream.
225 # In Python 3.x, it writes to a character stream
226 if sys.version_info < (3, 0):
73159f99 227 args['mode'] = 'wb'
181c8655 228 else:
73159f99
S
229 args.update({
230 'mode': 'w',
231 'encoding': 'utf-8',
232 })
233
c86b6142 234 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
181c8655
PH
235
236 try:
237 with tf:
238 json.dump(obj, tf)
1394646a
IK
239 if sys.platform == 'win32':
240 # Need to remove existing file on Windows, else os.rename raises
241 # WindowsError or FileExistsError.
242 try:
243 os.unlink(fn)
244 except OSError:
245 pass
181c8655 246 os.rename(tf.name, fn)
70a1165b 247 except Exception:
181c8655
PH
248 try:
249 os.remove(tf.name)
250 except OSError:
251 pass
252 raise
253
254
255if sys.version_info >= (2, 7):
ee114368 256 def find_xpath_attr(node, xpath, key, val=None):
59ae56fa 257 """ Find the xpath xpath[@key=val] """
5d2354f1 258 assert re.match(r'^[a-zA-Z_-]+$', key)
ee114368 259 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
59ae56fa
PH
260 return node.find(expr)
261else:
ee114368 262 def find_xpath_attr(node, xpath, key, val=None):
810c10ba 263 for f in node.findall(compat_xpath(xpath)):
ee114368
S
264 if key not in f.attrib:
265 continue
266 if val is None or f.attrib.get(key) == val:
59ae56fa
PH
267 return f
268 return None
269
d7e66d39
JMF
270# On python2.6 the xml.etree.ElementTree.Element methods don't support
271# the namespace parameter
5f6a1245
JW
272
273
d7e66d39
JMF
274def xpath_with_ns(path, ns_map):
275 components = [c.split(':') for c in path.split('/')]
276 replaced = []
277 for c in components:
278 if len(c) == 1:
279 replaced.append(c[0])
280 else:
281 ns, tag = c
282 replaced.append('{%s}%s' % (ns_map[ns], tag))
283 return '/'.join(replaced)
284
d77c3dfd 285
a41fb80c 286def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
578c0745 287 def _find_xpath(xpath):
810c10ba 288 return node.find(compat_xpath(xpath))
578c0745
S
289
290 if isinstance(xpath, (str, compat_str)):
291 n = _find_xpath(xpath)
292 else:
293 for xp in xpath:
294 n = _find_xpath(xp)
295 if n is not None:
296 break
d74bebd5 297
8e636da4 298 if n is None:
bf42a990
S
299 if default is not NO_DEFAULT:
300 return default
301 elif fatal:
bf0ff932
PH
302 name = xpath if name is None else name
303 raise ExtractorError('Could not find XML element %s' % name)
304 else:
305 return None
a41fb80c
S
306 return n
307
308
309def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
8e636da4
S
310 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
311 if n is None or n == default:
312 return n
313 if n.text is None:
314 if default is not NO_DEFAULT:
315 return default
316 elif fatal:
317 name = xpath if name is None else name
318 raise ExtractorError('Could not find XML element\'s text %s' % name)
319 else:
320 return None
321 return n.text
a41fb80c
S
322
323
324def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
325 n = find_xpath_attr(node, xpath, key)
326 if n is None:
327 if default is not NO_DEFAULT:
328 return default
329 elif fatal:
330 name = '%s[@%s]' % (xpath, key) if name is None else name
331 raise ExtractorError('Could not find XML attribute %s' % name)
332 else:
333 return None
334 return n.attrib[key]
bf0ff932
PH
335
336
9e6dd238 337def get_element_by_id(id, html):
43e8fafd 338 """Return the content of the tag with the specified ID in the passed HTML document"""
611c1dd9 339 return get_element_by_attribute('id', id, html)
43e8fafd 340
12ea2f30 341
84c237fb 342def get_element_by_class(class_name, html):
2af12ad9
TC
343 """Return the content of the first tag with the specified class in the passed HTML document"""
344 retval = get_elements_by_class(class_name, html)
345 return retval[0] if retval else None
346
347
348def get_element_by_attribute(attribute, value, html, escape_value=True):
349 retval = get_elements_by_attribute(attribute, value, html, escape_value)
350 return retval[0] if retval else None
351
352
353def get_elements_by_class(class_name, html):
354 """Return the content of all tags with the specified class in the passed HTML document as a list"""
355 return get_elements_by_attribute(
84c237fb
YCH
356 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
357 html, escape_value=False)
358
359
2af12ad9 360def get_elements_by_attribute(attribute, value, html, escape_value=True):
43e8fafd 361 """Return the content of the tag with the specified attribute in the passed HTML document"""
9e6dd238 362
84c237fb
YCH
363 value = re.escape(value) if escape_value else value
364
2af12ad9
TC
365 retlist = []
366 for m in re.finditer(r'''(?xs)
38285056 367 <([a-zA-Z0-9:._-]+)
609ff8ca 368 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
38285056 369 \s+%s=['"]?%s['"]?
609ff8ca 370 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
38285056
PH
371 \s*>
372 (?P<content>.*?)
373 </\1>
2af12ad9
TC
374 ''' % (re.escape(attribute), value), html):
375 res = m.group('content')
38285056 376
2af12ad9
TC
377 if res.startswith('"') or res.startswith("'"):
378 res = res[1:-1]
38285056 379
2af12ad9 380 retlist.append(unescapeHTML(res))
a921f407 381
2af12ad9 382 return retlist
a921f407 383
c5229f39 384
8bb56eee
BF
385class HTMLAttributeParser(compat_HTMLParser):
386 """Trivial HTML parser to gather the attributes for a single element"""
387 def __init__(self):
c5229f39 388 self.attrs = {}
8bb56eee
BF
389 compat_HTMLParser.__init__(self)
390
391 def handle_starttag(self, tag, attrs):
392 self.attrs = dict(attrs)
393
c5229f39 394
8bb56eee
BF
395def extract_attributes(html_element):
396 """Given a string for an HTML element such as
397 <el
398 a="foo" B="bar" c="&98;az" d=boz
399 empty= noval entity="&amp;"
400 sq='"' dq="'"
401 >
402 Decode and return a dictionary of attributes.
403 {
404 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
405 'empty': '', 'noval': None, 'entity': '&',
406 'sq': '"', 'dq': '\''
407 }.
408 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
409 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
410 """
411 parser = HTMLAttributeParser()
b4a3d461
S
412 try:
413 parser.feed(html_element)
414 parser.close()
415 # Older Python may throw HTMLParseError in case of malformed HTML
416 except compat_HTMLParseError:
417 pass
8bb56eee 418 return parser.attrs
9e6dd238 419
c5229f39 420
9e6dd238 421def clean_html(html):
59ae15a5 422 """Clean an HTML snippet into a readable string"""
dd622d7c
PH
423
424 if html is None: # Convenience for sanitizing descriptions etc.
425 return html
426
59ae15a5
PH
427 # Newline vs <br />
428 html = html.replace('\n', ' ')
edd9221c
TF
429 html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
430 html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
59ae15a5
PH
431 # Strip html tags
432 html = re.sub('<.*?>', '', html)
433 # Replace html entities
434 html = unescapeHTML(html)
7decf895 435 return html.strip()
9e6dd238
FV
436
437
d77c3dfd 438def sanitize_open(filename, open_mode):
59ae15a5
PH
439 """Try to open the given filename, and slightly tweak it if this fails.
440
441 Attempts to open the given filename. If this fails, it tries to change
442 the filename slightly, step by step, until it's either able to open it
443 or it fails and raises a final exception, like the standard open()
444 function.
445
446 It returns the tuple (stream, definitive_file_name).
447 """
448 try:
28e614de 449 if filename == '-':
59ae15a5
PH
450 if sys.platform == 'win32':
451 import msvcrt
452 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
898280a0 453 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
59ae15a5
PH
454 stream = open(encodeFilename(filename), open_mode)
455 return (stream, filename)
456 except (IOError, OSError) as err:
f45c185f
PH
457 if err.errno in (errno.EACCES,):
458 raise
59ae15a5 459
f45c185f 460 # In case of error, try to remove win32 forbidden chars
d55de57b 461 alt_filename = sanitize_path(filename)
f45c185f
PH
462 if alt_filename == filename:
463 raise
464 else:
465 # An exception here should be caught in the caller
d55de57b 466 stream = open(encodeFilename(alt_filename), open_mode)
f45c185f 467 return (stream, alt_filename)
d77c3dfd
FV
468
469
470def timeconvert(timestr):
59ae15a5
PH
471 """Convert RFC 2822 defined time string into system timestamp"""
472 timestamp = None
473 timetuple = email.utils.parsedate_tz(timestr)
474 if timetuple is not None:
475 timestamp = email.utils.mktime_tz(timetuple)
476 return timestamp
1c469a94 477
5f6a1245 478
796173d0 479def sanitize_filename(s, restricted=False, is_id=False):
59ae15a5
PH
480 """Sanitizes a string so it could be used as part of a filename.
481 If restricted is set, use a stricter subset of allowed characters.
158af524
S
482 Set is_id if this is not an arbitrary string, but an ID that should be kept
483 if possible.
59ae15a5
PH
484 """
485 def replace_insane(char):
c587cbb7
AT
486 if restricted and char in ACCENT_CHARS:
487 return ACCENT_CHARS[char]
59ae15a5
PH
488 if char == '?' or ord(char) < 32 or ord(char) == 127:
489 return ''
490 elif char == '"':
491 return '' if restricted else '\''
492 elif char == ':':
493 return '_-' if restricted else ' -'
494 elif char in '\\/|*<>':
495 return '_'
627dcfff 496 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
59ae15a5
PH
497 return '_'
498 if restricted and ord(char) > 127:
499 return '_'
500 return char
501
2aeb06d6
PH
502 # Handle timestamps
503 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
28e614de 504 result = ''.join(map(replace_insane, s))
796173d0
PH
505 if not is_id:
506 while '__' in result:
507 result = result.replace('__', '_')
508 result = result.strip('_')
509 # Common case of "Foreign band name - English song title"
510 if restricted and result.startswith('-_'):
511 result = result[2:]
5a42414b
PH
512 if result.startswith('-'):
513 result = '_' + result[len('-'):]
a7440261 514 result = result.lstrip('.')
796173d0
PH
515 if not result:
516 result = '_'
59ae15a5 517 return result
d77c3dfd 518
5f6a1245 519
a2aaf4db
S
520def sanitize_path(s):
521 """Sanitizes and normalizes path on Windows"""
522 if sys.platform != 'win32':
523 return s
be531ef1
S
524 drive_or_unc, _ = os.path.splitdrive(s)
525 if sys.version_info < (2, 7) and not drive_or_unc:
526 drive_or_unc, _ = os.path.splitunc(s)
527 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
528 if drive_or_unc:
a2aaf4db
S
529 norm_path.pop(0)
530 sanitized_path = [
ec85ded8 531 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
a2aaf4db 532 for path_part in norm_path]
be531ef1
S
533 if drive_or_unc:
534 sanitized_path.insert(0, drive_or_unc + os.path.sep)
a2aaf4db
S
535 return os.path.join(*sanitized_path)
536
537
67dda517
S
538# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
539# unwanted failures due to missing protocol
17bcc626
S
540def sanitize_url(url):
541 return 'http:%s' % url if url.startswith('//') else url
542
543
67dda517 544def sanitized_Request(url, *args, **kwargs):
17bcc626 545 return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
67dda517
S
546
547
51098426
S
548def expand_path(s):
549 """Expand shell variables and ~"""
550 return os.path.expandvars(compat_expanduser(s))
551
552
d77c3dfd 553def orderedSet(iterable):
59ae15a5
PH
554 """ Remove all duplicates from the input iterable """
555 res = []
556 for el in iterable:
557 if el not in res:
558 res.append(el)
559 return res
d77c3dfd 560
912b38b4 561
55b2f099 562def _htmlentity_transform(entity_with_semicolon):
4e408e47 563 """Transforms an HTML entity to a character."""
55b2f099
YCH
564 entity = entity_with_semicolon[:-1]
565
4e408e47
PH
566 # Known non-numeric HTML entity
567 if entity in compat_html_entities.name2codepoint:
568 return compat_chr(compat_html_entities.name2codepoint[entity])
569
55b2f099
YCH
570 # TODO: HTML5 allows entities without a semicolon. For example,
571 # '&Eacuteric' should be decoded as 'Éric'.
572 if entity_with_semicolon in compat_html_entities_html5:
573 return compat_html_entities_html5[entity_with_semicolon]
574
91757b0f 575 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
4e408e47
PH
576 if mobj is not None:
577 numstr = mobj.group(1)
28e614de 578 if numstr.startswith('x'):
4e408e47 579 base = 16
28e614de 580 numstr = '0%s' % numstr
4e408e47
PH
581 else:
582 base = 10
7aefc49c
S
583 # See https://github.com/rg3/youtube-dl/issues/7518
584 try:
585 return compat_chr(int(numstr, base))
586 except ValueError:
587 pass
4e408e47
PH
588
589 # Unknown entity in name, return its literal representation
7a3f0c00 590 return '&%s;' % entity
4e408e47
PH
591
592
d77c3dfd 593def unescapeHTML(s):
912b38b4
PH
594 if s is None:
595 return None
596 assert type(s) == compat_str
d77c3dfd 597
4e408e47 598 return re.sub(
95f3f7c2 599 r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 600
8bf48f23 601
aa49acd1
S
602def get_subprocess_encoding():
603 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
604 # For subprocess calls, encode with locale encoding
605 # Refer to http://stackoverflow.com/a/9951851/35070
606 encoding = preferredencoding()
607 else:
608 encoding = sys.getfilesystemencoding()
609 if encoding is None:
610 encoding = 'utf-8'
611 return encoding
612
613
8bf48f23 614def encodeFilename(s, for_subprocess=False):
59ae15a5
PH
615 """
616 @param s The name of the file
617 """
d77c3dfd 618
8bf48f23 619 assert type(s) == compat_str
d77c3dfd 620
59ae15a5
PH
621 # Python 3 has a Unicode API
622 if sys.version_info >= (3, 0):
623 return s
0f00efed 624
aa49acd1
S
625 # Pass '' directly to use Unicode APIs on Windows 2000 and up
626 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
627 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
628 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
629 return s
630
8ee239e9
YCH
631 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
632 if sys.platform.startswith('java'):
633 return s
634
aa49acd1
S
635 return s.encode(get_subprocess_encoding(), 'ignore')
636
637
638def decodeFilename(b, for_subprocess=False):
639
640 if sys.version_info >= (3, 0):
641 return b
642
643 if not isinstance(b, bytes):
644 return b
645
646 return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 647
f07b74fc
PH
648
649def encodeArgument(s):
650 if not isinstance(s, compat_str):
651 # Legacy code that uses byte strings
652 # Uncomment the following line after fixing all post processors
7af808a5 653 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
f07b74fc
PH
654 s = s.decode('ascii')
655 return encodeFilename(s, True)
656
657
aa49acd1
S
658def decodeArgument(b):
659 return decodeFilename(b, True)
660
661
8271226a
PH
662def decodeOption(optval):
663 if optval is None:
664 return optval
665 if isinstance(optval, bytes):
666 optval = optval.decode(preferredencoding())
667
668 assert isinstance(optval, compat_str)
669 return optval
1c256f70 670
5f6a1245 671
4539dd30
PH
672def formatSeconds(secs):
673 if secs > 3600:
674 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
675 elif secs > 60:
676 return '%d:%02d' % (secs // 60, secs % 60)
677 else:
678 return '%d' % secs
679
a0ddb8a2 680
be4a824d
PH
681def make_HTTPS_handler(params, **kwargs):
682 opts_no_check_certificate = params.get('nocheckcertificate', False)
0db261ba 683 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
be5f2c19 684 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
0db261ba 685 if opts_no_check_certificate:
be5f2c19 686 context.check_hostname = False
0db261ba 687 context.verify_mode = ssl.CERT_NONE
a2366922 688 try:
be4a824d 689 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
a2366922
PH
690 except TypeError:
691 # Python 2.7.8
692 # (create_default_context present but HTTPSHandler has no context=)
693 pass
694
695 if sys.version_info < (3, 2):
d7932313 696 return YoutubeDLHTTPSHandler(params, **kwargs)
aa37e3d4 697 else: # Python < 3.4
d7932313 698 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
ea6d901e 699 context.verify_mode = (ssl.CERT_NONE
dca08720 700 if opts_no_check_certificate
ea6d901e 701 else ssl.CERT_REQUIRED)
303b479e 702 context.set_default_verify_paths()
be4a824d 703 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 704
732ea2f0 705
08f2a92c
JMF
706def bug_reports_message():
707 if ytdl_is_updateable():
708 update_cmd = 'type youtube-dl -U to update'
709 else:
710 update_cmd = 'see https://yt-dl.org/update on how to update'
711 msg = '; please report this issue on https://yt-dl.org/bug .'
712 msg += ' Make sure you are using the latest version; %s.' % update_cmd
713 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
714 return msg
715
716
bf5b9d85
PM
717class YoutubeDLError(Exception):
718 """Base exception for YoutubeDL errors."""
719 pass
720
721
722class ExtractorError(YoutubeDLError):
1c256f70 723 """Error during info extraction."""
5f6a1245 724
d11271dd 725 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
9a82b238
PH
726 """ tb, if given, is the original traceback (so that it can be printed out).
727 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
728 """
729
730 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
731 expected = True
d11271dd
PH
732 if video_id is not None:
733 msg = video_id + ': ' + msg
410f3e73 734 if cause:
28e614de 735 msg += ' (caused by %r)' % cause
9a82b238 736 if not expected:
08f2a92c 737 msg += bug_reports_message()
1c256f70 738 super(ExtractorError, self).__init__(msg)
d5979c5d 739
1c256f70 740 self.traceback = tb
8cc83b8d 741 self.exc_info = sys.exc_info() # preserve original exception
2eabb802 742 self.cause = cause
d11271dd 743 self.video_id = video_id
1c256f70 744
01951dda
PH
745 def format_traceback(self):
746 if self.traceback is None:
747 return None
28e614de 748 return ''.join(traceback.format_tb(self.traceback))
01951dda 749
1c256f70 750
416c7fcb
PH
751class UnsupportedError(ExtractorError):
752 def __init__(self, url):
753 super(UnsupportedError, self).__init__(
754 'Unsupported URL: %s' % url, expected=True)
755 self.url = url
756
757
55b3e45b
JMF
758class RegexNotFoundError(ExtractorError):
759 """Error when a regex didn't match"""
760 pass
761
762
773f291d
S
763class GeoRestrictedError(ExtractorError):
764 """Geographic restriction Error exception.
765
766 This exception may be thrown when a video is not available from your
767 geographic location due to geographic restrictions imposed by a website.
768 """
769 def __init__(self, msg, countries=None):
770 super(GeoRestrictedError, self).__init__(msg, expected=True)
771 self.msg = msg
772 self.countries = countries
773
774
bf5b9d85 775class DownloadError(YoutubeDLError):
59ae15a5 776 """Download Error exception.
d77c3dfd 777
59ae15a5
PH
778 This exception may be thrown by FileDownloader objects if they are not
779 configured to continue on errors. They will contain the appropriate
780 error message.
781 """
5f6a1245 782
8cc83b8d
FV
783 def __init__(self, msg, exc_info=None):
784 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
785 super(DownloadError, self).__init__(msg)
786 self.exc_info = exc_info
d77c3dfd
FV
787
788
bf5b9d85 789class SameFileError(YoutubeDLError):
59ae15a5 790 """Same File exception.
d77c3dfd 791
59ae15a5
PH
792 This exception will be thrown by FileDownloader objects if they detect
793 multiple files would have to be downloaded to the same file on disk.
794 """
795 pass
d77c3dfd
FV
796
797
bf5b9d85 798class PostProcessingError(YoutubeDLError):
59ae15a5 799 """Post Processing exception.
d77c3dfd 800
59ae15a5
PH
801 This exception may be raised by PostProcessor's .run() method to
802 indicate an error in the postprocessing task.
803 """
5f6a1245 804
7851b379 805 def __init__(self, msg):
bf5b9d85 806 super(PostProcessingError, self).__init__(msg)
7851b379 807 self.msg = msg
d77c3dfd 808
5f6a1245 809
bf5b9d85 810class MaxDownloadsReached(YoutubeDLError):
59ae15a5
PH
811 """ --max-downloads limit has been reached. """
812 pass
d77c3dfd
FV
813
814
bf5b9d85 815class UnavailableVideoError(YoutubeDLError):
59ae15a5 816 """Unavailable Format exception.
d77c3dfd 817
59ae15a5
PH
818 This exception will be thrown when a video is requested
819 in a format that is not available for that video.
820 """
821 pass
d77c3dfd
FV
822
823
bf5b9d85 824class ContentTooShortError(YoutubeDLError):
59ae15a5 825 """Content Too Short exception.
d77c3dfd 826
59ae15a5
PH
827 This exception may be raised by FileDownloader objects when a file they
828 download is too small for what the server announced first, indicating
829 the connection was probably interrupted.
830 """
d77c3dfd 831
59ae15a5 832 def __init__(self, downloaded, expected):
bf5b9d85
PM
833 super(ContentTooShortError, self).__init__(
834 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
835 )
2c7ed247 836 # Both in bytes
59ae15a5
PH
837 self.downloaded = downloaded
838 self.expected = expected
d77c3dfd 839
5f6a1245 840
bf5b9d85 841class XAttrMetadataError(YoutubeDLError):
efa97bdc
YCH
842 def __init__(self, code=None, msg='Unknown error'):
843 super(XAttrMetadataError, self).__init__(msg)
844 self.code = code
bd264412 845 self.msg = msg
efa97bdc
YCH
846
847 # Parsing code and msg
848 if (self.code in (errno.ENOSPC, errno.EDQUOT) or
849 'No space left' in self.msg or 'Disk quota excedded' in self.msg):
850 self.reason = 'NO_SPACE'
851 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
852 self.reason = 'VALUE_TOO_LONG'
853 else:
854 self.reason = 'NOT_SUPPORTED'
855
856
bf5b9d85 857class XAttrUnavailableError(YoutubeDLError):
efa97bdc
YCH
858 pass
859
860
c5a59d93 861def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
e5e78797
S
862 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
863 # expected HTTP responses to meet HTTP/1.0 or later (see also
864 # https://github.com/rg3/youtube-dl/issues/6727)
865 if sys.version_info < (3, 0):
5a1a2e94 866 kwargs[b'strict'] = True
be4a824d
PH
867 hc = http_class(*args, **kwargs)
868 source_address = ydl_handler._params.get('source_address')
869 if source_address is not None:
870 sa = (source_address, 0)
871 if hasattr(hc, 'source_address'): # Python 2.7+
872 hc.source_address = sa
873 else: # Python 2.6
874 def _hc_connect(self, *args, **kwargs):
875 sock = compat_socket_create_connection(
876 (self.host, self.port), self.timeout, sa)
877 if is_https:
d7932313
PH
878 self.sock = ssl.wrap_socket(
879 sock, self.key_file, self.cert_file,
880 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
881 else:
882 self.sock = sock
883 hc.connect = functools.partial(_hc_connect, hc)
884
885 return hc
886
887
87f0e62d 888def handle_youtubedl_headers(headers):
992fc9d6
YCH
889 filtered_headers = headers
890
891 if 'Youtubedl-no-compression' in filtered_headers:
892 filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
87f0e62d 893 del filtered_headers['Youtubedl-no-compression']
87f0e62d 894
992fc9d6 895 return filtered_headers
87f0e62d
YCH
896
897
acebc9cd 898class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
59ae15a5
PH
899 """Handler for HTTP requests and responses.
900
901 This class, when installed with an OpenerDirector, automatically adds
902 the standard headers to every HTTP request and handles gzipped and
903 deflated responses from web servers. If compression is to be avoided in
904 a particular request, the original request in the program code only has
0424ec30 905 to include the HTTP header "Youtubedl-no-compression", which will be
59ae15a5
PH
906 removed before making the real request.
907
908 Part of this code was copied from:
909
910 http://techknack.net/python-urllib2-handlers/
911
912 Andrew Rowls, the author of that code, agreed to release it to the
913 public domain.
914 """
915
be4a824d
PH
916 def __init__(self, params, *args, **kwargs):
917 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
918 self._params = params
919
920 def http_open(self, req):
71aff188
YCH
921 conn_class = compat_http_client.HTTPConnection
922
923 socks_proxy = req.headers.get('Ytdl-socks-proxy')
924 if socks_proxy:
925 conn_class = make_socks_conn_class(conn_class, socks_proxy)
926 del req.headers['Ytdl-socks-proxy']
927
be4a824d 928 return self.do_open(functools.partial(
71aff188 929 _create_http_connection, self, conn_class, False),
be4a824d
PH
930 req)
931
59ae15a5
PH
932 @staticmethod
933 def deflate(data):
934 try:
935 return zlib.decompress(data, -zlib.MAX_WBITS)
936 except zlib.error:
937 return zlib.decompress(data)
938
acebc9cd 939 def http_request(self, req):
51f267d9
S
940 # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
941 # always respected by websites, some tend to give out URLs with non percent-encoded
942 # non-ASCII characters (see telemb.py, ard.py [#3412])
943 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
944 # To work around aforementioned issue we will replace request's original URL with
945 # percent-encoded one
946 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
947 # the code of this workaround has been moved here from YoutubeDL.urlopen()
948 url = req.get_full_url()
949 url_escaped = escape_url(url)
950
951 # Substitute URL if any change after escaping
952 if url != url_escaped:
15d260eb 953 req = update_Request(req, url=url_escaped)
51f267d9 954
33ac271b 955 for h, v in std_headers.items():
3d5f7a39
JK
956 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
957 # The dict keys are capitalized because of this bug by urllib
958 if h.capitalize() not in req.headers:
33ac271b 959 req.add_header(h, v)
87f0e62d
YCH
960
961 req.headers = handle_youtubedl_headers(req.headers)
989b4b2b
PH
962
963 if sys.version_info < (2, 7) and '#' in req.get_full_url():
964 # Python 2.6 is brain-dead when it comes to fragments
965 req._Request__original = req._Request__original.partition('#')[0]
966 req._Request__r_type = req._Request__r_type.partition('#')[0]
967
59ae15a5
PH
968 return req
969
acebc9cd 970 def http_response(self, req, resp):
59ae15a5
PH
971 old_resp = resp
972 # gzip
973 if resp.headers.get('Content-encoding', '') == 'gzip':
aa3e9507
PH
974 content = resp.read()
975 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
976 try:
977 uncompressed = io.BytesIO(gz.read())
978 except IOError as original_ioerror:
979 # There may be junk add the end of the file
980 # See http://stackoverflow.com/q/4928560/35070 for details
981 for i in range(1, 1024):
982 try:
983 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
984 uncompressed = io.BytesIO(gz.read())
985 except IOError:
986 continue
987 break
988 else:
989 raise original_ioerror
b407d853 990 resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 991 resp.msg = old_resp.msg
c047270c 992 del resp.headers['Content-encoding']
59ae15a5
PH
993 # deflate
994 if resp.headers.get('Content-encoding', '') == 'deflate':
995 gz = io.BytesIO(self.deflate(resp.read()))
b407d853 996 resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 997 resp.msg = old_resp.msg
c047270c 998 del resp.headers['Content-encoding']
ad729172
S
999 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
1000 # https://github.com/rg3/youtube-dl/issues/6457).
5a4d9ddb
S
1001 if 300 <= resp.code < 400:
1002 location = resp.headers.get('Location')
1003 if location:
1004 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
1005 if sys.version_info >= (3, 0):
1006 location = location.encode('iso-8859-1').decode('utf-8')
0ea59007
YCH
1007 else:
1008 location = location.decode('utf-8')
5a4d9ddb
S
1009 location_escaped = escape_url(location)
1010 if location != location_escaped:
1011 del resp.headers['Location']
9a4aec8b
YCH
1012 if sys.version_info < (3, 0):
1013 location_escaped = location_escaped.encode('utf-8')
5a4d9ddb 1014 resp.headers['Location'] = location_escaped
59ae15a5 1015 return resp
0f8d03f8 1016
acebc9cd
PH
1017 https_request = http_request
1018 https_response = http_response
bf50b038 1019
5de90176 1020
71aff188
YCH
1021def make_socks_conn_class(base_class, socks_proxy):
1022 assert issubclass(base_class, (
1023 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
1024
1025 url_components = compat_urlparse.urlparse(socks_proxy)
1026 if url_components.scheme.lower() == 'socks5':
1027 socks_type = ProxyType.SOCKS5
1028 elif url_components.scheme.lower() in ('socks', 'socks4'):
1029 socks_type = ProxyType.SOCKS4
51fb4995
YCH
1030 elif url_components.scheme.lower() == 'socks4a':
1031 socks_type = ProxyType.SOCKS4A
71aff188 1032
cdd94c2e
YCH
1033 def unquote_if_non_empty(s):
1034 if not s:
1035 return s
1036 return compat_urllib_parse_unquote_plus(s)
1037
71aff188
YCH
1038 proxy_args = (
1039 socks_type,
1040 url_components.hostname, url_components.port or 1080,
1041 True, # Remote DNS
cdd94c2e
YCH
1042 unquote_if_non_empty(url_components.username),
1043 unquote_if_non_empty(url_components.password),
71aff188
YCH
1044 )
1045
1046 class SocksConnection(base_class):
1047 def connect(self):
1048 self.sock = sockssocket()
1049 self.sock.setproxy(*proxy_args)
1050 if type(self.timeout) in (int, float):
1051 self.sock.settimeout(self.timeout)
1052 self.sock.connect((self.host, self.port))
1053
1054 if isinstance(self, compat_http_client.HTTPSConnection):
1055 if hasattr(self, '_context'): # Python > 2.6
1056 self.sock = self._context.wrap_socket(
1057 self.sock, server_hostname=self.host)
1058 else:
1059 self.sock = ssl.wrap_socket(self.sock)
1060
1061 return SocksConnection
1062
1063
be4a824d
PH
1064class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1065 def __init__(self, params, https_conn_class=None, *args, **kwargs):
1066 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1067 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1068 self._params = params
1069
1070 def https_open(self, req):
4f264c02 1071 kwargs = {}
71aff188
YCH
1072 conn_class = self._https_conn_class
1073
4f264c02
JMF
1074 if hasattr(self, '_context'): # python > 2.6
1075 kwargs['context'] = self._context
1076 if hasattr(self, '_check_hostname'): # python 3.x
1077 kwargs['check_hostname'] = self._check_hostname
71aff188
YCH
1078
1079 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1080 if socks_proxy:
1081 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1082 del req.headers['Ytdl-socks-proxy']
1083
be4a824d 1084 return self.do_open(functools.partial(
71aff188 1085 _create_http_connection, self, conn_class, True),
4f264c02 1086 req, **kwargs)
be4a824d
PH
1087
1088
a6420bf5
S
1089class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1090 def __init__(self, cookiejar=None):
1091 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1092
1093 def http_response(self, request, response):
1094 # Python 2 will choke on next HTTP request in row if there are non-ASCII
1095 # characters in Set-Cookie HTTP header of last response (see
1096 # https://github.com/rg3/youtube-dl/issues/6769).
1097 # In order to at least prevent crashing we will percent encode Set-Cookie
1098 # header before HTTPCookieProcessor starts processing it.
e28034c5
S
1099 # if sys.version_info < (3, 0) and response.headers:
1100 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1101 # set_cookie = response.headers.get(set_cookie_header)
1102 # if set_cookie:
1103 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1104 # if set_cookie != set_cookie_escaped:
1105 # del response.headers[set_cookie_header]
1106 # response.headers[set_cookie_header] = set_cookie_escaped
a6420bf5
S
1107 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1108
1109 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1110 https_response = http_response
1111
1112
46f59e89
S
1113def extract_timezone(date_str):
1114 m = re.search(
1115 r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
1116 date_str)
1117 if not m:
1118 timezone = datetime.timedelta()
1119 else:
1120 date_str = date_str[:-len(m.group('tz'))]
1121 if not m.group('sign'):
1122 timezone = datetime.timedelta()
1123 else:
1124 sign = 1 if m.group('sign') == '+' else -1
1125 timezone = datetime.timedelta(
1126 hours=sign * int(m.group('hours')),
1127 minutes=sign * int(m.group('minutes')))
1128 return timezone, date_str
1129
1130
08b38d54 1131def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1132 """ Return a UNIX timestamp from the given date """
1133
1134 if date_str is None:
1135 return None
1136
52c3a6e4
S
1137 date_str = re.sub(r'\.[0-9]+', '', date_str)
1138
08b38d54 1139 if timezone is None:
46f59e89
S
1140 timezone, date_str = extract_timezone(date_str)
1141
52c3a6e4
S
1142 try:
1143 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1144 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1145 return calendar.timegm(dt.timetuple())
1146 except ValueError:
1147 pass
912b38b4
PH
1148
1149
46f59e89
S
1150def date_formats(day_first=True):
1151 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1152
1153
42bdd9d0 1154def unified_strdate(date_str, day_first=True):
bf50b038 1155 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1156
1157 if date_str is None:
1158 return None
bf50b038 1159 upload_date = None
5f6a1245 1160 # Replace commas
026fcc04 1161 date_str = date_str.replace(',', ' ')
42bdd9d0 1162 # Remove AM/PM + timezone
9bb8e0a3 1163 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1164 _, date_str = extract_timezone(date_str)
42bdd9d0 1165
46f59e89 1166 for expression in date_formats(day_first):
bf50b038
JMF
1167 try:
1168 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
5de90176 1169 except ValueError:
bf50b038 1170 pass
42393ce2
PH
1171 if upload_date is None:
1172 timetuple = email.utils.parsedate_tz(date_str)
1173 if timetuple:
c6b9cf05
S
1174 try:
1175 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1176 except ValueError:
1177 pass
6a750402
JMF
1178 if upload_date is not None:
1179 return compat_str(upload_date)
bf50b038 1180
5f6a1245 1181
46f59e89
S
1182def unified_timestamp(date_str, day_first=True):
1183 if date_str is None:
1184 return None
1185
2ae2ffda 1186 date_str = re.sub(r'[,|]', '', date_str)
46f59e89 1187
7dc2a74e 1188 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1189 timezone, date_str = extract_timezone(date_str)
1190
1191 # Remove AM/PM + timezone
1192 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1193
deef3195
S
1194 # Remove unrecognized timezones from ISO 8601 alike timestamps
1195 m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
1196 if m:
1197 date_str = date_str[:-len(m.group('tz'))]
1198
46f59e89
S
1199 for expression in date_formats(day_first):
1200 try:
7dc2a74e 1201 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89
S
1202 return calendar.timegm(dt.timetuple())
1203 except ValueError:
1204 pass
1205 timetuple = email.utils.parsedate_tz(date_str)
1206 if timetuple:
7dc2a74e 1207 return calendar.timegm(timetuple) + pm_delta * 3600
46f59e89
S
1208
1209
28e614de 1210def determine_ext(url, default_ext='unknown_video'):
f4776371
S
1211 if url is None:
1212 return default_ext
9cb9a5df 1213 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1214 if re.match(r'^[A-Za-z0-9]+$', guess):
1215 return guess
a7aaa398
S
1216 # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
1217 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1218 return guess.rstrip('/')
73e79f2a 1219 else:
cbdbb766 1220 return default_ext
73e79f2a 1221
5f6a1245 1222
d4051a8e 1223def subtitles_filename(filename, sub_lang, sub_format):
28e614de 1224 return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
d4051a8e 1225
5f6a1245 1226
bd558525 1227def date_from_str(date_str):
37254abc
JMF
1228 """
1229 Return a datetime object from a string in the format YYYYMMDD or
1230 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1231 today = datetime.date.today()
f8795e10 1232 if date_str in ('now', 'today'):
37254abc 1233 return today
f8795e10
PH
1234 if date_str == 'yesterday':
1235 return today - datetime.timedelta(days=1)
ec85ded8 1236 match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
37254abc
JMF
1237 if match is not None:
1238 sign = match.group('sign')
1239 time = int(match.group('time'))
1240 if sign == '-':
1241 time = -time
1242 unit = match.group('unit')
dfb1b146 1243 # A bad approximation?
37254abc
JMF
1244 if unit == 'month':
1245 unit = 'day'
1246 time *= 30
1247 elif unit == 'year':
1248 unit = 'day'
1249 time *= 365
1250 unit += 's'
1251 delta = datetime.timedelta(**{unit: time})
1252 return today + delta
611c1dd9 1253 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
5f6a1245
JW
1254
1255
e63fc1be 1256def hyphenate_date(date_str):
1257 """
1258 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1259 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1260 if match is not None:
1261 return '-'.join(match.groups())
1262 else:
1263 return date_str
1264
5f6a1245 1265
bd558525
JMF
1266class DateRange(object):
1267 """Represents a time interval between two dates"""
5f6a1245 1268
bd558525
JMF
1269 def __init__(self, start=None, end=None):
1270 """start and end must be strings in the format accepted by date"""
1271 if start is not None:
1272 self.start = date_from_str(start)
1273 else:
1274 self.start = datetime.datetime.min.date()
1275 if end is not None:
1276 self.end = date_from_str(end)
1277 else:
1278 self.end = datetime.datetime.max.date()
37254abc 1279 if self.start > self.end:
bd558525 1280 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
5f6a1245 1281
bd558525
JMF
1282 @classmethod
1283 def day(cls, day):
1284 """Returns a range that only contains the given day"""
5f6a1245
JW
1285 return cls(day, day)
1286
bd558525
JMF
1287 def __contains__(self, date):
1288 """Check if the date is in the range"""
37254abc
JMF
1289 if not isinstance(date, datetime.date):
1290 date = date_from_str(date)
1291 return self.start <= date <= self.end
5f6a1245 1292
bd558525 1293 def __str__(self):
5f6a1245 1294 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
1295
1296
1297def platform_name():
1298 """ Returns the platform name as a compat_str """
1299 res = platform.platform()
1300 if isinstance(res, bytes):
1301 res = res.decode(preferredencoding())
1302
1303 assert isinstance(res, compat_str)
1304 return res
c257baff
PH
1305
1306
b58ddb32
PH
1307def _windows_write_string(s, out):
1308 """ Returns True if the string was written using special methods,
1309 False if it has yet to be written out."""
1310 # Adapted from http://stackoverflow.com/a/3259271/35070
1311
1312 import ctypes
1313 import ctypes.wintypes
1314
1315 WIN_OUTPUT_IDS = {
1316 1: -11,
1317 2: -12,
1318 }
1319
a383a98a
PH
1320 try:
1321 fileno = out.fileno()
1322 except AttributeError:
1323 # If the output stream doesn't have a fileno, it's virtual
1324 return False
aa42e873
PH
1325 except io.UnsupportedOperation:
1326 # Some strange Windows pseudo files?
1327 return False
b58ddb32
PH
1328 if fileno not in WIN_OUTPUT_IDS:
1329 return False
1330
e2f89ec7 1331 GetStdHandle = ctypes.WINFUNCTYPE(
b58ddb32 1332 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
611c1dd9 1333 (b'GetStdHandle', ctypes.windll.kernel32))
b58ddb32
PH
1334 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1335
e2f89ec7 1336 WriteConsoleW = ctypes.WINFUNCTYPE(
b58ddb32
PH
1337 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1338 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
611c1dd9 1339 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
b58ddb32
PH
1340 written = ctypes.wintypes.DWORD(0)
1341
611c1dd9 1342 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
b58ddb32
PH
1343 FILE_TYPE_CHAR = 0x0002
1344 FILE_TYPE_REMOTE = 0x8000
e2f89ec7 1345 GetConsoleMode = ctypes.WINFUNCTYPE(
b58ddb32
PH
1346 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1347 ctypes.POINTER(ctypes.wintypes.DWORD))(
611c1dd9 1348 (b'GetConsoleMode', ctypes.windll.kernel32))
b58ddb32
PH
1349 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1350
1351 def not_a_console(handle):
1352 if handle == INVALID_HANDLE_VALUE or handle is None:
1353 return True
8fb3ac36
PH
1354 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1355 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
b58ddb32
PH
1356
1357 if not_a_console(h):
1358 return False
1359
d1b9c912
PH
1360 def next_nonbmp_pos(s):
1361 try:
1362 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1363 except StopIteration:
1364 return len(s)
1365
1366 while s:
1367 count = min(next_nonbmp_pos(s), 1024)
1368
b58ddb32 1369 ret = WriteConsoleW(
d1b9c912 1370 h, s, count if count else 2, ctypes.byref(written), None)
b58ddb32
PH
1371 if ret == 0:
1372 raise OSError('Failed to write string')
d1b9c912
PH
1373 if not count: # We just wrote a non-BMP character
1374 assert written.value == 2
1375 s = s[1:]
1376 else:
1377 assert written.value > 0
1378 s = s[written.value:]
b58ddb32
PH
1379 return True
1380
1381
734f90bb 1382def write_string(s, out=None, encoding=None):
7459e3a2
PH
1383 if out is None:
1384 out = sys.stderr
8bf48f23 1385 assert type(s) == compat_str
7459e3a2 1386
b58ddb32
PH
1387 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1388 if _windows_write_string(s, out):
1389 return
1390
7459e3a2
PH
1391 if ('b' in getattr(out, 'mode', '') or
1392 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
104aa738
PH
1393 byt = s.encode(encoding or preferredencoding(), 'ignore')
1394 out.write(byt)
1395 elif hasattr(out, 'buffer'):
1396 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1397 byt = s.encode(enc, 'ignore')
1398 out.buffer.write(byt)
1399 else:
8bf48f23 1400 out.write(s)
7459e3a2
PH
1401 out.flush()
1402
1403
48ea9cea
PH
1404def bytes_to_intlist(bs):
1405 if not bs:
1406 return []
1407 if isinstance(bs[0], int): # Python 3
1408 return list(bs)
1409 else:
1410 return [ord(c) for c in bs]
1411
c257baff 1412
cba892fa 1413def intlist_to_bytes(xs):
1414 if not xs:
1415 return b''
edaa23f8 1416 return compat_struct_pack('%dB' % len(xs), *xs)
c38b1e77
PH
1417
1418
c1c9a79c
PH
1419# Cross-platform file locking
1420if sys.platform == 'win32':
1421 import ctypes.wintypes
1422 import msvcrt
1423
1424 class OVERLAPPED(ctypes.Structure):
1425 _fields_ = [
1426 ('Internal', ctypes.wintypes.LPVOID),
1427 ('InternalHigh', ctypes.wintypes.LPVOID),
1428 ('Offset', ctypes.wintypes.DWORD),
1429 ('OffsetHigh', ctypes.wintypes.DWORD),
1430 ('hEvent', ctypes.wintypes.HANDLE),
1431 ]
1432
1433 kernel32 = ctypes.windll.kernel32
1434 LockFileEx = kernel32.LockFileEx
1435 LockFileEx.argtypes = [
1436 ctypes.wintypes.HANDLE, # hFile
1437 ctypes.wintypes.DWORD, # dwFlags
1438 ctypes.wintypes.DWORD, # dwReserved
1439 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1440 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1441 ctypes.POINTER(OVERLAPPED) # Overlapped
1442 ]
1443 LockFileEx.restype = ctypes.wintypes.BOOL
1444 UnlockFileEx = kernel32.UnlockFileEx
1445 UnlockFileEx.argtypes = [
1446 ctypes.wintypes.HANDLE, # hFile
1447 ctypes.wintypes.DWORD, # dwReserved
1448 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1449 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1450 ctypes.POINTER(OVERLAPPED) # Overlapped
1451 ]
1452 UnlockFileEx.restype = ctypes.wintypes.BOOL
1453 whole_low = 0xffffffff
1454 whole_high = 0x7fffffff
1455
1456 def _lock_file(f, exclusive):
1457 overlapped = OVERLAPPED()
1458 overlapped.Offset = 0
1459 overlapped.OffsetHigh = 0
1460 overlapped.hEvent = 0
1461 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1462 handle = msvcrt.get_osfhandle(f.fileno())
1463 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1464 whole_low, whole_high, f._lock_file_overlapped_p):
1465 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1466
1467 def _unlock_file(f):
1468 assert f._lock_file_overlapped_p
1469 handle = msvcrt.get_osfhandle(f.fileno())
1470 if not UnlockFileEx(handle, 0,
1471 whole_low, whole_high, f._lock_file_overlapped_p):
1472 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1473
1474else:
399a76e6
YCH
1475 # Some platforms, such as Jython, is missing fcntl
1476 try:
1477 import fcntl
c1c9a79c 1478
399a76e6
YCH
1479 def _lock_file(f, exclusive):
1480 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
c1c9a79c 1481
399a76e6
YCH
1482 def _unlock_file(f):
1483 fcntl.flock(f, fcntl.LOCK_UN)
1484 except ImportError:
1485 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1486
1487 def _lock_file(f, exclusive):
1488 raise IOError(UNSUPPORTED_MSG)
1489
1490 def _unlock_file(f):
1491 raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
1492
1493
1494class locked_file(object):
1495 def __init__(self, filename, mode, encoding=None):
1496 assert mode in ['r', 'a', 'w']
1497 self.f = io.open(filename, mode, encoding=encoding)
1498 self.mode = mode
1499
1500 def __enter__(self):
1501 exclusive = self.mode != 'r'
1502 try:
1503 _lock_file(self.f, exclusive)
1504 except IOError:
1505 self.f.close()
1506 raise
1507 return self
1508
1509 def __exit__(self, etype, value, traceback):
1510 try:
1511 _unlock_file(self.f)
1512 finally:
1513 self.f.close()
1514
1515 def __iter__(self):
1516 return iter(self.f)
1517
1518 def write(self, *args):
1519 return self.f.write(*args)
1520
1521 def read(self, *args):
1522 return self.f.read(*args)
4eb7f1d1
JMF
1523
1524
4644ac55
S
1525def get_filesystem_encoding():
1526 encoding = sys.getfilesystemencoding()
1527 return encoding if encoding is not None else 'utf-8'
1528
1529
4eb7f1d1 1530def shell_quote(args):
a6a173c2 1531 quoted_args = []
4644ac55 1532 encoding = get_filesystem_encoding()
a6a173c2
JMF
1533 for a in args:
1534 if isinstance(a, bytes):
1535 # We may get a filename encoded with 'encodeFilename'
1536 a = a.decode(encoding)
aefce8e6 1537 quoted_args.append(compat_shlex_quote(a))
28e614de 1538 return ' '.join(quoted_args)
9d4660ca
PH
1539
1540
1541def smuggle_url(url, data):
1542 """ Pass additional data in a URL for internal use. """
1543
81953d1a
RA
1544 url, idata = unsmuggle_url(url, {})
1545 data.update(idata)
15707c7e 1546 sdata = compat_urllib_parse_urlencode(
28e614de
PH
1547 {'__youtubedl_smuggle': json.dumps(data)})
1548 return url + '#' + sdata
9d4660ca
PH
1549
1550
79f82953 1551def unsmuggle_url(smug_url, default=None):
83e865a3 1552 if '#__youtubedl_smuggle' not in smug_url:
79f82953 1553 return smug_url, default
28e614de
PH
1554 url, _, sdata = smug_url.rpartition('#')
1555 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
1556 data = json.loads(jsond)
1557 return url, data
02dbf93f
PH
1558
1559
02dbf93f
PH
1560def format_bytes(bytes):
1561 if bytes is None:
28e614de 1562 return 'N/A'
02dbf93f
PH
1563 if type(bytes) is str:
1564 bytes = float(bytes)
1565 if bytes == 0.0:
1566 exponent = 0
1567 else:
1568 exponent = int(math.log(bytes, 1024.0))
28e614de 1569 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
02dbf93f 1570 converted = float(bytes) / float(1024 ** exponent)
28e614de 1571 return '%.2f%s' % (converted, suffix)
f53c966a 1572
1c088fa8 1573
fb47597b
S
1574def lookup_unit_table(unit_table, s):
1575 units_re = '|'.join(re.escape(u) for u in unit_table)
1576 m = re.match(
782b1b5b 1577 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
fb47597b
S
1578 if not m:
1579 return None
1580 num_str = m.group('num').replace(',', '.')
1581 mult = unit_table[m.group('unit')]
1582 return int(float(num_str) * mult)
1583
1584
be64b5b0
PH
1585def parse_filesize(s):
1586 if s is None:
1587 return None
1588
dfb1b146 1589 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
1590 # but we support those too
1591 _UNIT_TABLE = {
1592 'B': 1,
1593 'b': 1,
70852b47 1594 'bytes': 1,
be64b5b0
PH
1595 'KiB': 1024,
1596 'KB': 1000,
1597 'kB': 1024,
1598 'Kb': 1000,
13585d76 1599 'kb': 1000,
70852b47
YCH
1600 'kilobytes': 1000,
1601 'kibibytes': 1024,
be64b5b0
PH
1602 'MiB': 1024 ** 2,
1603 'MB': 1000 ** 2,
1604 'mB': 1024 ** 2,
1605 'Mb': 1000 ** 2,
13585d76 1606 'mb': 1000 ** 2,
70852b47
YCH
1607 'megabytes': 1000 ** 2,
1608 'mebibytes': 1024 ** 2,
be64b5b0
PH
1609 'GiB': 1024 ** 3,
1610 'GB': 1000 ** 3,
1611 'gB': 1024 ** 3,
1612 'Gb': 1000 ** 3,
13585d76 1613 'gb': 1000 ** 3,
70852b47
YCH
1614 'gigabytes': 1000 ** 3,
1615 'gibibytes': 1024 ** 3,
be64b5b0
PH
1616 'TiB': 1024 ** 4,
1617 'TB': 1000 ** 4,
1618 'tB': 1024 ** 4,
1619 'Tb': 1000 ** 4,
13585d76 1620 'tb': 1000 ** 4,
70852b47
YCH
1621 'terabytes': 1000 ** 4,
1622 'tebibytes': 1024 ** 4,
be64b5b0
PH
1623 'PiB': 1024 ** 5,
1624 'PB': 1000 ** 5,
1625 'pB': 1024 ** 5,
1626 'Pb': 1000 ** 5,
13585d76 1627 'pb': 1000 ** 5,
70852b47
YCH
1628 'petabytes': 1000 ** 5,
1629 'pebibytes': 1024 ** 5,
be64b5b0
PH
1630 'EiB': 1024 ** 6,
1631 'EB': 1000 ** 6,
1632 'eB': 1024 ** 6,
1633 'Eb': 1000 ** 6,
13585d76 1634 'eb': 1000 ** 6,
70852b47
YCH
1635 'exabytes': 1000 ** 6,
1636 'exbibytes': 1024 ** 6,
be64b5b0
PH
1637 'ZiB': 1024 ** 7,
1638 'ZB': 1000 ** 7,
1639 'zB': 1024 ** 7,
1640 'Zb': 1000 ** 7,
13585d76 1641 'zb': 1000 ** 7,
70852b47
YCH
1642 'zettabytes': 1000 ** 7,
1643 'zebibytes': 1024 ** 7,
be64b5b0
PH
1644 'YiB': 1024 ** 8,
1645 'YB': 1000 ** 8,
1646 'yB': 1024 ** 8,
1647 'Yb': 1000 ** 8,
13585d76 1648 'yb': 1000 ** 8,
70852b47
YCH
1649 'yottabytes': 1000 ** 8,
1650 'yobibytes': 1024 ** 8,
be64b5b0
PH
1651 }
1652
fb47597b
S
1653 return lookup_unit_table(_UNIT_TABLE, s)
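# Illustrative values (both decimal and binary prefixes resolve via the table above):
#   parse_filesize('5 MiB')   # -> 5242880
#   parse_filesize('1.5 GB')  # -> 1500000000
#   parse_filesize('123')     # -> None (no unit given)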
1654
1655
1656def parse_count(s):
1657 if s is None:
be64b5b0
PH
1658 return None
1659
fb47597b
S
1660 s = s.strip()
1661
1662 if re.match(r'^[\d,.]+$', s):
1663 return str_to_int(s)
1664
1665 _UNIT_TABLE = {
1666 'k': 1000,
1667 'K': 1000,
1668 'm': 1000 ** 2,
1669 'M': 1000 ** 2,
1670 'kk': 1000 ** 2,
1671 'KK': 1000 ** 2,
1672 }
be64b5b0 1673
fb47597b 1674 return lookup_unit_table(_UNIT_TABLE, s)
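# For example:
#   parse_count('1,234')  # -> 1234
#   parse_count('2.5M')   # -> 2500000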
be64b5b0 1675
2f7ae819 1676
a942d6cb 1677def month_by_name(name, lang='en'):
caefb1de
PH
1678 """ Return the number of a month by (locale-independently) English name """
1679
f6717dec 1680 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 1681
caefb1de 1682 try:
f6717dec 1683 return month_names.index(name) + 1
7105440c
YCH
1684 except ValueError:
1685 return None
1686
1687
1688def month_by_abbreviation(abbrev):
1689 """ Return the number of a month by (locale-independently) English
1690 abbreviations """
1691
1692 try:
1693 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
1694 except ValueError:
1695 return None
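# For example:
#   month_by_name('September')        # -> 9
#   month_by_name('août', lang='fr')  # -> 8
#   month_by_abbreviation('Mar')      # -> 3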
18258362
JMF
1696
1697
5aafe895 1698def fix_xml_ampersands(xml_str):
18258362 1699 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
1700 return re.sub(
1701 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 1702 '&amp;',
5aafe895 1703 xml_str)
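# Only bare ampersands are touched; existing entities are left alone, e.g.:
#   fix_xml_ampersands('<a href="?x=1&y=2">Tom &amp; Jerry</a>')
#   # -> '<a href="?x=1&amp;y=2">Tom &amp; Jerry</a>'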
e3946f98
PH
1704
1705
1706def setproctitle(title):
8bf48f23 1707 assert isinstance(title, compat_str)
c1c05c67
YCH
1708
1709 # ctypes in Jython is not complete
1710 # http://bugs.jython.org/issue2148
1711 if sys.platform.startswith('java'):
1712 return
1713
e3946f98 1714 try:
611c1dd9 1715 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
1716 except OSError:
1717 return
2f49bcd6
RC
1718 except TypeError:
1719 # LoadLibrary in Windows Python 2.7.13 only expects
1720 # a bytestring, but since unicode_literals turns
1721 # every string into a unicode string, it fails.
1722 return
6eefe533
PH
1723 title_bytes = title.encode('utf-8')
1724 buf = ctypes.create_string_buffer(len(title_bytes))
1725 buf.value = title_bytes
e3946f98 1726 try:
6eefe533 1727 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
1728 except AttributeError:
1729 return # Strange libc, just skip this
d7dda168
PH
1730
1731
1732def remove_start(s, start):
46bc9b7d 1733 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
1734
1735
2b9faf55 1736def remove_end(s, end):
46bc9b7d 1737 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
1738
1739
31b2051e
S
1740def remove_quotes(s):
1741 if s is None or len(s) < 2:
1742 return s
1743 for quote in ('"', "'", ):
1744 if s[0] == quote and s[-1] == quote:
1745 return s[1:-1]
1746 return s
1747
1748
29eb5174 1749def url_basename(url):
9b8aaeed 1750 path = compat_urlparse.urlparse(url).path
28e614de 1751 return path.strip('/').split('/')[-1]
aa94a6d3
PH
1752
1753
02dc0a36
S
1754def base_url(url):
1755 return re.match(r'https?://[^?#&]+/', url).group()
1756
1757
e34c3361 1758def urljoin(base, path):
4b5de77b
S
1759 if isinstance(path, bytes):
1760 path = path.decode('utf-8')
e34c3361
S
1761 if not isinstance(path, compat_str) or not path:
1762 return None
b0c65c67 1763 if re.match(r'^(?:https?:)?//', path):
e34c3361 1764 return path
4b5de77b
S
1765 if isinstance(base, bytes):
1766 base = base.decode('utf-8')
1767 if not isinstance(base, compat_str) or not re.match(
1768 r'^(?:https?:)?//', base):
e34c3361
S
1769 return None
1770 return compat_urlparse.urljoin(base, path)
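# Illustrative behaviour (URLs are made up):
#   urljoin('http://example.com/a/b', 'c/d')                # -> 'http://example.com/a/c/d'
#   urljoin('http://example.com/', '//cdn.example.com/x')   # -> '//cdn.example.com/x' (already absolute)
#   urljoin(None, 'c/d')                                    # -> None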
1771
1772
aa94a6d3
PH
1773class HEADRequest(compat_urllib_request.Request):
1774 def get_method(self):
611c1dd9 1775 return 'HEAD'
7217e148
PH
1776
1777
95cf60e8
S
1778class PUTRequest(compat_urllib_request.Request):
1779 def get_method(self):
1780 return 'PUT'
1781
1782
9732d77e 1783def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
28746fbd
PH
1784 if get_attr:
1785 if v is not None:
1786 v = getattr(v, get_attr, None)
9572013d
PH
1787 if v == '':
1788 v = None
1812afb7
S
1789 if v is None:
1790 return default
1791 try:
1792 return int(v) * invscale // scale
1793 except ValueError:
af98f8ff 1794 return default
9732d77e 1795
9572013d 1796
40a90862
JMF
1797def str_or_none(v, default=None):
1798 return default if v is None else compat_str(v)
1799
9732d77e
PH
1800
1801def str_to_int(int_str):
48d4681e 1802 """ A more relaxed version of int_or_none """
9732d77e
PH
1803 if int_str is None:
1804 return None
28e614de 1805 int_str = re.sub(r'[,\.\+]', '', int_str)
9732d77e 1806 return int(int_str)
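# For example:
#   int_or_none('1080')     # -> 1080
#   int_or_none('')         # -> None
#   str_to_int('123,456.')  # -> 123456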
608d11f5
PH
1807
1808
9732d77e 1809def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
1810 if v is None:
1811 return default
1812 try:
1813 return float(v) * invscale / scale
1814 except ValueError:
1815 return default
43f775e4
PH
1816
1817
c7e327c4
S
1818def bool_or_none(v, default=None):
1819 return v if isinstance(v, bool) else default
1820
1821
b72b4431
S
1822def strip_or_none(v):
1823 return None if v is None else v.strip()
1824
1825
608d11f5 1826def parse_duration(s):
8f9312c3 1827 if not isinstance(s, compat_basestring):
608d11f5
PH
1828 return None
1829
ca7b3246
S
1830 s = s.strip()
1831
acaff495 1832 days, hours, mins, secs, ms = [None] * 5
15846398 1833 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
acaff495 1834 if m:
1835 days, hours, mins, secs, ms = m.groups()
1836 else:
1837 m = re.match(
056653bb
S
1838 r'''(?ix)(?:P?
1839 (?:
1840 [0-9]+\s*y(?:ears?)?\s*
1841 )?
1842 (?:
1843 [0-9]+\s*m(?:onths?)?\s*
1844 )?
1845 (?:
1846 [0-9]+\s*w(?:eeks?)?\s*
1847 )?
8f4b58d7 1848 (?:
acaff495 1849 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
8f4b58d7 1850 )?
056653bb 1851 T)?
acaff495 1852 (?:
1853 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1854 )?
1855 (?:
1856 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1857 )?
1858 (?:
1859 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
15846398 1860 )?Z?$''', s)
acaff495 1861 if m:
1862 days, hours, mins, secs, ms = m.groups()
1863 else:
15846398 1864 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
acaff495 1865 if m:
1866 hours, mins = m.groups()
1867 else:
1868 return None
1869
1870 duration = 0
1871 if secs:
1872 duration += float(secs)
1873 if mins:
1874 duration += float(mins) * 60
1875 if hours:
1876 duration += float(hours) * 60 * 60
1877 if days:
1878 duration += float(days) * 24 * 60 * 60
1879 if ms:
1880 duration += float(ms)
1881 return duration
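# The three patterns above cover clock-style, ISO 8601-style and free-form inputs, e.g.:
#   parse_duration('1:02:03')  # -> 3723.0
#   parse_duration('PT1H30M')  # -> 5400.0
#   parse_duration('90 min')   # -> 5400.0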
91d7d0b3
JMF
1882
1883
e65e4c88 1884def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 1885 name, real_ext = os.path.splitext(filename)
e65e4c88
S
1886 return (
1887 '{0}.{1}{2}'.format(name, ext, real_ext)
1888 if not expected_real_ext or real_ext[1:] == expected_real_ext
1889 else '{0}.{1}'.format(filename, ext))
d70ad093
PH
1890
1891
b3ed15b7
S
1892def replace_extension(filename, ext, expected_real_ext=None):
1893 name, real_ext = os.path.splitext(filename)
1894 return '{0}.{1}'.format(
1895 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1896 ext)
1897
1898
d70ad093
PH
1899def check_executable(exe, args=[]):
1900 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1901 args can be a list of arguments for a short output (like -version) """
1902 try:
1903 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1904 except OSError:
1905 return False
1906 return exe
b7ab0590
PH
1907
1908
95807118 1909def get_exe_version(exe, args=['--version'],
cae97f65 1910 version_re=None, unrecognized='present'):
95807118
PH
1911 """ Returns the version of the specified executable,
1912 or False if the executable is not present """
1913 try:
b64d04c1
YCH
1914 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1915 # SIGTTOU if youtube-dl is run in the background.
1916 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
cae97f65 1917 out, _ = subprocess.Popen(
54116803 1918 [encodeArgument(exe)] + args,
00ca7552 1919 stdin=subprocess.PIPE,
95807118
PH
1920 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1921 except OSError:
1922 return False
cae97f65
PH
1923 if isinstance(out, bytes): # Python 2.x
1924 out = out.decode('ascii', 'ignore')
1925 return detect_exe_version(out, version_re, unrecognized)
1926
1927
1928def detect_exe_version(output, version_re=None, unrecognized='present'):
1929 assert isinstance(output, compat_str)
1930 if version_re is None:
1931 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1932 m = re.search(version_re, output)
95807118
PH
1933 if m:
1934 return m.group(1)
1935 else:
1936 return unrecognized
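# Illustrative output strings (made up):
#   detect_exe_version('ffmpeg version 3.4.1 Copyright (c) 2000-2017')  # -> '3.4.1'
#   detect_exe_version('usage: foo [options]')                          # -> 'present' (unrecognized output)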
1937
1938
b7ab0590 1939class PagedList(object):
dd26ced1
PH
1940 def __len__(self):
1941 # This is only useful for tests
1942 return len(self.getslice())
1943
9c44d242
PH
1944
1945class OnDemandPagedList(PagedList):
6be08ce6 1946 def __init__(self, pagefunc, pagesize, use_cache=True):
9c44d242
PH
1947 self._pagefunc = pagefunc
1948 self._pagesize = pagesize
b95dc034
YCH
1949 self._use_cache = use_cache
1950 if use_cache:
1951 self._cache = {}
9c44d242 1952
b7ab0590
PH
1953 def getslice(self, start=0, end=None):
1954 res = []
1955 for pagenum in itertools.count(start // self._pagesize):
1956 firstid = pagenum * self._pagesize
1957 nextfirstid = pagenum * self._pagesize + self._pagesize
1958 if start >= nextfirstid:
1959 continue
1960
b95dc034
YCH
1961 page_results = None
1962 if self._use_cache:
1963 page_results = self._cache.get(pagenum)
1964 if page_results is None:
1965 page_results = list(self._pagefunc(pagenum))
1966 if self._use_cache:
1967 self._cache[pagenum] = page_results
b7ab0590
PH
1968
1969 startv = (
1970 start % self._pagesize
1971 if firstid <= start < nextfirstid
1972 else 0)
1973
1974 endv = (
1975 ((end - 1) % self._pagesize) + 1
1976 if (end is not None and firstid <= end <= nextfirstid)
1977 else None)
1978
1979 if startv != 0 or endv is not None:
1980 page_results = page_results[startv:endv]
1981 res.extend(page_results)
1982
1983 # A little optimization - if the current page is not "full", i.e. does
1984 # not contain page_size videos, then we can assume that this page
1985 # is the last one - there are no more ids on further pages -
1986 # so there is no need to query again.
1987 if len(page_results) + startv < self._pagesize:
1988 break
1989
1990 # If we got the whole page, but the next page is not interesting,
1991 # break out early as well
1992 if end == nextfirstid:
1993 break
1994 return res
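# A minimal sketch of how this is typically consumed (page provider and sizes
# are hypothetical): only the pages overlapping the requested slice are fetched,
# and fetched pages are cached when use_cache is enabled.
#   fetch_page = lambda pagenum: list(range(pagenum * 10, (pagenum + 1) * 10))
#   OnDemandPagedList(fetch_page, 10).getslice(25, 32)  # -> [25, 26, 27, 28, 29, 30, 31]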
81c2f20b
PH
1995
1996
9c44d242
PH
1997class InAdvancePagedList(PagedList):
1998 def __init__(self, pagefunc, pagecount, pagesize):
1999 self._pagefunc = pagefunc
2000 self._pagecount = pagecount
2001 self._pagesize = pagesize
2002
2003 def getslice(self, start=0, end=None):
2004 res = []
2005 start_page = start // self._pagesize
2006 end_page = (
2007 self._pagecount if end is None else (end // self._pagesize + 1))
2008 skip_elems = start - start_page * self._pagesize
2009 only_more = None if end is None else end - start
2010 for pagenum in range(start_page, end_page):
2011 page = list(self._pagefunc(pagenum))
2012 if skip_elems:
2013 page = page[skip_elems:]
2014 skip_elems = None
2015 if only_more is not None:
2016 if len(page) < only_more:
2017 only_more -= len(page)
2018 else:
2019 page = page[:only_more]
2020 res.extend(page)
2021 break
2022 res.extend(page)
2023 return res
2024
2025
81c2f20b 2026def uppercase_escape(s):
676eb3f2 2027 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 2028 return re.sub(
a612753d 2029 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
2030 lambda m: unicode_escape(m.group(0))[0],
2031 s)
0fe2ff78
YCH
2032
2033
2034def lowercase_escape(s):
2035 unicode_escape = codecs.getdecoder('unicode_escape')
2036 return re.sub(
2037 r'\\u[0-9a-fA-F]{4}',
2038 lambda m: unicode_escape(m.group(0))[0],
2039 s)
b53466e1 2040
d05cfe06
S
2041
2042def escape_rfc3986(s):
2043 """Escape non-ASCII characters as suggested by RFC 3986"""
8f9312c3 2044 if sys.version_info < (3, 0) and isinstance(s, compat_str):
d05cfe06 2045 s = s.encode('utf-8')
ecc0c5ee 2046 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2047
2048
2049def escape_url(url):
2050 """Escape URL as suggested by RFC 3986"""
2051 url_parsed = compat_urllib_parse_urlparse(url)
2052 return url_parsed._replace(
efbed08d 2053 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
2054 path=escape_rfc3986(url_parsed.path),
2055 params=escape_rfc3986(url_parsed.params),
2056 query=escape_rfc3986(url_parsed.query),
2057 fragment=escape_rfc3986(url_parsed.fragment)
2058 ).geturl()
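# Illustrative example (URL is made up): the path and query are percent-encoded,
# the netloc is IDNA-encoded, while reserved characters stay untouched.
#   escape_url('http://example.com/page with spaces?q=münchen')
#   # -> 'http://example.com/page%20with%20spaces?q=m%C3%BCnchen'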
2059
62e609ab
PH
2060
2061def read_batch_urls(batch_fd):
2062 def fixup(url):
2063 if not isinstance(url, compat_str):
2064 url = url.decode('utf-8', 'replace')
28e614de 2065 BOM_UTF8 = '\xef\xbb\xbf'
62e609ab
PH
2066 if url.startswith(BOM_UTF8):
2067 url = url[len(BOM_UTF8):]
2068 url = url.strip()
2069 if url.startswith(('#', ';', ']')):
2070 return False
2071 return url
2072
2073 with contextlib.closing(batch_fd) as fd:
2074 return [url for url in map(fixup, fd) if url]
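# Illustrative batch-file contents (made up): comment lines and blanks are dropped.
#   read_batch_urls(io.StringIO('# list\nhttp://example.com/1\n\nhttp://example.com/2\n'))
#   # -> ['http://example.com/1', 'http://example.com/2']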
b74fa8cd
JMF
2075
2076
2077def urlencode_postdata(*args, **kargs):
15707c7e 2078 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
2079
2080
38f9ef31 2081def update_url_query(url, query):
cacd9966
YCH
2082 if not query:
2083 return url
38f9ef31 2084 parsed_url = compat_urlparse.urlparse(url)
2085 qs = compat_parse_qs(parsed_url.query)
2086 qs.update(query)
2087 return compat_urlparse.urlunparse(parsed_url._replace(
15707c7e 2088 query=compat_urllib_parse_urlencode(qs, True)))
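# For example (parameter order may differ):
#   update_url_query('http://example.com/path?a=1', {'b': '2'})
#   # -> 'http://example.com/path?a=1&b=2'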
16392824 2089
8e60dc75 2090
ed0291d1
S
2091def update_Request(req, url=None, data=None, headers={}, query={}):
2092 req_headers = req.headers.copy()
2093 req_headers.update(headers)
2094 req_data = data or req.data
2095 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
2096 req_get_method = req.get_method()
2097 if req_get_method == 'HEAD':
2098 req_type = HEADRequest
2099 elif req_get_method == 'PUT':
2100 req_type = PUTRequest
2101 else:
2102 req_type = compat_urllib_request.Request
ed0291d1
S
2103 new_req = req_type(
2104 req_url, data=req_data, headers=req_headers,
2105 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2106 if hasattr(req, 'timeout'):
2107 new_req.timeout = req.timeout
2108 return new_req
2109
2110
10c87c15 2111def _multipart_encode_impl(data, boundary):
0c265486
YCH
2112 content_type = 'multipart/form-data; boundary=%s' % boundary
2113
2114 out = b''
2115 for k, v in data.items():
2116 out += b'--' + boundary.encode('ascii') + b'\r\n'
2117 if isinstance(k, compat_str):
2118 k = k.encode('utf-8')
2119 if isinstance(v, compat_str):
2120 v = v.encode('utf-8')
2121 # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
2122 # suggests sending UTF-8 directly. Firefox sends UTF-8, too
b2ad479d 2123 content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
0c265486
YCH
2124 if boundary.encode('ascii') in content:
2125 raise ValueError('Boundary overlaps with data')
2126 out += content
2127
2128 out += b'--' + boundary.encode('ascii') + b'--\r\n'
2129
2130 return out, content_type
2131
2132
2133def multipart_encode(data, boundary=None):
2134 '''
2135 Encode a dict to RFC 7578-compliant form-data
2136
2137 data:
2138 A dict where keys and values can be either Unicode or bytes-like
2139 objects.
2140 boundary:
2141 If specified, it must be a Unicode object and is used as the boundary. Otherwise
2142 a random boundary is generated.
2143
2144 Reference: https://tools.ietf.org/html/rfc7578
2145 '''
2146 has_specified_boundary = boundary is not None
2147
2148 while True:
2149 if boundary is None:
2150 boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
2151
2152 try:
10c87c15 2153 out, content_type = _multipart_encode_impl(data, boundary)
0c265486
YCH
2154 break
2155 except ValueError:
2156 if has_specified_boundary:
2157 raise
2158 boundary = None
2159
2160 return out, content_type
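# Illustrative call with a fixed (made-up) boundary:
#   out, ctype = multipart_encode({'field': 'value'}, boundary='xxx')
#   ctype  # -> 'multipart/form-data; boundary=xxx'
#   out    # -> b'--xxx\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--xxx--\r\n'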
2161
2162
86296ad2 2163def dict_get(d, key_or_keys, default=None, skip_false_values=True):
cbecc9b9
S
2164 if isinstance(key_or_keys, (list, tuple)):
2165 for key in key_or_keys:
86296ad2
S
2166 if key not in d or d[key] is None or skip_false_values and not d[key]:
2167 continue
2168 return d[key]
cbecc9b9
S
2169 return default
2170 return d.get(key_or_keys, default)
2171
2172
329ca3be 2173def try_get(src, getter, expected_type=None):
a32a9a7e
S
2174 if not isinstance(getter, (list, tuple)):
2175 getter = [getter]
2176 for get in getter:
2177 try:
2178 v = get(src)
2179 except (AttributeError, KeyError, TypeError, IndexError):
2180 pass
2181 else:
2182 if expected_type is None or isinstance(v, expected_type):
2183 return v
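# Both helpers are meant for defensive metadata extraction, e.g. (data is made up):
#   meta = {'title': '', 'fulltitle': 'A title', 'tags': ['a', 'b']}
#   dict_get(meta, ('title', 'fulltitle'))              # -> 'A title' (empty string is skipped)
#   try_get(meta, lambda x: x['tags'][5], compat_str)   # -> None (IndexError is swallowed)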
329ca3be
S
2184
2185
8e60dc75
S
2186def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2187 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2188
16392824 2189
a1a530b0
PH
2190US_RATINGS = {
2191 'G': 0,
2192 'PG': 10,
2193 'PG-13': 13,
2194 'R': 16,
2195 'NC': 18,
2196}
fac55558
PH
2197
2198
a8795327
S
2199TV_PARENTAL_GUIDELINES = {
2200 'TV-Y': 0,
2201 'TV-Y7': 7,
2202 'TV-G': 0,
2203 'TV-PG': 0,
2204 'TV-14': 14,
2205 'TV-MA': 17,
2206}
2207
2208
146c80e2 2209def parse_age_limit(s):
a8795327
S
2210 if type(s) == int:
2211 return s if 0 <= s <= 21 else None
2212 if not isinstance(s, compat_basestring):
d838b1bd 2213 return None
146c80e2 2214 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
2215 if m:
2216 return int(m.group('age'))
2217 if s in US_RATINGS:
2218 return US_RATINGS[s]
2219 return TV_PARENTAL_GUIDELINES.get(s)
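# For example:
#   parse_age_limit('PG-13')  # -> 13
#   parse_age_limit('18+')    # -> 18
#   parse_age_limit('TV-MA')  # -> 17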
146c80e2
S
2220
2221
fac55558 2222def strip_jsonp(code):
609a61e3 2223 return re.sub(
5552c9eb
YCH
2224 r'''(?sx)^
2225 (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
2226 (?:\s*&&\s*(?P=func_name))?
2227 \s*\(\s*(?P<callback_data>.*)\);?
2228 \s*?(?://[^\n]*)*$''',
2229 r'\g<callback_data>', code)
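# For example:
#   strip_jsonp('callback({"status": "ok"});')  # -> '{"status": "ok"}'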
478c2c61
PH
2230
2231
e05f6939 2232def js_to_json(code):
4195096e
S
2233 COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
2234 SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
2235 INTEGER_TABLE = (
2236 (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
2237 (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
2238 )
2239
e05f6939 2240 def fix_kv(m):
e7b6d122
PH
2241 v = m.group(0)
2242 if v in ('true', 'false', 'null'):
2243 return v
b3ee552e 2244 elif v.startswith('/*') or v.startswith('//') or v == ',':
bd1e4844 2245 return ""
2246
2247 if v[0] in ("'", '"'):
2248 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 2249 '"': '\\"',
bd1e4844 2250 "\\'": "'",
2251 '\\\n': '',
2252 '\\x': '\\u00',
2253 }.get(m.group(0), m.group(0)), v[1:-1])
2254
89ac4a19
S
2255 for regex, base in INTEGER_TABLE:
2256 im = re.match(regex, v)
2257 if im:
e4659b45 2258 i = int(im.group(1), base)
89ac4a19
S
2259 return '"%d":' % i if v.endswith(':') else '%d' % i
2260
e7b6d122 2261 return '"%s"' % v
e05f6939 2262
bd1e4844 2263 return re.sub(r'''(?sx)
2264 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2265 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
4195096e 2266 {comment}|,(?={skip}[\]}}])|
bd1e4844 2267 [a-zA-Z_][.a-zA-Z_0-9]*|
4195096e
S
2268 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
2269 [0-9]+(?={skip}:)
2270 '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
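# Illustrative conversion of a (made-up) JS object literal: unquoted keys are
# quoted, single quotes become double quotes, hex/octal integers are decoded
# and trailing commas are dropped, so json.loads() can parse the result.
#   js_to_json("{a: 'b', c: 0x1F,}")  # -> '{"a": "b", "c": 31}'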
e05f6939
PH
2271
2272
478c2c61
PH
2273def qualities(quality_ids):
2274 """ Get a numeric quality value out of a list of possible values """
2275 def q(qid):
2276 try:
2277 return quality_ids.index(qid)
2278 except ValueError:
2279 return -1
2280 return q
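# For example:
#   q = qualities(['240p', '360p', '720p', '1080p'])
#   q('720p')     # -> 2
#   q('unknown')  # -> -1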
2281
acd69589
PH
2282
2283DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 2284
a020a0dc
PH
2285
2286def limit_length(s, length):
2287 """ Add ellipses to overly long strings """
2288 if s is None:
2289 return None
2290 ELLIPSES = '...'
2291 if len(s) > length:
2292 return s[:length - len(ELLIPSES)] + ELLIPSES
2293 return s
48844745
PH
2294
2295
2296def version_tuple(v):
5f9b8394 2297 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
2298
2299
2300def is_outdated_version(version, limit, assume_new=True):
2301 if not version:
2302 return not assume_new
2303 try:
2304 return version_tuple(version) < version_tuple(limit)
2305 except ValueError:
2306 return not assume_new
732ea2f0
PH
2307
2308
2309def ytdl_is_updateable():
2310 """ Returns if youtube-dl can be updated with -U """
2311 from zipimport import zipimporter
2312
2313 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
7d4111ed
PH
2314
2315
2316def args_to_str(args):
2317 # Get a short string representation for a subprocess command
702ccf2d 2318 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
2319
2320
9b9c5355 2321def error_to_compat_str(err):
fdae2358
S
2322 err_str = str(err)
2323 # On python 2 error byte string must be decoded with proper
2324 # encoding rather than ascii
2325 if sys.version_info[0] < 3:
2326 err_str = err_str.decode(preferredencoding())
2327 return err_str
2328
2329
c460bdd5 2330def mimetype2ext(mt):
eb9ee194
S
2331 if mt is None:
2332 return None
2333
765ac263
JMF
2334 ext = {
2335 'audio/mp4': 'm4a',
6c33d24b
YCH
2336 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2337 # it's the most popular one
2338 'audio/mpeg': 'mp3',
765ac263
JMF
2339 }.get(mt)
2340 if ext is not None:
2341 return ext
2342
c460bdd5 2343 _, _, res = mt.rpartition('/')
6562d34a 2344 res = res.split(';')[0].strip().lower()
c460bdd5
PH
2345
2346 return {
f6861ec9 2347 '3gpp': '3gp',
cafcf657 2348 'smptett+xml': 'tt',
cafcf657 2349 'ttaf+xml': 'dfxp',
a0d8d704 2350 'ttml+xml': 'ttml',
f6861ec9 2351 'x-flv': 'flv',
a0d8d704
YCH
2352 'x-mp4-fragmented': 'mp4',
2353 'x-ms-wmv': 'wmv',
b4173f15
RA
2354 'mpegurl': 'm3u8',
2355 'x-mpegurl': 'm3u8',
2356 'vnd.apple.mpegurl': 'm3u8',
2357 'dash+xml': 'mpd',
b4173f15 2358 'f4m+xml': 'f4m',
f164b971 2359 'hds+xml': 'f4m',
e910fe2f 2360 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 2361 'quicktime': 'mov',
98ce1a3f 2362 'mp2t': 'ts',
c460bdd5
PH
2363 }.get(res, res)
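# For example:
#   mimetype2ext('audio/mpeg')                # -> 'mp3'
#   mimetype2ext('application/x-mpegURL')     # -> 'm3u8'
#   mimetype2ext('video/mp4; codecs="avc1"')  # -> 'mp4'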
2364
2365
4f3c5e06 2366def parse_codecs(codecs_str):
2367 # http://tools.ietf.org/html/rfc6381
2368 if not codecs_str:
2369 return {}
2370 split_codecs = list(filter(None, map(
2371 lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
2372 vcodec, acodec = None, None
2373 for full_codec in split_codecs:
2374 codec = full_codec.split('.')[0]
ffe6979e 2375 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1'):
4f3c5e06 2376 if not vcodec:
2377 vcodec = full_codec
60f5c9fb 2378 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
4f3c5e06 2379 if not acodec:
2380 acodec = full_codec
2381 else:
60f5c9fb 2382 write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
4f3c5e06 2383 if not vcodec and not acodec:
2384 if len(split_codecs) == 2:
2385 return {
2386 'vcodec': vcodec,
2387 'acodec': acodec,
2388 }
2389 elif len(split_codecs) == 1:
2390 return {
2391 'vcodec': 'none',
2392 'acodec': vcodec,
2393 }
2394 else:
2395 return {
2396 'vcodec': vcodec or 'none',
2397 'acodec': acodec or 'none',
2398 }
2399 return {}
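# For example:
#   parse_codecs('avc1.64001f, mp4a.40.2')
#   # -> {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2'}
#   parse_codecs('mp4a.40.2')
#   # -> {'vcodec': 'none', 'acodec': 'mp4a.40.2'}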
2400
2401
2ccd1b10 2402def urlhandle_detect_ext(url_handle):
79298173 2403 getheader = url_handle.headers.get
2ccd1b10 2404
b55ee18f
PH
2405 cd = getheader('Content-Disposition')
2406 if cd:
2407 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2408 if m:
2409 e = determine_ext(m.group('filename'), default_ext=None)
2410 if e:
2411 return e
2412
c460bdd5 2413 return mimetype2ext(getheader('Content-Type'))
05900629
PH
2414
2415
1e399778
YCH
2416def encode_data_uri(data, mime_type):
2417 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
2418
2419
05900629 2420def age_restricted(content_limit, age_limit):
6ec6cb4e 2421 """ Returns True iff the content should be blocked """
05900629
PH
2422
2423 if age_limit is None: # No limit set
2424 return False
2425 if content_limit is None:
2426 return False # Content available for everyone
2427 return age_limit < content_limit
61ca9a80
PH
2428
2429
2430def is_html(first_bytes):
2431 """ Detect whether a file contains HTML by examining its first bytes. """
2432
2433 BOMS = [
2434 (b'\xef\xbb\xbf', 'utf-8'),
2435 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2436 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2437 (b'\xff\xfe', 'utf-16-le'),
2438 (b'\xfe\xff', 'utf-16-be'),
2439 ]
2440 for bom, enc in BOMS:
2441 if first_bytes.startswith(bom):
2442 s = first_bytes[len(bom):].decode(enc, 'replace')
2443 break
2444 else:
2445 s = first_bytes.decode('utf-8', 'replace')
2446
2447 return re.match(r'^\s*<', s)
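# Illustrative byte prefixes (made up): a match object is returned for HTML-looking input.
#   bool(is_html(b'\xef\xbb\xbf  <!DOCTYPE html>'))  # -> True (UTF-8 BOM is stripped first)
#   bool(is_html(b'ID3\x04\x00'))                    # -> False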
a055469f
PH
2448
2449
2450def determine_protocol(info_dict):
2451 protocol = info_dict.get('protocol')
2452 if protocol is not None:
2453 return protocol
2454
2455 url = info_dict['url']
2456 if url.startswith('rtmp'):
2457 return 'rtmp'
2458 elif url.startswith('mms'):
2459 return 'mms'
2460 elif url.startswith('rtsp'):
2461 return 'rtsp'
2462
2463 ext = determine_ext(url)
2464 if ext == 'm3u8':
2465 return 'm3u8'
2466 elif ext == 'f4m':
2467 return 'f4m'
2468
2469 return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2470
2471
2472def render_table(header_row, data):
2473 """ Render a list of rows, each as a list of values """
2474 table = [header_row] + data
2475 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
2476 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
2477 return '\n'.join(format_str % tuple(row) for row in table)
347de493
PH
2478
2479
2480def _match_one(filter_part, dct):
2481 COMPARISON_OPERATORS = {
2482 '<': operator.lt,
2483 '<=': operator.le,
2484 '>': operator.gt,
2485 '>=': operator.ge,
2486 '=': operator.eq,
2487 '!=': operator.ne,
2488 }
2489 operator_rex = re.compile(r'''(?x)\s*
2490 (?P<key>[a-z_]+)
2491 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2492 (?:
2493 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
db13c16e 2494 (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
347de493
PH
2495 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2496 )
2497 \s*$
2498 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2499 m = operator_rex.search(filter_part)
2500 if m:
2501 op = COMPARISON_OPERATORS[m.group('op')]
e5a088dc 2502 actual_value = dct.get(m.group('key'))
db13c16e
S
2503 if (m.group('quotedstrval') is not None or
2504 m.group('strval') is not None or
e5a088dc
S
2505 # If the original field is a string and matching comparisonvalue is
2506 # a number we should respect the origin of the original field
2507 # and process comparison value as a string (see
2508 # https://github.com/rg3/youtube-dl/issues/11082).
2509 actual_value is not None and m.group('intval') is not None and
2510 isinstance(actual_value, compat_str)):
347de493
PH
2511 if m.group('op') not in ('=', '!='):
2512 raise ValueError(
2513 'Operator %s does not support string values!' % m.group('op'))
db13c16e
S
2514 comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
2515 quote = m.group('quote')
2516 if quote is not None:
2517 comparison_value = comparison_value.replace(r'\%s' % quote, quote)
347de493
PH
2518 else:
2519 try:
2520 comparison_value = int(m.group('intval'))
2521 except ValueError:
2522 comparison_value = parse_filesize(m.group('intval'))
2523 if comparison_value is None:
2524 comparison_value = parse_filesize(m.group('intval') + 'B')
2525 if comparison_value is None:
2526 raise ValueError(
2527 'Invalid integer value %r in filter part %r' % (
2528 m.group('intval'), filter_part))
347de493
PH
2529 if actual_value is None:
2530 return m.group('none_inclusive')
2531 return op(actual_value, comparison_value)
2532
2533 UNARY_OPERATORS = {
2534 '': lambda v: v is not None,
2535 '!': lambda v: v is None,
2536 }
2537 operator_rex = re.compile(r'''(?x)\s*
2538 (?P<op>%s)\s*(?P<key>[a-z_]+)
2539 \s*$
2540 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2541 m = operator_rex.search(filter_part)
2542 if m:
2543 op = UNARY_OPERATORS[m.group('op')]
2544 actual_value = dct.get(m.group('key'))
2545 return op(actual_value)
2546
2547 raise ValueError('Invalid filter part %r' % filter_part)
2548
2549
2550def match_str(filter_str, dct):
2551 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2552
2553 return all(
2554 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
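# The mini-language above supports numeric comparisons, string (in)equality and
# unary presence checks, joined with '&'. Example with made-up info_dict values:
#   match_str('duration > 600 & like_count >= 100', {'duration': 700, 'like_count': 150})  # -> True
#   match_str('!is_live', {'is_live': None})                                               # -> True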
2555
2556
2557def match_filter_func(filter_str):
2558 def _match_func(info_dict):
2559 if match_str(filter_str, info_dict):
2560 return None
2561 else:
2562 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2563 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2564 return _match_func
91410c9b
PH
2565
2566
bf6427d2
YCH
2567def parse_dfxp_time_expr(time_expr):
2568 if not time_expr:
d631d5f9 2569 return
bf6427d2
YCH
2570
2571 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2572 if mobj:
2573 return float(mobj.group('time_offset'))
2574
db2fe38b 2575 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 2576 if mobj:
db2fe38b 2577 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
2578
2579
c1c924ab
YCH
2580def srt_subtitles_timecode(seconds):
2581 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
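# For example:
#   parse_dfxp_time_expr('00:01:02.500')  # -> 62.5
#   srt_subtitles_timecode(62.5)          # -> '00:01:02,500'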
bf6427d2
YCH
2582
2583
2584def dfxp2srt(dfxp_data):
3869028f
YCH
2585 '''
2586 @param dfxp_data A bytes-like object containing DFXP data
2587 @returns A unicode object containing converted SRT data
2588 '''
5b995f71 2589 LEGACY_NAMESPACES = (
3869028f
YCH
2590 (b'http://www.w3.org/ns/ttml', [
2591 b'http://www.w3.org/2004/11/ttaf1',
2592 b'http://www.w3.org/2006/04/ttaf1',
2593 b'http://www.w3.org/2006/10/ttaf1',
5b995f71 2594 ]),
3869028f
YCH
2595 (b'http://www.w3.org/ns/ttml#styling', [
2596 b'http://www.w3.org/ns/ttml#style',
5b995f71
RA
2597 ]),
2598 )
2599
2600 SUPPORTED_STYLING = [
2601 'color',
2602 'fontFamily',
2603 'fontSize',
2604 'fontStyle',
2605 'fontWeight',
2606 'textDecoration'
2607 ]
2608
4e335771
YCH
2609 _x = functools.partial(xpath_with_ns, ns_map={
2610 'ttml': 'http://www.w3.org/ns/ttml',
5b995f71 2611 'tts': 'http://www.w3.org/ns/ttml#styling',
4e335771 2612 })
bf6427d2 2613
5b995f71
RA
2614 styles = {}
2615 default_style = {}
2616
87de7069 2617 class TTMLPElementParser(object):
5b995f71
RA
2618 _out = ''
2619 _unclosed_elements = []
2620 _applied_styles = []
bf6427d2 2621
2b14cb56 2622 def start(self, tag, attrib):
5b995f71
RA
2623 if tag in (_x('ttml:br'), 'br'):
2624 self._out += '\n'
2625 else:
2626 unclosed_elements = []
2627 style = {}
2628 element_style_id = attrib.get('style')
2629 if default_style:
2630 style.update(default_style)
2631 if element_style_id:
2632 style.update(styles.get(element_style_id, {}))
2633 for prop in SUPPORTED_STYLING:
2634 prop_val = attrib.get(_x('tts:' + prop))
2635 if prop_val:
2636 style[prop] = prop_val
2637 if style:
2638 font = ''
2639 for k, v in sorted(style.items()):
2640 if self._applied_styles and self._applied_styles[-1].get(k) == v:
2641 continue
2642 if k == 'color':
2643 font += ' color="%s"' % v
2644 elif k == 'fontSize':
2645 font += ' size="%s"' % v
2646 elif k == 'fontFamily':
2647 font += ' face="%s"' % v
2648 elif k == 'fontWeight' and v == 'bold':
2649 self._out += '<b>'
2650 unclosed_elements.append('b')
2651 elif k == 'fontStyle' and v == 'italic':
2652 self._out += '<i>'
2653 unclosed_elements.append('i')
2654 elif k == 'textDecoration' and v == 'underline':
2655 self._out += '<u>'
2656 unclosed_elements.append('u')
2657 if font:
2658 self._out += '<font' + font + '>'
2659 unclosed_elements.append('font')
2660 applied_style = {}
2661 if self._applied_styles:
2662 applied_style.update(self._applied_styles[-1])
2663 applied_style.update(style)
2664 self._applied_styles.append(applied_style)
2665 self._unclosed_elements.append(unclosed_elements)
bf6427d2 2666
2b14cb56 2667 def end(self, tag):
5b995f71
RA
2668 if tag not in (_x('ttml:br'), 'br'):
2669 unclosed_elements = self._unclosed_elements.pop()
2670 for element in reversed(unclosed_elements):
2671 self._out += '</%s>' % element
2672 if unclosed_elements and self._applied_styles:
2673 self._applied_styles.pop()
bf6427d2 2674
2b14cb56 2675 def data(self, data):
5b995f71 2676 self._out += data
2b14cb56 2677
2678 def close(self):
5b995f71 2679 return self._out.strip()
2b14cb56 2680
2681 def parse_node(node):
2682 target = TTMLPElementParser()
2683 parser = xml.etree.ElementTree.XMLParser(target=target)
2684 parser.feed(xml.etree.ElementTree.tostring(node))
2685 return parser.close()
bf6427d2 2686
5b995f71
RA
2687 for k, v in LEGACY_NAMESPACES:
2688 for ns in v:
2689 dfxp_data = dfxp_data.replace(ns, k)
2690
3869028f 2691 dfxp = compat_etree_fromstring(dfxp_data)
bf6427d2 2692 out = []
5b995f71 2693 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
1b0427e6
YCH
2694
2695 if not paras:
2696 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2 2697
5b995f71
RA
2698 repeat = False
2699 while True:
2700 for style in dfxp.findall(_x('.//ttml:style')):
2701 style_id = style.get('id')
2702 parent_style_id = style.get('style')
2703 if parent_style_id:
2704 if parent_style_id not in styles:
2705 repeat = True
2706 continue
2707 styles[style_id] = styles[parent_style_id].copy()
2708 for prop in SUPPORTED_STYLING:
2709 prop_val = style.get(_x('tts:' + prop))
2710 if prop_val:
2711 styles.setdefault(style_id, {})[prop] = prop_val
2712 if repeat:
2713 repeat = False
2714 else:
2715 break
2716
2717 for p in ('body', 'div'):
2718 ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
2719 if ele is None:
2720 continue
2721 style = styles.get(ele.get('style'))
2722 if not style:
2723 continue
2724 default_style.update(style)
2725
bf6427d2 2726 for para, index in zip(paras, itertools.count(1)):
d631d5f9 2727 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 2728 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
2729 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2730 if begin_time is None:
2731 continue
7dff0363 2732 if not end_time:
d631d5f9
YCH
2733 if not dur:
2734 continue
2735 end_time = begin_time + dur
bf6427d2
YCH
2736 out.append('%d\n%s --> %s\n%s\n\n' % (
2737 index,
c1c924ab
YCH
2738 srt_subtitles_timecode(begin_time),
2739 srt_subtitles_timecode(end_time),
bf6427d2
YCH
2740 parse_node(para)))
2741
2742 return ''.join(out)
2743
2744
66e289ba
S
2745def cli_option(params, command_option, param):
2746 param = params.get(param)
98e698f1
RA
2747 if param:
2748 param = compat_str(param)
66e289ba
S
2749 return [command_option, param] if param is not None else []
2750
2751
2752def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2753 param = params.get(param)
5b232f46
S
2754 if param is None:
2755 return []
66e289ba
S
2756 assert isinstance(param, bool)
2757 if separator:
2758 return [command_option + separator + (true_value if param else false_value)]
2759 return [command_option, true_value if param else false_value]
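# These helpers translate (made-up) option dicts into external downloader command lines:
#   cli_option({'proxy': 'http://127.0.0.1:8080'}, '--proxy', 'proxy')
#   # -> ['--proxy', 'http://127.0.0.1:8080']
#   cli_bool_option({'nocheckcertificate': True}, '--no-check-certificate', 'nocheckcertificate', separator='=')
#   # -> ['--no-check-certificate=true']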
2760
2761
2762def cli_valueless_option(params, command_option, param, expected_value=True):
2763 param = params.get(param)
2764 return [command_option] if param == expected_value else []
2765
2766
2767def cli_configuration_args(params, param, default=[]):
2768 ex_args = params.get(param)
2769 if ex_args is None:
2770 return default
2771 assert isinstance(ex_args, list)
2772 return ex_args
2773
2774
39672624
YCH
2775class ISO639Utils(object):
2776 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2777 _lang_map = {
2778 'aa': 'aar',
2779 'ab': 'abk',
2780 'ae': 'ave',
2781 'af': 'afr',
2782 'ak': 'aka',
2783 'am': 'amh',
2784 'an': 'arg',
2785 'ar': 'ara',
2786 'as': 'asm',
2787 'av': 'ava',
2788 'ay': 'aym',
2789 'az': 'aze',
2790 'ba': 'bak',
2791 'be': 'bel',
2792 'bg': 'bul',
2793 'bh': 'bih',
2794 'bi': 'bis',
2795 'bm': 'bam',
2796 'bn': 'ben',
2797 'bo': 'bod',
2798 'br': 'bre',
2799 'bs': 'bos',
2800 'ca': 'cat',
2801 'ce': 'che',
2802 'ch': 'cha',
2803 'co': 'cos',
2804 'cr': 'cre',
2805 'cs': 'ces',
2806 'cu': 'chu',
2807 'cv': 'chv',
2808 'cy': 'cym',
2809 'da': 'dan',
2810 'de': 'deu',
2811 'dv': 'div',
2812 'dz': 'dzo',
2813 'ee': 'ewe',
2814 'el': 'ell',
2815 'en': 'eng',
2816 'eo': 'epo',
2817 'es': 'spa',
2818 'et': 'est',
2819 'eu': 'eus',
2820 'fa': 'fas',
2821 'ff': 'ful',
2822 'fi': 'fin',
2823 'fj': 'fij',
2824 'fo': 'fao',
2825 'fr': 'fra',
2826 'fy': 'fry',
2827 'ga': 'gle',
2828 'gd': 'gla',
2829 'gl': 'glg',
2830 'gn': 'grn',
2831 'gu': 'guj',
2832 'gv': 'glv',
2833 'ha': 'hau',
2834 'he': 'heb',
2835 'hi': 'hin',
2836 'ho': 'hmo',
2837 'hr': 'hrv',
2838 'ht': 'hat',
2839 'hu': 'hun',
2840 'hy': 'hye',
2841 'hz': 'her',
2842 'ia': 'ina',
2843 'id': 'ind',
2844 'ie': 'ile',
2845 'ig': 'ibo',
2846 'ii': 'iii',
2847 'ik': 'ipk',
2848 'io': 'ido',
2849 'is': 'isl',
2850 'it': 'ita',
2851 'iu': 'iku',
2852 'ja': 'jpn',
2853 'jv': 'jav',
2854 'ka': 'kat',
2855 'kg': 'kon',
2856 'ki': 'kik',
2857 'kj': 'kua',
2858 'kk': 'kaz',
2859 'kl': 'kal',
2860 'km': 'khm',
2861 'kn': 'kan',
2862 'ko': 'kor',
2863 'kr': 'kau',
2864 'ks': 'kas',
2865 'ku': 'kur',
2866 'kv': 'kom',
2867 'kw': 'cor',
2868 'ky': 'kir',
2869 'la': 'lat',
2870 'lb': 'ltz',
2871 'lg': 'lug',
2872 'li': 'lim',
2873 'ln': 'lin',
2874 'lo': 'lao',
2875 'lt': 'lit',
2876 'lu': 'lub',
2877 'lv': 'lav',
2878 'mg': 'mlg',
2879 'mh': 'mah',
2880 'mi': 'mri',
2881 'mk': 'mkd',
2882 'ml': 'mal',
2883 'mn': 'mon',
2884 'mr': 'mar',
2885 'ms': 'msa',
2886 'mt': 'mlt',
2887 'my': 'mya',
2888 'na': 'nau',
2889 'nb': 'nob',
2890 'nd': 'nde',
2891 'ne': 'nep',
2892 'ng': 'ndo',
2893 'nl': 'nld',
2894 'nn': 'nno',
2895 'no': 'nor',
2896 'nr': 'nbl',
2897 'nv': 'nav',
2898 'ny': 'nya',
2899 'oc': 'oci',
2900 'oj': 'oji',
2901 'om': 'orm',
2902 'or': 'ori',
2903 'os': 'oss',
2904 'pa': 'pan',
2905 'pi': 'pli',
2906 'pl': 'pol',
2907 'ps': 'pus',
2908 'pt': 'por',
2909 'qu': 'que',
2910 'rm': 'roh',
2911 'rn': 'run',
2912 'ro': 'ron',
2913 'ru': 'rus',
2914 'rw': 'kin',
2915 'sa': 'san',
2916 'sc': 'srd',
2917 'sd': 'snd',
2918 'se': 'sme',
2919 'sg': 'sag',
2920 'si': 'sin',
2921 'sk': 'slk',
2922 'sl': 'slv',
2923 'sm': 'smo',
2924 'sn': 'sna',
2925 'so': 'som',
2926 'sq': 'sqi',
2927 'sr': 'srp',
2928 'ss': 'ssw',
2929 'st': 'sot',
2930 'su': 'sun',
2931 'sv': 'swe',
2932 'sw': 'swa',
2933 'ta': 'tam',
2934 'te': 'tel',
2935 'tg': 'tgk',
2936 'th': 'tha',
2937 'ti': 'tir',
2938 'tk': 'tuk',
2939 'tl': 'tgl',
2940 'tn': 'tsn',
2941 'to': 'ton',
2942 'tr': 'tur',
2943 'ts': 'tso',
2944 'tt': 'tat',
2945 'tw': 'twi',
2946 'ty': 'tah',
2947 'ug': 'uig',
2948 'uk': 'ukr',
2949 'ur': 'urd',
2950 'uz': 'uzb',
2951 've': 'ven',
2952 'vi': 'vie',
2953 'vo': 'vol',
2954 'wa': 'wln',
2955 'wo': 'wol',
2956 'xh': 'xho',
2957 'yi': 'yid',
2958 'yo': 'yor',
2959 'za': 'zha',
2960 'zh': 'zho',
2961 'zu': 'zul',
2962 }
2963
2964 @classmethod
2965 def short2long(cls, code):
2966 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2967 return cls._lang_map.get(code[:2])
2968
2969 @classmethod
2970 def long2short(cls, code):
2971 """Convert language code from ISO 639-2/T to ISO 639-1"""
2972 for short_name, long_name in cls._lang_map.items():
2973 if long_name == code:
2974 return short_name
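# For example:
#   ISO639Utils.short2long('en')   # -> 'eng'
#   ISO639Utils.long2short('fra')  # -> 'fr'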
2975
2976
4eb10f66
YCH
2977class ISO3166Utils(object):
2978 # From http://data.okfn.org/data/core/country-list
2979 _country_map = {
2980 'AF': 'Afghanistan',
2981 'AX': 'Åland Islands',
2982 'AL': 'Albania',
2983 'DZ': 'Algeria',
2984 'AS': 'American Samoa',
2985 'AD': 'Andorra',
2986 'AO': 'Angola',
2987 'AI': 'Anguilla',
2988 'AQ': 'Antarctica',
2989 'AG': 'Antigua and Barbuda',
2990 'AR': 'Argentina',
2991 'AM': 'Armenia',
2992 'AW': 'Aruba',
2993 'AU': 'Australia',
2994 'AT': 'Austria',
2995 'AZ': 'Azerbaijan',
2996 'BS': 'Bahamas',
2997 'BH': 'Bahrain',
2998 'BD': 'Bangladesh',
2999 'BB': 'Barbados',
3000 'BY': 'Belarus',
3001 'BE': 'Belgium',
3002 'BZ': 'Belize',
3003 'BJ': 'Benin',
3004 'BM': 'Bermuda',
3005 'BT': 'Bhutan',
3006 'BO': 'Bolivia, Plurinational State of',
3007 'BQ': 'Bonaire, Sint Eustatius and Saba',
3008 'BA': 'Bosnia and Herzegovina',
3009 'BW': 'Botswana',
3010 'BV': 'Bouvet Island',
3011 'BR': 'Brazil',
3012 'IO': 'British Indian Ocean Territory',
3013 'BN': 'Brunei Darussalam',
3014 'BG': 'Bulgaria',
3015 'BF': 'Burkina Faso',
3016 'BI': 'Burundi',
3017 'KH': 'Cambodia',
3018 'CM': 'Cameroon',
3019 'CA': 'Canada',
3020 'CV': 'Cape Verde',
3021 'KY': 'Cayman Islands',
3022 'CF': 'Central African Republic',
3023 'TD': 'Chad',
3024 'CL': 'Chile',
3025 'CN': 'China',
3026 'CX': 'Christmas Island',
3027 'CC': 'Cocos (Keeling) Islands',
3028 'CO': 'Colombia',
3029 'KM': 'Comoros',
3030 'CG': 'Congo',
3031 'CD': 'Congo, the Democratic Republic of the',
3032 'CK': 'Cook Islands',
3033 'CR': 'Costa Rica',
3034 'CI': 'Côte d\'Ivoire',
3035 'HR': 'Croatia',
3036 'CU': 'Cuba',
3037 'CW': 'Curaçao',
3038 'CY': 'Cyprus',
3039 'CZ': 'Czech Republic',
3040 'DK': 'Denmark',
3041 'DJ': 'Djibouti',
3042 'DM': 'Dominica',
3043 'DO': 'Dominican Republic',
3044 'EC': 'Ecuador',
3045 'EG': 'Egypt',
3046 'SV': 'El Salvador',
3047 'GQ': 'Equatorial Guinea',
3048 'ER': 'Eritrea',
3049 'EE': 'Estonia',
3050 'ET': 'Ethiopia',
3051 'FK': 'Falkland Islands (Malvinas)',
3052 'FO': 'Faroe Islands',
3053 'FJ': 'Fiji',
3054 'FI': 'Finland',
3055 'FR': 'France',
3056 'GF': 'French Guiana',
3057 'PF': 'French Polynesia',
3058 'TF': 'French Southern Territories',
3059 'GA': 'Gabon',
3060 'GM': 'Gambia',
3061 'GE': 'Georgia',
3062 'DE': 'Germany',
3063 'GH': 'Ghana',
3064 'GI': 'Gibraltar',
3065 'GR': 'Greece',
3066 'GL': 'Greenland',
3067 'GD': 'Grenada',
3068 'GP': 'Guadeloupe',
3069 'GU': 'Guam',
3070 'GT': 'Guatemala',
3071 'GG': 'Guernsey',
3072 'GN': 'Guinea',
3073 'GW': 'Guinea-Bissau',
3074 'GY': 'Guyana',
3075 'HT': 'Haiti',
3076 'HM': 'Heard Island and McDonald Islands',
3077 'VA': 'Holy See (Vatican City State)',
3078 'HN': 'Honduras',
3079 'HK': 'Hong Kong',
3080 'HU': 'Hungary',
3081 'IS': 'Iceland',
3082 'IN': 'India',
3083 'ID': 'Indonesia',
3084 'IR': 'Iran, Islamic Republic of',
3085 'IQ': 'Iraq',
3086 'IE': 'Ireland',
3087 'IM': 'Isle of Man',
3088 'IL': 'Israel',
3089 'IT': 'Italy',
3090 'JM': 'Jamaica',
3091 'JP': 'Japan',
3092 'JE': 'Jersey',
3093 'JO': 'Jordan',
3094 'KZ': 'Kazakhstan',
3095 'KE': 'Kenya',
3096 'KI': 'Kiribati',
3097 'KP': 'Korea, Democratic People\'s Republic of',
3098 'KR': 'Korea, Republic of',
3099 'KW': 'Kuwait',
3100 'KG': 'Kyrgyzstan',
3101 'LA': 'Lao People\'s Democratic Republic',
3102 'LV': 'Latvia',
3103 'LB': 'Lebanon',
3104 'LS': 'Lesotho',
3105 'LR': 'Liberia',
3106 'LY': 'Libya',
3107 'LI': 'Liechtenstein',
3108 'LT': 'Lithuania',
3109 'LU': 'Luxembourg',
3110 'MO': 'Macao',
3111 'MK': 'Macedonia, the Former Yugoslav Republic of',
3112 'MG': 'Madagascar',
3113 'MW': 'Malawi',
3114 'MY': 'Malaysia',
3115 'MV': 'Maldives',
3116 'ML': 'Mali',
3117 'MT': 'Malta',
3118 'MH': 'Marshall Islands',
3119 'MQ': 'Martinique',
3120 'MR': 'Mauritania',
3121 'MU': 'Mauritius',
3122 'YT': 'Mayotte',
3123 'MX': 'Mexico',
3124 'FM': 'Micronesia, Federated States of',
3125 'MD': 'Moldova, Republic of',
3126 'MC': 'Monaco',
3127 'MN': 'Mongolia',
3128 'ME': 'Montenegro',
3129 'MS': 'Montserrat',
3130 'MA': 'Morocco',
3131 'MZ': 'Mozambique',
3132 'MM': 'Myanmar',
3133 'NA': 'Namibia',
3134 'NR': 'Nauru',
3135 'NP': 'Nepal',
3136 'NL': 'Netherlands',
3137 'NC': 'New Caledonia',
3138 'NZ': 'New Zealand',
3139 'NI': 'Nicaragua',
3140 'NE': 'Niger',
3141 'NG': 'Nigeria',
3142 'NU': 'Niue',
3143 'NF': 'Norfolk Island',
3144 'MP': 'Northern Mariana Islands',
3145 'NO': 'Norway',
3146 'OM': 'Oman',
3147 'PK': 'Pakistan',
3148 'PW': 'Palau',
3149 'PS': 'Palestine, State of',
3150 'PA': 'Panama',
3151 'PG': 'Papua New Guinea',
3152 'PY': 'Paraguay',
3153 'PE': 'Peru',
3154 'PH': 'Philippines',
3155 'PN': 'Pitcairn',
3156 'PL': 'Poland',
3157 'PT': 'Portugal',
3158 'PR': 'Puerto Rico',
3159 'QA': 'Qatar',
3160 'RE': 'Réunion',
3161 'RO': 'Romania',
3162 'RU': 'Russian Federation',
3163 'RW': 'Rwanda',
3164 'BL': 'Saint Barthélemy',
3165 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
3166 'KN': 'Saint Kitts and Nevis',
3167 'LC': 'Saint Lucia',
3168 'MF': 'Saint Martin (French part)',
3169 'PM': 'Saint Pierre and Miquelon',
3170 'VC': 'Saint Vincent and the Grenadines',
3171 'WS': 'Samoa',
3172 'SM': 'San Marino',
3173 'ST': 'Sao Tome and Principe',
3174 'SA': 'Saudi Arabia',
3175 'SN': 'Senegal',
3176 'RS': 'Serbia',
3177 'SC': 'Seychelles',
3178 'SL': 'Sierra Leone',
3179 'SG': 'Singapore',
3180 'SX': 'Sint Maarten (Dutch part)',
3181 'SK': 'Slovakia',
3182 'SI': 'Slovenia',
3183 'SB': 'Solomon Islands',
3184 'SO': 'Somalia',
3185 'ZA': 'South Africa',
3186 'GS': 'South Georgia and the South Sandwich Islands',
3187 'SS': 'South Sudan',
3188 'ES': 'Spain',
3189 'LK': 'Sri Lanka',
3190 'SD': 'Sudan',
3191 'SR': 'Suriname',
3192 'SJ': 'Svalbard and Jan Mayen',
3193 'SZ': 'Swaziland',
3194 'SE': 'Sweden',
3195 'CH': 'Switzerland',
3196 'SY': 'Syrian Arab Republic',
3197 'TW': 'Taiwan, Province of China',
3198 'TJ': 'Tajikistan',
3199 'TZ': 'Tanzania, United Republic of',
3200 'TH': 'Thailand',
3201 'TL': 'Timor-Leste',
3202 'TG': 'Togo',
3203 'TK': 'Tokelau',
3204 'TO': 'Tonga',
3205 'TT': 'Trinidad and Tobago',
3206 'TN': 'Tunisia',
3207 'TR': 'Turkey',
3208 'TM': 'Turkmenistan',
3209 'TC': 'Turks and Caicos Islands',
3210 'TV': 'Tuvalu',
3211 'UG': 'Uganda',
3212 'UA': 'Ukraine',
3213 'AE': 'United Arab Emirates',
3214 'GB': 'United Kingdom',
3215 'US': 'United States',
3216 'UM': 'United States Minor Outlying Islands',
3217 'UY': 'Uruguay',
3218 'UZ': 'Uzbekistan',
3219 'VU': 'Vanuatu',
3220 'VE': 'Venezuela, Bolivarian Republic of',
3221 'VN': 'Viet Nam',
3222 'VG': 'Virgin Islands, British',
3223 'VI': 'Virgin Islands, U.S.',
3224 'WF': 'Wallis and Futuna',
3225 'EH': 'Western Sahara',
3226 'YE': 'Yemen',
3227 'ZM': 'Zambia',
3228 'ZW': 'Zimbabwe',
3229 }
3230
3231 @classmethod
3232 def short2full(cls, code):
3233 """Convert an ISO 3166-2 country code to the corresponding full name"""
3234 return cls._country_map.get(code.upper())
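# For example:
#   ISO3166Utils.short2full('DE')  # -> 'Germany'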
3235
3236
773f291d
S
3237class GeoUtils(object):
3238 # Major IPv4 address blocks per country
3239 _country_ip_map = {
3240 'AD': '85.94.160.0/19',
3241 'AE': '94.200.0.0/13',
3242 'AF': '149.54.0.0/17',
3243 'AG': '209.59.64.0/18',
3244 'AI': '204.14.248.0/21',
3245 'AL': '46.99.0.0/16',
3246 'AM': '46.70.0.0/15',
3247 'AO': '105.168.0.0/13',
3248 'AP': '159.117.192.0/21',
3249 'AR': '181.0.0.0/12',
3250 'AS': '202.70.112.0/20',
3251 'AT': '84.112.0.0/13',
3252 'AU': '1.128.0.0/11',
3253 'AW': '181.41.0.0/18',
3254 'AZ': '5.191.0.0/16',
3255 'BA': '31.176.128.0/17',
3256 'BB': '65.48.128.0/17',
3257 'BD': '114.130.0.0/16',
3258 'BE': '57.0.0.0/8',
3259 'BF': '129.45.128.0/17',
3260 'BG': '95.42.0.0/15',
3261 'BH': '37.131.0.0/17',
3262 'BI': '154.117.192.0/18',
3263 'BJ': '137.255.0.0/16',
3264 'BL': '192.131.134.0/24',
3265 'BM': '196.12.64.0/18',
3266 'BN': '156.31.0.0/16',
3267 'BO': '161.56.0.0/16',
3268 'BQ': '161.0.80.0/20',
3269 'BR': '152.240.0.0/12',
3270 'BS': '24.51.64.0/18',
3271 'BT': '119.2.96.0/19',
3272 'BW': '168.167.0.0/16',
3273 'BY': '178.120.0.0/13',
3274 'BZ': '179.42.192.0/18',
3275 'CA': '99.224.0.0/11',
3276 'CD': '41.243.0.0/16',
3277 'CF': '196.32.200.0/21',
3278 'CG': '197.214.128.0/17',
3279 'CH': '85.0.0.0/13',
3280 'CI': '154.232.0.0/14',
3281 'CK': '202.65.32.0/19',
3282 'CL': '152.172.0.0/14',
3283 'CM': '165.210.0.0/15',
3284 'CN': '36.128.0.0/10',
3285 'CO': '181.240.0.0/12',
3286 'CR': '201.192.0.0/12',
3287 'CU': '152.206.0.0/15',
3288 'CV': '165.90.96.0/19',
3289 'CW': '190.88.128.0/17',
3290 'CY': '46.198.0.0/15',
3291 'CZ': '88.100.0.0/14',
3292 'DE': '53.0.0.0/8',
3293 'DJ': '197.241.0.0/17',
3294 'DK': '87.48.0.0/12',
3295 'DM': '192.243.48.0/20',
3296 'DO': '152.166.0.0/15',
3297 'DZ': '41.96.0.0/12',
3298 'EC': '186.68.0.0/15',
3299 'EE': '90.190.0.0/15',
3300 'EG': '156.160.0.0/11',
3301 'ER': '196.200.96.0/20',
3302 'ES': '88.0.0.0/11',
3303 'ET': '196.188.0.0/14',
3304 'EU': '2.16.0.0/13',
3305 'FI': '91.152.0.0/13',
3306 'FJ': '144.120.0.0/16',
3307 'FM': '119.252.112.0/20',
3308 'FO': '88.85.32.0/19',
3309 'FR': '90.0.0.0/9',
3310 'GA': '41.158.0.0/15',
3311 'GB': '25.0.0.0/8',
3312 'GD': '74.122.88.0/21',
3313 'GE': '31.146.0.0/16',
3314 'GF': '161.22.64.0/18',
3315 'GG': '62.68.160.0/19',
3316 'GH': '45.208.0.0/14',
3317 'GI': '85.115.128.0/19',
3318 'GL': '88.83.0.0/19',
3319 'GM': '160.182.0.0/15',
3320 'GN': '197.149.192.0/18',
3321 'GP': '104.250.0.0/19',
3322 'GQ': '105.235.224.0/20',
3323 'GR': '94.64.0.0/13',
3324 'GT': '168.234.0.0/16',
3325 'GU': '168.123.0.0/16',
3326 'GW': '197.214.80.0/20',
3327 'GY': '181.41.64.0/18',
3328 'HK': '113.252.0.0/14',
3329 'HN': '181.210.0.0/16',
3330 'HR': '93.136.0.0/13',
3331 'HT': '148.102.128.0/17',
3332 'HU': '84.0.0.0/14',
3333 'ID': '39.192.0.0/10',
3334 'IE': '87.32.0.0/12',
3335 'IL': '79.176.0.0/13',
3336 'IM': '5.62.80.0/20',
3337 'IN': '117.192.0.0/10',
3338 'IO': '203.83.48.0/21',
3339 'IQ': '37.236.0.0/14',
3340 'IR': '2.176.0.0/12',
3341 'IS': '82.221.0.0/16',
3342 'IT': '79.0.0.0/10',
3343 'JE': '87.244.64.0/18',
3344 'JM': '72.27.0.0/17',
3345 'JO': '176.29.0.0/16',
3346 'JP': '126.0.0.0/8',
3347 'KE': '105.48.0.0/12',
3348 'KG': '158.181.128.0/17',
3349 'KH': '36.37.128.0/17',
3350 'KI': '103.25.140.0/22',
3351 'KM': '197.255.224.0/20',
3352 'KN': '198.32.32.0/19',
3353 'KP': '175.45.176.0/22',
3354 'KR': '175.192.0.0/10',
3355 'KW': '37.36.0.0/14',
3356 'KY': '64.96.0.0/15',
3357 'KZ': '2.72.0.0/13',
3358 'LA': '115.84.64.0/18',
3359 'LB': '178.135.0.0/16',
3360 'LC': '192.147.231.0/24',
3361 'LI': '82.117.0.0/19',
3362 'LK': '112.134.0.0/15',
3363 'LR': '41.86.0.0/19',
3364 'LS': '129.232.0.0/17',
3365 'LT': '78.56.0.0/13',
3366 'LU': '188.42.0.0/16',
3367 'LV': '46.109.0.0/16',
3368 'LY': '41.252.0.0/14',
3369 'MA': '105.128.0.0/11',
3370 'MC': '88.209.64.0/18',
3371 'MD': '37.246.0.0/16',
3372 'ME': '178.175.0.0/17',
3373 'MF': '74.112.232.0/21',
3374 'MG': '154.126.0.0/17',
3375 'MH': '117.103.88.0/21',
3376 'MK': '77.28.0.0/15',
3377 'ML': '154.118.128.0/18',
3378 'MM': '37.111.0.0/17',
3379 'MN': '49.0.128.0/17',
3380 'MO': '60.246.0.0/16',
3381 'MP': '202.88.64.0/20',
3382 'MQ': '109.203.224.0/19',
3383 'MR': '41.188.64.0/18',
3384 'MS': '208.90.112.0/22',
3385 'MT': '46.11.0.0/16',
3386 'MU': '105.16.0.0/12',
3387 'MV': '27.114.128.0/18',
3388 'MW': '105.234.0.0/16',
3389 'MX': '187.192.0.0/11',
3390 'MY': '175.136.0.0/13',
3391 'MZ': '197.218.0.0/15',
3392 'NA': '41.182.0.0/16',
3393 'NC': '101.101.0.0/18',
3394 'NE': '197.214.0.0/18',
3395 'NF': '203.17.240.0/22',
3396 'NG': '105.112.0.0/12',
3397 'NI': '186.76.0.0/15',
3398 'NL': '145.96.0.0/11',
3399 'NO': '84.208.0.0/13',
3400 'NP': '36.252.0.0/15',
3401 'NR': '203.98.224.0/19',
3402 'NU': '49.156.48.0/22',
3403 'NZ': '49.224.0.0/14',
3404 'OM': '5.36.0.0/15',
3405 'PA': '186.72.0.0/15',
3406 'PE': '186.160.0.0/14',
3407 'PF': '123.50.64.0/18',
3408 'PG': '124.240.192.0/19',
3409 'PH': '49.144.0.0/13',
3410 'PK': '39.32.0.0/11',
3411 'PL': '83.0.0.0/11',
3412 'PM': '70.36.0.0/20',
3413 'PR': '66.50.0.0/16',
3414 'PS': '188.161.0.0/16',
3415 'PT': '85.240.0.0/13',
3416 'PW': '202.124.224.0/20',
3417 'PY': '181.120.0.0/14',
3418 'QA': '37.210.0.0/15',
3419 'RE': '139.26.0.0/16',
3420 'RO': '79.112.0.0/13',
3421 'RS': '178.220.0.0/14',
3422 'RU': '5.136.0.0/13',
3423 'RW': '105.178.0.0/15',
3424 'SA': '188.48.0.0/13',
3425 'SB': '202.1.160.0/19',
3426 'SC': '154.192.0.0/11',
3427 'SD': '154.96.0.0/13',
3428 'SE': '78.64.0.0/12',
3429 'SG': '152.56.0.0/14',
3430 'SI': '188.196.0.0/14',
3431 'SK': '78.98.0.0/15',
3432 'SL': '197.215.0.0/17',
3433 'SM': '89.186.32.0/19',
3434 'SN': '41.82.0.0/15',
3435 'SO': '197.220.64.0/19',
3436 'SR': '186.179.128.0/17',
3437 'SS': '105.235.208.0/21',
3438 'ST': '197.159.160.0/19',
3439 'SV': '168.243.0.0/16',
3440 'SX': '190.102.0.0/20',
3441 'SY': '5.0.0.0/16',
3442 'SZ': '41.84.224.0/19',
3443 'TC': '65.255.48.0/20',
3444 'TD': '154.68.128.0/19',
3445 'TG': '196.168.0.0/14',
3446 'TH': '171.96.0.0/13',
3447 'TJ': '85.9.128.0/18',
3448 'TK': '27.96.24.0/21',
3449 'TL': '180.189.160.0/20',
3450 'TM': '95.85.96.0/19',
3451 'TN': '197.0.0.0/11',
3452 'TO': '175.176.144.0/21',
3453 'TR': '78.160.0.0/11',
3454 'TT': '186.44.0.0/15',
3455 'TV': '202.2.96.0/19',
3456 'TW': '120.96.0.0/11',
3457 'TZ': '156.156.0.0/14',
3458 'UA': '93.72.0.0/13',
3459 'UG': '154.224.0.0/13',
3460 'US': '3.0.0.0/8',
3461 'UY': '167.56.0.0/13',
3462 'UZ': '82.215.64.0/18',
3463 'VA': '212.77.0.0/19',
3464 'VC': '24.92.144.0/20',
3465 'VE': '186.88.0.0/13',
3466 'VG': '172.103.64.0/18',
3467 'VI': '146.226.0.0/16',
3468 'VN': '14.160.0.0/11',
3469 'VU': '202.80.32.0/20',
3470 'WF': '117.20.32.0/21',
3471 'WS': '202.4.32.0/19',
3472 'YE': '134.35.0.0/16',
3473 'YT': '41.242.116.0/22',
3474 'ZA': '41.0.0.0/11',
3475 'ZM': '165.56.0.0/13',
3476 'ZW': '41.85.192.0/19',
3477 }
3478
3479 @classmethod
3480 def random_ipv4(cls, code):
3481 block = cls._country_ip_map.get(code.upper())
3482 if not block:
3483 return None
3484 addr, preflen = block.split('/')
3485 addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
3486 addr_max = addr_min | (0xffffffff >> int(preflen))
18a0defa 3487 return compat_str(socket.inet_ntoa(
4248dad9 3488 compat_struct_pack('!L', random.randint(addr_min, addr_max))))
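# For example (the result is random within the country's block from the table above):
#   GeoUtils.random_ipv4('US')  # -> e.g. '3.124.77.9' (some address inside 3.0.0.0/8)
#   GeoUtils.random_ipv4('XX')  # -> None (unknown country code)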
773f291d
S
3489
3490
91410c9b 3491class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2461f79d
PH
3492 def __init__(self, proxies=None):
3493 # Set default handlers
3494 for type in ('http', 'https'):
3495 setattr(self, '%s_open' % type,
3496 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
3497 meth(r, proxy, type))
3498 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
3499
91410c9b 3500 def proxy_open(self, req, proxy, type):
2461f79d 3501 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
3502 if req_proxy is not None:
3503 proxy = req_proxy
2461f79d
PH
3504 del req.headers['Ytdl-request-proxy']
3505
3506 if proxy == '__noproxy__':
3507 return None # No Proxy
51fb4995 3508 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188
YCH
3509 req.add_header('Ytdl-socks-proxy', proxy)
3510 # youtube-dl's http/https handlers take care of wrapping the socket with SOCKS
3511 return None
91410c9b
PH
3512 return compat_urllib_request.ProxyHandler.proxy_open(
3513 self, req, proxy, type)
5bc880b9
YCH
3514
3515
0a5445dd
YCH
3516# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3517 # released into the public domain
3518# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3519
3520def long_to_bytes(n, blocksize=0):
3521 """long_to_bytes(n:long, blocksize:int) : string
3522 Convert a long integer to a byte string.
3523
3524 If optional blocksize is given and greater than zero, pad the front of the
3525 byte string with binary zeros so that the length is a multiple of
3526 blocksize.
3527 """
3528 # after much testing, this algorithm was deemed to be the fastest
3529 s = b''
3530 n = int(n)
3531 while n > 0:
3532 s = compat_struct_pack('>I', n & 0xffffffff) + s
3533 n = n >> 32
3534 # strip off leading zeros
3535 for i in range(len(s)):
3536 if s[i] != b'\000'[0]:
3537 break
3538 else:
3539 # only happens when n == 0
3540 s = b'\000'
3541 i = 0
3542 s = s[i:]
3543 # add back some pad bytes. this could be done more efficiently w.r.t. the
3544 # de-padding being done above, but sigh...
3545 if blocksize > 0 and len(s) % blocksize:
3546 s = (blocksize - len(s) % blocksize) * b'\000' + s
3547 return s
3548
3549
3550def bytes_to_long(s):
3551 """bytes_to_long(string) : long
3552 Convert a byte string to a long integer.
3553
3554 This is (essentially) the inverse of long_to_bytes().
3555 """
3556 acc = 0
3557 length = len(s)
3558 if length % 4:
3559 extra = (4 - length % 4)
3560 s = b'\000' * extra + s
3561 length = length + extra
3562 for i in range(0, length, 4):
3563 acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0]
3564 return acc
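# Worked example (illustrative, not part of the original module; Python 3 byte
# reprs shown): the two helpers above are inverses for big-endian byte strings:
#
#   >>> long_to_bytes(65537)
#   b'\x01\x00\x01'
#   >>> bytes_to_long(b'\x01\x00\x01')
#   65537
#   >>> long_to_bytes(65537, blocksize=4)   # front-padded to a multiple of 4
#   b'\x00\x01\x00\x01'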
3565
3566
5bc880b9
YCH
3567def ohdave_rsa_encrypt(data, exponent, modulus):
3568 '''
3569 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
3570
3571 Input:
3572 data: data to encrypt, bytes-like object
3573 exponent, modulus: parameters e and N of the RSA algorithm, both integers
3574 Output: hex string of encrypted data
3575
3576 Limitation: supports one block encryption only
3577 '''
3578
3579 payload = int(binascii.hexlify(data[::-1]), 16)
3580 encrypted = pow(payload, exponent, modulus)
3581 return '%x' % encrypted
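# Toy example (illustrative only, using artificially small numbers rather than
# a real RSA key): the payload is the little-endian integer value of `data`,
# so b'\x02\x01' becomes 0x0102 == 258 and the result is '%x' % pow(258, e, N):
#
#   >>> ohdave_rsa_encrypt(b'\x02\x01', 3, 1000)
#   '200'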
81bdc8fd
YCH
3582
3583
f48409c7
YCH
3584def pkcs1pad(data, length):
3585 """
3586 Pad input data with the PKCS#1 v1.5 scheme
3587
3588 @param {int[]} data input data
3589 @param {int} length target length
3590 @returns {int[]} padded data
3591 """
3592 if len(data) > length - 11:
3593 raise ValueError('Input data too long for PKCS#1 padding')
3594
3595 pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]  # PKCS#1 v1.5 requires non-zero padding octets
3596 return [0, 2] + pseudo_random + [0] + data
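# Illustrative layout (not part of the original module): pkcs1pad([1, 2, 3], 16)
# returns a list of 16 ints of the form
#
#   [0, 2, r1, r2, ..., r10, 0, 1, 2, 3]
#
# i.e. the 00 || 02 || PS || 00 || D encryption-block structure of PKCS#1 v1.5,
# where the r_i are the random padding octets generated above.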
3597
3598
5eb6bdce 3599def encode_base_n(num, n, table=None):
59f898b7 3600 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
59f898b7
YCH
3601 if not table:
3602 table = FULL_TABLE[:n]
3603
5eb6bdce
YCH
3604 if n > len(table):
3605 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
3606
3607 if num == 0:
3608 return table[0]
3609
81bdc8fd
YCH
3610 ret = ''
3611 while num:
3612 ret = table[num % n] + ret
3613 num = num // n
3614 return ret
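# Usage examples (illustrative, not part of the original module):
#
#   >>> encode_base_n(255, 16)
#   'ff'
#   >>> encode_base_n(61, 62)
#   'Z'
#   >>> encode_base_n(0, 36)
#   '0'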
f52354a8
YCH
3615
3616
3617def decode_packed_codes(code):
06b3fe29 3618 mobj = re.search(PACKED_CODES_RE, code)
f52354a8
YCH
3619 obfuscated_code, base, count, symbols = mobj.groups()
3620 base = int(base)
3621 count = int(count)
3622 symbols = symbols.split('|')
3623 symbol_table = {}
3624
3625 while count:
3626 count -= 1
5eb6bdce 3627 base_n_count = encode_base_n(count, base)
f52354a8
YCH
3628 symbol_table[base_n_count] = symbols[count] or base_n_count
3629
3630 return re.sub(
3631 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
3632 obfuscated_code)
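# Illustrative note (not part of the original module): this unpacks the common
# JavaScript "P.A.C.K.E.R." obfuscation, i.e. code of the shape
# eval(function(p,a,c,k,e,d){...}('<obfuscated>',<base>,<count>,'<sym0>|<sym1>|...'.split('|'),...)).
# Each base-N token in the obfuscated source is replaced by the symbol with the
# same index; e.g. with base 62 the token '1A' maps to symbols[98], since
# 1 * 62 + 36 == 98 and 'A' is digit 36 in the table used by encode_base_n.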
e154c651 3633
3634
3635def parse_m3u8_attributes(attrib):
3636 info = {}
3637 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
3638 if val.startswith('"'):
3639 val = val[1:-1]
3640 info[key] = val
3641 return info
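# Usage example (illustrative, not part of the original module): quoted values
# may contain commas, unquoted values may not, e.g.
#
#   parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.64001f"')
#   == {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.64001f'}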
1143535d
YCH
3642
3643
3644def urshift(val, n):
3645 return val >> n if val >= 0 else (val + 0x100000000) >> n
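# Illustrative note (not part of the original module): this emulates
# JavaScript's unsigned right shift (>>>) for 32-bit values, e.g.
#
#   >>> urshift(-1, 28)
#   15
#   >>> urshift(16, 2)
#   4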
d3f8e038
YCH
3646
3647
3648# Based on png2str() written by @gdkchan and improved by @yokrysty
3649# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
3650def decode_png(png_data):
3651 # Reference: https://www.w3.org/TR/PNG/
3652 header = png_data[8:]
3653
3654 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
3655 raise IOError('Not a valid PNG file.')
3656
3657 int_map = {1: '>B', 2: '>H', 4: '>I'}
3658 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
3659
3660 chunks = []
3661
3662 while header:
3663 length = unpack_integer(header[:4])
3664 header = header[4:]
3665
3666 chunk_type = header[:4]
3667 header = header[4:]
3668
3669 chunk_data = header[:length]
3670 header = header[length:]
3671
3672 header = header[4:] # Skip CRC
3673
3674 chunks.append({
3675 'type': chunk_type,
3676 'length': length,
3677 'data': chunk_data
3678 })
3679
3680 ihdr = chunks[0]['data']
3681
3682 width = unpack_integer(ihdr[:4])
3683 height = unpack_integer(ihdr[4:8])
3684
3685 idat = b''
3686
3687 for chunk in chunks:
3688 if chunk['type'] == b'IDAT':
3689 idat += chunk['data']
3690
3691 if not idat:
3692 raise IOError('Unable to read PNG data.')
3693
3694 decompressed_data = bytearray(zlib.decompress(idat))
3695
3696 stride = width * 3
3697 pixels = []
3698
3699 def _get_pixel(idx):
3700 x = idx % stride
3701 y = idx // stride
3702 return pixels[y][x]
3703
3704 for y in range(height):
3705 basePos = y * (1 + stride)
3706 filter_type = decompressed_data[basePos]
3707
3708 current_row = []
3709
3710 pixels.append(current_row)
3711
3712 for x in range(stride):
3713 color = decompressed_data[1 + basePos + x]
3714 basex = y * stride + x
3715 left = 0
3716 up = 0
3717
3718 if x > 2:
3719 left = _get_pixel(basex - 3)
3720 if y > 0:
3721 up = _get_pixel(basex - stride)
3722
3723 if filter_type == 1: # Sub
3724 color = (color + left) & 0xff
3725 elif filter_type == 2: # Up
3726 color = (color + up) & 0xff
3727 elif filter_type == 3: # Average
3728 color = (color + ((left + up) >> 1)) & 0xff
3729 elif filter_type == 4: # Paeth
3730 a = left
3731 b = up
3732 c = 0
3733
3734 if x > 2 and y > 0:
3735 c = _get_pixel(basex - stride - 3)
3736
3737 p = a + b - c
3738
3739 pa = abs(p - a)
3740 pb = abs(p - b)
3741 pc = abs(p - c)
3742
3743 if pa <= pb and pa <= pc:
3744 color = (color + a) & 0xff
3745 elif pb <= pc:
3746 color = (color + b) & 0xff
3747 else:
3748 color = (color + c) & 0xff
3749
3750 current_row.append(color)
3751
3752 return width, height, pixels
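# Illustrative note (not part of the original module): as implemented above the
# decoder assumes 8-bit RGB data (3 bytes per pixel, no alpha, no interlacing).
# The return value is (width, height, pixels), where pixels holds `height`
# rows, each a flat list of width * 3 de-filtered byte values. Hedged usage
# sketch (the file name is hypothetical):
#
#   width, height, pixels = decode_png(open('frame.png', 'rb').read())
#   r, g, b = pixels[0][0:3]   # top-left pixel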
efa97bdc
YCH
3753
3754
3755def write_xattr(path, key, value):
3756 # This mess below finds the best xattr tool for the job
3757 try:
3758 # try the pyxattr module...
3759 import xattr
3760
53a7e3d2
YCH
3761 if hasattr(xattr, 'set'): # pyxattr
3762 # Unicode arguments are not supported in python-pyxattr until
3763 # version 0.5.0
3764 # See https://github.com/rg3/youtube-dl/issues/5498
3765 pyxattr_required_version = '0.5.0'
3766 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
3767 # TODO: fallback to CLI tools
3768 raise XAttrUnavailableError(
3769 'python-pyxattr is detected but is too old. '
3770 'youtube-dl requires %s or above while your version is %s. '
3771 'Falling back to other xattr implementations' % (
3772 pyxattr_required_version, xattr.__version__))
3773
3774 setxattr = xattr.set
3775 else: # xattr
3776 setxattr = xattr.setxattr
efa97bdc
YCH
3777
3778 try:
53a7e3d2 3779 setxattr(path, key, value)
efa97bdc
YCH
3780 except EnvironmentError as e:
3781 raise XAttrMetadataError(e.errno, e.strerror)
3782
3783 except ImportError:
3784 if compat_os_name == 'nt':
3785 # Write xattrs to NTFS Alternate Data Streams:
3786 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
3787 assert ':' not in key
3788 assert os.path.exists(path)
3789
3790 ads_fn = path + ':' + key
3791 try:
3792 with open(ads_fn, 'wb') as f:
3793 f.write(value)
3794 except EnvironmentError as e:
3795 raise XAttrMetadataError(e.errno, e.strerror)
3796 else:
3797 user_has_setfattr = check_executable('setfattr', ['--version'])
3798 user_has_xattr = check_executable('xattr', ['-h'])
3799
3800 if user_has_setfattr or user_has_xattr:
3801
3802 value = value.decode('utf-8')
3803 if user_has_setfattr:
3804 executable = 'setfattr'
3805 opts = ['-n', key, '-v', value]
3806 elif user_has_xattr:
3807 executable = 'xattr'
3808 opts = ['-w', key, value]
3809
3810 cmd = ([encodeFilename(executable, True)] +
3811 [encodeArgument(o) for o in opts] +
3812 [encodeFilename(path, True)])
3813
3814 try:
3815 p = subprocess.Popen(
3816 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
3817 except EnvironmentError as e:
3818 raise XAttrMetadataError(e.errno, e.strerror)
3819 stdout, stderr = p.communicate()
3820 stderr = stderr.decode('utf-8', 'replace')
3821 if p.returncode != 0:
3822 raise XAttrMetadataError(p.returncode, stderr)
3823
3824 else:
3825 # On Unix, but pyxattr, setfattr and xattr are all unavailable.
3826 if sys.platform.startswith('linux'):
3827 raise XAttrUnavailableError(
3828 "Couldn't find a tool to set the xattrs. "
3829 "Install either the python 'pyxattr' or 'xattr' "
3830 "modules, or the GNU 'attr' package "
3831 "(which contains the 'setfattr' tool).")
3832 else:
3833 raise XAttrUnavailableError(
3834 "Couldn't find a tool to set the xattrs. "
3835 "Install either the python 'xattr' module, "
3836 "or the 'xattr' binary.")
0c265486
YCH
3837
3838
3839def random_birthday(year_field, month_field, day_field):
3840 return {
3841 year_field: str(random.randint(1950, 1995)),
3842 month_field: str(random.randint(1, 12)),
3843 day_field: str(random.randint(1, 31)),
3844 }
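# Usage example (illustrative, not part of the original module): useful for
# filling age-gate style forms with a plausible date of birth, e.g.
#
#   >>> random_birthday('year', 'month', 'day')   # output varies per call
#   {'year': '1987', 'month': '6', 'day': '21'}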