]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[fxnetworks] Extract series metadata
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd 1#!/usr/bin/env python
dcdb292f 2# coding: utf-8
d77c3dfd 3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
1e399778 6import base64
5bc880b9 7import binascii
912b38b4 8import calendar
676eb3f2 9import codecs
62e609ab 10import contextlib
e3946f98 11import ctypes
c496ca96
PH
12import datetime
13import email.utils
0c265486 14import email.header
f45c185f 15import errno
be4a824d 16import functools
d77c3dfd 17import gzip
03f9daab 18import io
79a2e94e 19import itertools
f4bfd65f 20import json
d77c3dfd 21import locale
02dbf93f 22import math
347de493 23import operator
d77c3dfd 24import os
c496ca96 25import platform
773f291d 26import random
d77c3dfd 27import re
c496ca96 28import socket
79a2e94e 29import ssl
1c088fa8 30import subprocess
d77c3dfd 31import sys
181c8655 32import tempfile
01951dda 33import traceback
bcf89ce6 34import xml.etree.ElementTree
d77c3dfd 35import zlib
d77c3dfd 36
8c25f81b 37from .compat import (
b4a3d461 38 compat_HTMLParseError,
8bb56eee 39 compat_HTMLParser,
8f9312c3 40 compat_basestring,
8c25f81b 41 compat_chr,
36e6f62c 42 compat_etree_fromstring,
51098426 43 compat_expanduser,
8c25f81b 44 compat_html_entities,
55b2f099 45 compat_html_entities_html5,
be4a824d 46 compat_http_client,
c86b6142 47 compat_kwargs,
efa97bdc 48 compat_os_name,
8c25f81b 49 compat_parse_qs,
702ccf2d 50 compat_shlex_quote,
be4a824d 51 compat_socket_create_connection,
8c25f81b 52 compat_str,
edaa23f8 53 compat_struct_pack,
d3f8e038 54 compat_struct_unpack,
8c25f81b
PH
55 compat_urllib_error,
56 compat_urllib_parse,
15707c7e 57 compat_urllib_parse_urlencode,
8c25f81b 58 compat_urllib_parse_urlparse,
7581bfc9 59 compat_urllib_parse_unquote_plus,
8c25f81b
PH
60 compat_urllib_request,
61 compat_urlparse,
810c10ba 62 compat_xpath,
8c25f81b 63)
4644ac55 64
71aff188
YCH
65from .socks import (
66 ProxyType,
67 sockssocket,
68)
69
4644ac55 70
51fb4995
YCH
def register_socks_protocols():
    """Teach urlparse to treat the SOCKS schemes as netloc-style URLs.

    In Python < 2.6.5 urlsplit() suffers from
    https://bugs.python.org/issue7904: URLs whose scheme is not listed in
    urlparse.uses_netloc are not handled correctly, so the SOCKS schemes
    are appended to that registry here.
    """
    registry = compat_urlparse.uses_netloc
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in registry:
            registry.append(scheme)
78
79
468e2e92
FV
80# This is not clearly defined otherwise
81compiled_regex_type = type(re.compile(''))
82
3e669f36 83std_headers = {
15d10678 84 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
59ae15a5
PH
85 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
86 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
87 'Accept-Encoding': 'gzip, deflate',
88 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 89}
f427df17 90
5f6a1245 91
fb37eb25
S
92USER_AGENTS = {
93 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
94}
95
96
bf42a990
S
97NO_DEFAULT = object()
98
7105440c
YCH
99ENGLISH_MONTH_NAMES = [
100 'January', 'February', 'March', 'April', 'May', 'June',
101 'July', 'August', 'September', 'October', 'November', 'December']
102
f6717dec
S
103MONTH_NAMES = {
104 'en': ENGLISH_MONTH_NAMES,
105 'fr': [
3e4185c3
S
106 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
107 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
f6717dec 108}
a942d6cb 109
a7aaa398
S
110KNOWN_EXTENSIONS = (
111 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
112 'flv', 'f4v', 'f4a', 'f4b',
113 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
114 'mkv', 'mka', 'mk3d',
115 'avi', 'divx',
116 'mov',
117 'asf', 'wmv', 'wma',
118 '3gp', '3g2',
119 'mp3',
120 'flac',
121 'ape',
122 'wav',
123 'f4f', 'f4m', 'm3u8', 'smil')
124
c587cbb7 125# needed for sanitizing filenames in restricted mode
c8827027 126ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
127 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
128 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
c587cbb7 129
46f59e89
S
130DATE_FORMATS = (
131 '%d %B %Y',
132 '%d %b %Y',
133 '%B %d %Y',
cb655f34
S
134 '%B %dst %Y',
135 '%B %dnd %Y',
136 '%B %dth %Y',
46f59e89 137 '%b %d %Y',
cb655f34
S
138 '%b %dst %Y',
139 '%b %dnd %Y',
140 '%b %dth %Y',
46f59e89
S
141 '%b %dst %Y %I:%M',
142 '%b %dnd %Y %I:%M',
143 '%b %dth %Y %I:%M',
144 '%Y %m %d',
145 '%Y-%m-%d',
146 '%Y/%m/%d',
81c13222 147 '%Y/%m/%d %H:%M',
46f59e89 148 '%Y/%m/%d %H:%M:%S',
0c1c6f4b 149 '%Y-%m-%d %H:%M',
46f59e89
S
150 '%Y-%m-%d %H:%M:%S',
151 '%Y-%m-%d %H:%M:%S.%f',
152 '%d.%m.%Y %H:%M',
153 '%d.%m.%Y %H.%M',
154 '%Y-%m-%dT%H:%M:%SZ',
155 '%Y-%m-%dT%H:%M:%S.%fZ',
156 '%Y-%m-%dT%H:%M:%S.%f0Z',
157 '%Y-%m-%dT%H:%M:%S',
158 '%Y-%m-%dT%H:%M:%S.%f',
159 '%Y-%m-%dT%H:%M',
c6eed6b8
S
160 '%b %d %Y at %H:%M',
161 '%b %d %Y at %H:%M:%S',
46f59e89
S
162)
163
164DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
165DATE_FORMATS_DAY_FIRST.extend([
166 '%d-%m-%Y',
167 '%d.%m.%Y',
168 '%d.%m.%y',
169 '%d/%m/%Y',
170 '%d/%m/%y',
171 '%d/%m/%Y %H:%M:%S',
172])
173
174DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
175DATE_FORMATS_MONTH_FIRST.extend([
176 '%m-%d-%Y',
177 '%m.%d.%Y',
178 '%m/%d/%Y',
179 '%m/%d/%y',
180 '%m/%d/%Y %H:%M:%S',
181])
182
06b3fe29
S
183PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
184
7105440c 185
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding(), falling back to UTF-8 when the reported
    codec is unusable.
    """
    try:
        encoding = locale.getpreferredencoding()
        'TEST'.encode(encoding)  # probe that the codec actually exists
    except Exception:
        return 'UTF-8'
    return encoding
d77c3dfd 199
f4bfd65f 200
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    # The temporary file is created next to the target so that the final
    # os.rename() stays within one filesystem (atomic on POSIX).
    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Best effort: do not leave the temporary file behind on failure,
        # then re-raise the original error to the caller.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
253
254
# ElementTree on Python >= 2.7 supports attribute predicates in find();
# on older versions the search is done by scanning candidates manually.
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        # key is interpolated into the XPath expression, so restrict it to
        # plain attribute-name characters.
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        # Manual scan: return the first matching element whose attribute
        # `key` exists (and equals `val` when val is given).
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None
269
d7e66d39
JMF
# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of an XPath into '{uri}tag' form.

    Each '/'-separated component may carry a namespace prefix that is looked
    up in ns_map; components without a prefix are kept untouched.
    """
    def expand(component):
        parts = component.split(':')
        if len(parts) == 1:
            return parts[0]
        prefix, tag = parts
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(c) for c in path.split('/'))
284
d77c3dfd 285
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Find the first element matching xpath below node.

    xpath may be a single expression or an iterable of expressions tried in
    order.  When nothing matches: return default if one was given, raise
    ExtractorError if fatal is set, otherwise return None.
    """
    def lookup(expr):
        return node.find(compat_xpath(expr))

    if isinstance(xpath, (str, compat_str)):
        found = lookup(xpath)
    else:
        for candidate in xpath:
            found = lookup(candidate)
            if found is not None:
                break

    if found is not None:
        return found
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError(
            'Could not find XML element %s' % (xpath if name is None else name))
    return None


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Like xpath_element(), but return the matched element's text content."""
    elem = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if elem is None or elem == default:
        return elem
    if elem.text is not None:
        return elem.text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError(
            'Could not find XML element\'s text %s' % (xpath if name is None else name))
    return None


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute `key` of the element located via find_xpath_attr()."""
    elem = find_xpath_attr(node, xpath, key)
    if elem is not None:
        return elem.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        raise ExtractorError('Could not find XML attribute %s' % (
            '%s[@%s]' % (xpath, key) if name is None else name))
    return None
bf0ff932
PH
335
336
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    """Return the content of the first tag carrying the given class, or None."""
    matches = get_elements_by_class(class_name, html)
    return matches[0] if matches else None


def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the first tag with the given attribute/value pair, or None."""
    matches = get_elements_by_attribute(attribute, value, html, escape_value)
    return matches[0] if matches else None


def get_elements_by_class(class_name, html):
    """Return the content of all tags carrying the given class, as a list."""
    # The class attribute is a whitespace-separated list, so match the name
    # as a whole word anywhere inside the attribute value.
    return get_elements_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)
358
359
def get_elements_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # `value` may already be a regex fragment (see get_elements_by_class),
    # in which case the caller passes escape_value=False.
    value = re.escape(value) if escape_value else value

    retlist = []
    # Verbose regex: opening tag with any attributes, the wanted
    # attribute=value pair (quotes optional), the content, then the
    # matching closing tag (backreference \1).
    for m in re.finditer(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html):
        res = m.group('content')

        # Strip one level of surrounding quotes if present.
        if res.startswith('"') or res.startswith("'"):
            res = res[1:-1]

        retlist.append(unescapeHTML(res))

    return retlist
a921f407 383
c5229f39 384
8bb56eee
BF
class HTMLAttributeParser(compat_HTMLParser):
    """Minimal HTML parser that records the attributes of a single element."""
    def __init__(self):
        # Attributes of the most recently seen start tag.
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    decode and return a dictionary of attributes:
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attr_parser = HTMLAttributeParser()
    try:
        attr_parser.feed(html_element)
        attr_parser.close()
    except compat_HTMLParseError:
        # Older Python may throw HTMLParseError in case of malformed HTML;
        # return whatever was gathered before the failure.
        pass
    return attr_parser.attrs
9e6dd238 419
c5229f39 420
def clean_html(html):
    """Reduce an HTML snippet to a readable plain-text string."""
    if html is None:  # convenience for optional fields such as descriptions
        return None

    # Literal newlines carry no meaning in HTML; <br> and </p><p> do.
    text = html.replace('\n', ' ')
    text = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Drop every remaining tag, then decode entities.
    text = re.sub('<.*?>', '', text)
    return unescapeHTML(text).strip()
9e6dd238
FV
436
437
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            # '-' means stdout; on Windows switch stdout to binary mode so
            # media bytes are not mangled by CRLF translation.
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # Permission problems won't be fixed by renaming - re-raise as-is.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)
d77c3dfd
FV
468
469
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # Unparseable input yields None rather than raising.
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 477
5f6a1245 478
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be
    kept if possible.
    """
    def sanitize_char(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and code > 127:
            return '_'
        return char

    # Keep timestamps like 12:34:56 readable by mapping ':' to '_' up front.
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    sanitized = ''.join(sanitize_char(c) for c in s)
    if not is_id:
        while '__' in sanitized:
            sanitized = sanitized.replace('__', '_')
        sanitized = sanitized.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and sanitized.startswith('-_'):
            sanitized = sanitized[2:]
        if sanitized.startswith('-'):
            sanitized = '_' + sanitized[len('-'):]
        sanitized = sanitized.lstrip('.')
        if not sanitized:
            sanitized = '_'
    return sanitized
d77c3dfd 518
5f6a1245 519
a2aaf4db
S
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    # Only Windows forbids characters like <>:"|?* in path components.
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        # splitdrive() only learned UNC paths in 2.7; fall back to splitunc.
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the empty leading component left by the path separator.
        norm_path.pop(0)
    sanitized_path = [
        # Keep '.'/'..' intact; elsewhere replace forbidden characters and
        # trailing whitespace/dots (invalid at the end of a component) with '#'.
        path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
536
537
67dda517
S
# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitize_url(url):
    """Give scheme-relative URLs ('//host/...') an explicit http: scheme."""
    if url.startswith('//'):
        return 'http:%s' % url
    return url


def sanitized_Request(url, *args, **kwargs):
    """compat_urllib_request.Request that first runs the URL through sanitize_url()."""
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)


def expand_path(s):
    """Expand shell variables and ~"""
    return os.path.expandvars(compat_expanduser(s))
551
552
def orderedSet(iterable):
    """Return the unique items of iterable as a list, keeping first-seen order."""
    unique = []
    for item in iterable:
        # Membership is tested on the output list itself so that items only
        # need to support '==' (no hashability requirement).
        if item not in unique:
            unique.append(item)
    return unique
d77c3dfd 560
912b38b4 561
def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    # Numeric character reference: '#160' (decimal) or '#xA0' (hex).
    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            # Prefix with '0' so int() sees the standard '0x...' form.
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        # (out-of-range code points make compat_chr raise ValueError)
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
4e408e47
PH
591
592
def unescapeHTML(s):
    """Decode HTML entities (named and numeric) in s; None passes through."""
    if s is None:
        return None
    assert type(s) == compat_str

    def _replace(match):
        # The entity text including its trailing ';' is handed over.
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^&;]+;)', _replace, s)
d77c3dfd 600
8bf48f23 601
aa49acd1
S
def get_subprocess_encoding():
    """Return the encoding used to exchange data with subprocesses."""
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        return preferredencoding()
    encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
612
613
def encodeFilename(s, for_subprocess=False):
    """Encode a filename for the OS, byte-encoding only where required.

    @param s The name of the file
    """
    assert type(s) == compat_str

    if sys.version_info >= (3, 0):
        # Python 3 has a Unicode file API everywhere.
        return s
    if sys.platform == 'win32' and not for_subprocess and sys.getwindowsversion()[0] >= 5:
        # Pass the unicode string straight through so the Unicode APIs are
        # used on Windows 2000 and up.  (Detecting NT 4 is tricky because
        # 'major >= 4' would also match the 9x series; NT 4 is obsolete.)
        return s
    if sys.platform.startswith('java'):
        # Jython takes Unicode filenames although it reports as Python 2.x.
        return s
    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename(): decode byte filenames on Python 2."""
    if sys.version_info >= (3, 0):
        return b
    if not isinstance(b, bytes):
        return b
    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    """Encode a subprocess argument (like a filename destined for a subprocess)."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    """Decode a subprocess argument; counterpart of encodeArgument()."""
    return decodeFilename(b, True)


def decodeOption(optval):
    """Decode a command-line option value to a unicode string."""
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval
1c256f70 670
5f6a1245 671
4539dd30
PH
def formatSeconds(secs):
    """Render a second count as H:MM:SS, M:SS, or plain seconds."""
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    if secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    return '%d' % secs
679
a0ddb8a2 680
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler with the best TLS context this Python
    version supports, honouring the 'nocheckcertificate' option."""
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # Disable both hostname and certificate verification.
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        # No usable SSLContext support; rely on the handler's defaults.
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 704
732ea2f0 705
08f2a92c
JMF
def bug_reports_message():
    """Return the boilerplate appended to unexpected-error messages."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return (
        '; please report this issue on https://yt-dl.org/bug .'
        ' Make sure you are using the latest version; %s.'
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
        % update_cmd)
715
716
bf5b9d85
PM
class YoutubeDLError(Exception):
    """Root of the youtube-dl exception hierarchy; all project errors derive from it."""
    pass
720
721
class ExtractorError(YoutubeDLError):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # Network errors and unavailable videos are always "expected":
        # they are environmental, not youtube-dl bugs.
        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            # Unexpected errors get the "please report" boilerplate appended.
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        # Render the stored traceback as a printable string, or None.
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
01951dda 749
1c256f70 750
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Raised when no extractor is able to handle the given URL."""
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Raised when a regular expression fails to match."""
    pass


class GeoRestrictedError(ExtractorError):
    """Raised when a video is blocked in the caller's geographic location.

    The website imposes the restriction; `countries`, when given, lists the
    places from which the video is reachable.
    """
    def __init__(self, msg, countries=None):
        super(GeoRestrictedError, self).__init__(msg, expected=True)
        self.msg = msg
        self.countries = countries


class DownloadError(YoutubeDLError):
    """Raised by FileDownloader objects that are not configured to continue
    on errors; carries the appropriate error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(YoutubeDLError):
    """Raised by FileDownloader objects when several downloads would end up
    in the same file on disk.
    """
    pass


class PostProcessingError(YoutubeDLError):
    """Raised from a PostProcessor's .run() method to signal a failure in
    the postprocessing task.
    """

    def __init__(self, msg):
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg


class MaxDownloadsReached(YoutubeDLError):
    """Raised once the --max-downloads limit has been hit."""
    pass


class UnavailableVideoError(YoutubeDLError):
    """Raised when a video is requested in a format that the site does not
    offer for it.
    """
    pass


class ContentTooShortError(YoutubeDLError):
    """Raised by FileDownloader objects when the downloaded file is smaller
    than the server announced, which usually means the connection dropped.
    """

    def __init__(self, downloaded, expected):
        super(ContentTooShortError, self).__init__(
            'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected)
        )
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 839
5f6a1245 840
class XAttrMetadataError(YoutubeDLError):
    """Raised when writing xattr metadata fails; classifies the cause in .reason.

    .reason is one of 'NO_SPACE', 'VALUE_TOO_LONG' or 'NOT_SUPPORTED', derived
    from the errno `code` and/or the OS error message text `msg`.
    """
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        # BUGFIX: the message probe previously read 'Disk quota excedded'
        # (misspelled), so it could never match the real strerror text
        # "Disk quota exceeded" and quota failures fell through to
        # NOT_SUPPORTED.
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(YoutubeDLError):
    """Raised when xattr support is unavailable on this system or filesystem."""
    pass
859
860
def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    """Instantiate an HTTP(S) connection, honouring the 'source_address'
    option from the handler's params."""
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            # No source_address attribute: replace the connection's
            # connect() with one that binds the socket manually.
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc
886
887
def handle_youtubedl_headers(headers):
    """Strip internal Youtubedl-* pseudo-headers, applying their effect.

    'Youtubedl-no-compression' is not a real HTTP header: its presence means
    any Accept-Encoding header must be dropped before the request is sent.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers
    filtered_headers = dict(
        (name, value) for name, value in headers.items()
        if name.lower() != 'accept-encoding')
    del filtered_headers['Youtubedl-no-compression']
    return filtered_headers
87f0e62d
YCH
896
897
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        # Honor a per-request SOCKS proxy smuggled in via a pseudo-header,
        # which must not leak into the actual HTTP request.
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        # Try raw deflate first (negative wbits = no zlib header); some
        # servers send raw streams, others a proper zlib-wrapped one.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry while trimming up to 1023 trailing bytes.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response
bf50b038 1019
5de90176 1020
71aff188
YCH
def make_socks_conn_class(base_class, socks_proxy):
    """Return a subclass of *base_class* whose connect() tunnels through the
    SOCKS proxy described by the *socks_proxy* URL (socks/socks4/socks4a/socks5).
    """
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A
    # NOTE(review): no final else — an unrecognized scheme leaves socks_type
    # unbound and raises UnboundLocalError below; presumably callers only pass
    # pre-validated schemes, but confirm.

    def unquote_if_non_empty(s):
        # Credentials may be percent-encoded inside the proxy URL.
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            # For HTTPS, wrap the proxied socket in TLS with SNI when available.
            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection
1062
1063
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPSHandler supporting a custom connection class and SOCKS proxies."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        # Forward SSL context / hostname checking when this Python supports it.
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        # Per-request SOCKS proxy smuggled in via a pseudo-header; must not
        # leak into the actual HTTP request.
        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)
be4a824d
PH
1087
1088
a6420bf5
S
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """HTTPCookieProcessor that also applies cookie handling to HTTPS."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
1111
1112
46f59e89
S
def extract_timezone(date_str):
    """Split a trailing timezone designator off *date_str*.

    Returns (utc_offset, remainder): utc_offset is a datetime.timedelta
    (zero when no suffix or a bare 'Z' is found) and remainder is the
    string with any matched suffix removed.
    """
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if m is None:
        return datetime.timedelta(), date_str
    date_str = date_str[:-len(m.group('tz'))]
    sign = m.group('sign')
    if not sign:
        # Matched a literal 'Z' (UTC) — no numeric offset.
        return datetime.timedelta(), date_str
    direction = -1 if sign == '-' else 1
    offset = datetime.timedelta(
        hours=direction * int(m.group('hours')),
        minutes=direction * int(m.group('minutes')))
    return offset, date_str
1129
1130
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    # Drop fractional seconds; strptime has no slot for them in this format.
    cleaned = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, cleaned = extract_timezone(cleaned)

    fmt = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    try:
        moment = datetime.datetime.strptime(cleaned, fmt) - timezone
    except ValueError:
        return None
    return calendar.timegm(moment.timetuple())
912b38b4
PH
1148
1149
46f59e89
S
def date_formats(day_first=True):
    """Return strptime patterns, preferring day-first or month-first order."""
    if day_first:
        return DATE_FORMATS_DAY_FIRST
    return DATE_FORMATS_MONTH_FIRST
1152
1153
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None

    upload_date = None
    # Replace commas
    normalized = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    normalized = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', normalized)
    _, normalized = extract_timezone(normalized)

    # Try every known format; a later successful match wins.
    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(normalized, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Fall back to RFC 2822-style parsing.
        timetuple = email.utils.parsedate_tz(normalized)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
bf50b038 1180
5f6a1245 1181
46f59e89
S
def unified_timestamp(date_str, day_first=True):
    """Return a UNIX timestamp parsed from a free-form date string.

    day_first selects day-first (European) over month-first ordering for
    ambiguous numeric dates. Returns None when nothing matches.
    """
    if date_str is None:
        return None

    date_str = re.sub(r'[,|]', '', date_str)

    # NOTE(review): '12:xx PM' also receives the +12h shift below (the hour is
    # parsed as 12 by %H), which appears to roll into the next day — confirm
    # this is intended.
    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    # Remove unrecognized timezones from ISO 8601 alike timestamps
    m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
    if m:
        date_str = date_str[:-len(m.group('tz'))]

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    # Fall back to RFC 2822-style parsing.
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600
46f59e89
S
1208
1209
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from *url*; fall back to *default_ext*."""
    if url is None:
        return default_ext
    candidate = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
73e79f2a 1221
5f6a1245 1222
def subtitles_filename(filename, sub_lang, sub_format):
    """Build a subtitle filename: <base>.<lang>.<format>."""
    base = filename.rsplit('.', 1)[0]
    return '.'.join((base, sub_lang, sub_format))
d4051a8e 1225
5f6a1245 1226
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is None:
        # Plain YYYYMMDD form.
        return datetime.datetime.strptime(date_str, '%Y%m%d').date()
    amount = int(match.group('time'))
    if match.group('sign') == '-':
        amount = -amount
    unit = match.group('unit')
    # Months/years are approximated as 30/365 days.
    if unit == 'month':
        unit, amount = 'day', amount * 30
    elif unit == 'year':
        unit, amount = 'day', amount * 365
    return today + datetime.timedelta(**{unit + 's': amount})
5f6a1245
JW
1254
1255
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is None:
        # Not an 8-digit date; hand it back untouched.
        return date_str
    return '-'.join(match.groups())
1264
5f6a1245 1265
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        self.start = date_from_str(start) if start is not None else datetime.datetime.min.date()
        self.end = date_from_str(end) if end is not None else datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
1295
1296
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Python 2 may hand back bytes; normalize to text first.
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
1305
1306
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # Map Python fileno -> Windows standard-handle id (stdout/stderr).
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A redirected stream (file/pipe) must not use WriteConsoleW.
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character outside the Basic Multilingual Plane.
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write BMP runs in chunks of up to 1024; non-BMP characters are
        # written one at a time as a surrogate pair (2 UTF-16 code units).
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1380
1381
def write_string(s, out=None, encoding=None):
    """Write text *s* to *out* (stderr by default), coping with Windows
    consoles and byte-oriented streams."""
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    # On Windows, try the console-specific path first.
    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3:
        # Python 2 lies about mode of sys.stderr
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1402
1403
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a bytes/str value into a list of integer byte values."""
    if not bs:
        return []
    # Python 3 indexing of bytes yields ints; Python 2 yields 1-char strings.
    if isinstance(bs[0], int):
        return list(bs)
    return [ord(ch) for ch in bs]
1411
c257baff 1412
def intlist_to_bytes(xs):
    """Convert a sequence of integer byte values (0-255) to a bytes object.

    bytes(bytearray(...)) works identically on Python 2 and 3 and avoids
    building a struct format string proportional to len(xs) on every call.
    Values outside 0-255 raise ValueError.
    """
    if not xs:
        return b''
    return bytes(bytearray(xs))
c38b1e77
PH
1417
1418
c1c9a79c
PH
# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Lock the maximum possible byte range of the file.
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Keep the OVERLAPPED struct alive on the file object for unlock.
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # 0x2 == LOCKFILE_EXCLUSIVE_LOCK
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, is missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
1492
1493
class locked_file(object):
    """File wrapper holding an advisory lock for the life of a with-block.

    'r' mode takes a shared lock; 'a'/'w' take an exclusive one.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        want_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, want_exclusive)
        except IOError:
            # Don't leak the descriptor when locking fails.
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1523
1524
4644ac55
S
def get_filesystem_encoding():
    """Name of the filesystem encoding, defaulting to UTF-8 when unknown."""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1528
1529
def shell_quote(args):
    """Quote each argument and join them into one shell-safe command line."""
    encoding = get_filesystem_encoding()
    quoted = []
    for arg in args:
        # We may get a filename encoded with 'encodeFilename'
        if isinstance(arg, bytes):
            arg = arg.decode(encoding)
        quoted.append(compat_shlex_quote(arg))
    return ' '.join(quoted)
9d4660ca
PH
1539
1540
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # Merge with any data already smuggled into the URL (existing wins).
    url, existing = unsmuggle_url(url, {})
    data.update(existing)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata
9d4660ca
PH
1549
1550
def unsmuggle_url(smug_url, default=None):
    """Reverse of smuggle_url(): returns (url, data) or (url, default)."""
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
02dbf93f
PH
1558
1559
02dbf93f
PH
1560def format_bytes(bytes):
1561 if bytes is None:
28e614de 1562 return 'N/A'
02dbf93f
PH
1563 if type(bytes) is str:
1564 bytes = float(bytes)
1565 if bytes == 0.0:
1566 exponent = 0
1567 else:
1568 exponent = int(math.log(bytes, 1024.0))
28e614de 1569 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
02dbf93f 1570 converted = float(bytes) / float(1024 ** exponent)
28e614de 1571 return '%.2f%s' % (converted, suffix)
f53c966a 1572
1c088fa8 1573
fb47597b
S
def lookup_unit_table(unit_table, s):
    """Parse '<number> <unit>' from *s* using *unit_table*; None on no match."""
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if m is None:
        return None
    # Accept ',' as a decimal separator too (e.g. '1,5 MiB').
    number = float(m.group('num').replace(',', '.'))
    return int(number * unit_table[m.group('unit')])
1583
1584
be64b5b0
PH
def parse_filesize(s):
    """Parse strings like '5.6 MiB' or '120 kB' into a byte count (int)."""
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
1654
1655
def parse_count(s):
    """Parse view-count-like strings ('1.2M', '567', '3k') into an int."""
    if s is None:
        return None

    s = s.strip()

    # Purely numeric (possibly with separators) — no unit suffix.
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    multipliers = {}
    for suffix in ('k', 'K'):
        multipliers[suffix] = 1000
    for suffix in ('m', 'M', 'kk', 'KK'):
        multipliers[suffix] = 1000 ** 2

    return lookup_unit_table(multipliers, s)
be64b5b0 1675
2f7ae819 1676
def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """
    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
    if name not in month_names:
        return None
    return month_names.index(name) + 1
1686
1687
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """
    abbrevs = [s[:3] for s in ENGLISH_MONTH_NAMES]
    try:
        return abbrevs.index(abbrev) + 1
    except ValueError:
        return None
18258362
JMF
1696
1697
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    # Match bare ampersands only — ones not already starting a known entity
    # or a numeric/hex character reference.
    bare_amp = re.compile(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)')
    return bare_amp.sub('&amp;', xml_str)
e3946f98
PH
1704
1705
def setproctitle(title):
    """Best-effort: set the process title (as shown by ps) via libc prctl."""
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        # Not a glibc system; silently skip.
        return
    except TypeError:
        # LoadLibrary in Windows Python 2.7.13 only expects
        # a bytestring, but since unicode_literals turns
        # every string into a unicode string, it fails.
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        # 15 is PR_SET_NAME on Linux.
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1730
1731
def remove_start(s, start):
    """Strip *start* from the beginning of *s* when present (None-safe)."""
    if s is None or not s.startswith(start):
        return s
    return s[len(start):]
29eb5174
PH
1734
1735
def remove_end(s, end):
    """Strip *end* from the end of *s* when present (None-safe)."""
    if s is None or not s.endswith(end):
        return s
    return s[:-len(end)]
2b9faf55
PH
1738
1739
31b2051e
S
def remove_quotes(s):
    """Strip one matching pair of surrounding single or double quotes."""
    if s is None or len(s) < 2:
        return s
    if s[0] == s[-1] and s[0] in ('"', "'"):
        return s[1:-1]
    return s
1747
1748
def url_basename(url):
    """Last path component of a URL ('http://a/b/c?x=1' -> 'c')."""
    path = compat_urlparse.urlparse(url).path
    components = path.strip('/').split('/')
    return components[-1]
aa94a6d3
PH
1752
1753
02dc0a36
S
def base_url(url):
    """Everything up to and including the last '/' before any query/fragment."""
    m = re.match(r'https?://[^?#&]+/', url)
    return m.group()
1756
1757
def urljoin(base, path):
    """Join *base* and *path* like a browser would; None for unusable input."""
    if isinstance(path, bytes):
        path = path.decode('utf-8')
    if not isinstance(path, compat_str) or not path:
        return None
    # Absolute (or protocol-relative) paths are returned unchanged.
    if re.match(r'^(?:https?:)?//', path):
        return path
    if isinstance(base, bytes):
        base = base.decode('utf-8')
    # The base must itself be an http(s) or protocol-relative URL.
    if not isinstance(base, compat_str) or not re.match(
            r'^(?:https?:)?//', base):
        return None
    return compat_urlparse.urljoin(base, path)
1771
1772
aa94a6d3
PH
class HEADRequest(compat_urllib_request.Request):
    """A Request that issues HTTP HEAD instead of the default GET."""
    def get_method(self):
        return 'HEAD'
7217e148
PH
1776
1777
95cf60e8
S
class PUTRequest(compat_urllib_request.Request):
    """A Request that issues HTTP PUT instead of the default GET."""
    def get_method(self):
        return 'PUT'
1781
1782
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Coerce *v* to int (with optional scaling); *default* on failure.

    When get_attr is given, v is first replaced by getattr(v, get_attr, None).
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except ValueError:
        return default
9732d77e 1795
9572013d 1796
40a90862
JMF
def str_or_none(v, default=None):
    """compat_str(v), or *default* when v is None."""
    if v is None:
        return default
    return compat_str(v)
1799
9732d77e
PH
1800
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Strip thousands separators and stray '+'/'.' characters.
    cleaned = re.sub(r'[,\.\+]', '', int_str)
    return int(cleaned)
608d11f5
PH
1807
1808
def float_or_none(v, scale=1, invscale=1, default=None):
    """Coerce *v* to float with optional scaling; *default* on None/ValueError."""
    if v is None:
        return default
    try:
        result = float(v)
    except ValueError:
        return default
    return result * invscale / scale
43f775e4
PH
1816
1817
c7e327c4
S
def bool_or_none(v, default=None):
    """Pass through genuine booleans; anything else becomes *default*."""
    if isinstance(v, bool):
        return v
    return default
1820
1821
b72b4431
S
def strip_or_none(v):
    """v.strip(), or None when v is None."""
    if v is None:
        return None
    return v.strip()
1824
1825
def parse_duration(s):
    """Parse a duration string ('1:23:45', '3 min 10s', 'PT1H2M3S', ...)
    into seconds (float), or None when unparsable."""
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    # [[[DD:]HH:]MM:]SS[.ms] colon-separated form
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?Z?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        # Unit-suffixed form, optionally with an ISO 8601 '(P)T' prefix
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?Z?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            # Fractional hours/minutes, e.g. '2.5 hours'
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms)
    return duration
91d7d0b3
JMF
1872
1873
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext just before filename's real extension.

    'video.mp4' + 'temp' -> 'video.temp.mp4'.  When expected_real_ext is
    given and the actual extension differs, ext is appended to the whole
    filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
d70ad093
PH
1880
1881
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Swap filename's extension for ext.

    When expected_real_ext is given and the current extension differs,
    ext is appended to the full filename instead of replacing it.
    """
    name, real_ext = os.path.splitext(filename)
    base = filename if expected_real_ext and real_ext[1:] != expected_real_ext else name
    return '{0}.{1}'.format(base, ext)
1887
1888
d70ad093
PH
def check_executable(exe, args=[]):
    """Return exe when the binary can be launched from PATH, else False.

    args may hold flags that produce short output (like '-version').
    """
    try:
        proc = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1897
1898
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
        # SIGTTOU if youtube-dl is run in the background.
        # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
        out, _ = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    # Delegate the actual pattern matching so it can be unit-tested
    return detect_exe_version(out, version_re, unrecognized)
1916
1917
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version string from an executable's output.

    Falls back to the generic pattern 'version X.Y...' and returns
    `unrecognized` when nothing matches.
    """
    assert isinstance(output, compat_str)
    if version_re is None:
        version_re = r'version\s+([-0-9._a-zA-Z]+)'
    m = re.search(version_re, output)
    return m.group(1) if m else unrecognized
1927
1928
class PagedList(object):
    # Abstract base for paginated result lists; subclasses provide
    # getslice(start, end).
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())
1933
9c44d242
PH
1934
class OnDemandPagedList(PagedList):
    # Paged list that fetches pages lazily via pagefunc(pagenum), with an
    # optional per-page cache of already-fetched results.
    def __init__(self, pagefunc, pagesize, use_cache=True):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        # Collect results for the half-open range [start, end) by walking
        # whole pages and trimming the first/last page as needed.
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
                if self._use_cache:
                    self._cache[pagenum] = page_results

            # Offset of `start` within this page (0 for later pages)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Exclusive cut-off within this page when `end` falls inside it
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1985
1986
9c44d242
PH
class InAdvancePagedList(PagedList):
    # Paged list whose total page count is known up front.
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        # Items to drop from the first fetched page
        skip_elems = start - start_page * self._pagesize
        # Remaining item budget when an explicit `end` was requested
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
2014
2015
def uppercase_escape(s):
    """Decode literal '\\UXXXXXXXX' escape sequences in s to characters."""
    decode = codecs.getdecoder('unicode_escape')

    def repl(m):
        return decode(m.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', repl, s)
0fe2ff78
YCH
2022
2023
def lowercase_escape(s):
    """Decode literal '\\uXXXX' escape sequences in s to characters."""
    decode = codecs.getdecoder('unicode_escape')

    def repl(m):
        return decode(m.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', repl, s)
b53466e1 2030
d05cfe06
S
2031
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2, quote() needs bytes; encode text input first
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # The safe-set keeps all RFC 3986 reserved/sub-delim characters intact
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
2037
2038
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    # The host is IDNA-encoded; every other component is percent-escaped
    return url_parsed._replace(
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()
2049
62e609ab
PH
2050
def read_batch_urls(batch_fd):
    # Read one URL per line from batch_fd; decodes bytes, drops an
    # accidental UTF-8 BOM, and skips comment lines.  Closes batch_fd.
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        # BOM bytes mis-decoded as latin-1 show up as this prefix
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        # Lines starting with '#', ';' or ']' are treated as comments
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2065
2066
def urlencode_postdata(*args, **kargs):
    # urlencode and ASCII-encode form data into bytes suitable for a
    # POST request body.
    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
2069
2070
def update_url_query(url, query):
    # Merge `query` (a dict) into url's existing query string, replacing
    # parameters of the same name.
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    qs.update(query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))
16392824 2079
8e60dc75 2080
ed0291d1
S
def update_Request(req, url=None, data=None, headers={}, query={}):
    # Clone a urllib Request, optionally overriding URL, body, headers
    # and query parameters, while preserving the HTTP method and timeout.
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    # Rebuild with the matching Request subclass so the method survives
    req_get_method = req.get_method()
    if req_get_method == 'HEAD':
        req_type = HEADRequest
    elif req_get_method == 'PUT':
        req_type = PUTRequest
    else:
        req_type = compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req
2099
2100
def _multipart_encode_impl(data, boundary):
    # Build a multipart/form-data body from `data` using the given
    # boundary; raises ValueError if the boundary occurs in the payload.
    content_type = 'multipart/form-data; boundary=%s' % boundary

    out = b''
    for k, v in data.items():
        out += b'--' + boundary.encode('ascii') + b'\r\n'
        if isinstance(k, compat_str):
            k = k.encode('utf-8')
        if isinstance(v, compat_str):
            v = v.encode('utf-8')
        # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
        # suggests sending UTF-8 directly. Firefox sends UTF-8, too
        content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
        if boundary.encode('ascii') in content:
            raise ValueError('Boundary overlaps with data')
        out += content

    # Closing boundary marker
    out += b'--' + boundary.encode('ascii') + b'--\r\n'

    return out, content_type
2121
2122
def multipart_encode(data, boundary=None):
    '''
    Encode a dict to RFC 7578-compliant form-data

    data:
        A dict where keys and values can be either Unicode or bytes-like
        objects.
    boundary:
        If specified a Unicode object, it's used as the boundary. Otherwise
        a random boundary is generated.

    Reference: https://tools.ietf.org/html/rfc7578
    '''
    has_specified_boundary = boundary is not None

    # Retry with a fresh random boundary until it does not collide with
    # the payload; a user-specified boundary that collides is an error.
    while True:
        if boundary is None:
            boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))

        try:
            out, content_type = _multipart_encode_impl(data, boundary)
            break
        except ValueError:
            if has_specified_boundary:
                raise
            boundary = None

    return out, content_type
2151
2152
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up the first usable value in d for key_or_keys.

    key_or_keys may be a single key or a list/tuple of keys tried in
    order.  None values are always skipped; other falsy values are also
    skipped unless skip_false_values is False.
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None:
            continue
        if skip_false_values and not value:
            continue
        return value
    return default
2161
2162
def try_get(src, getter, expected_type=None):
    """Apply one or more getter callables to src.

    Returns the first result that neither raises a common lookup error
    nor fails the optional expected_type check; otherwise None.
    """
    getters = getter if isinstance(getter, (list, tuple)) else [getter]
    for fn in getters:
        try:
            value = fn(src)
        except (AttributeError, KeyError, TypeError, IndexError):
            continue
        if expected_type is None or isinstance(value, expected_type):
            return value
329ca3be
S
2174
2175
8e60dc75
S
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    # Coerce any object to text: bytes are decoded with `encoding`,
    # text passes through unchanged, other objects go through compat_str.
    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2178
16392824 2179
a1a530b0
PH
# US MPAA rating -> numeric age limit used by parse_age_limit()
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
2187
2188
a8795327
S
# US TV Parental Guidelines rating -> numeric age limit (see parse_age_limit)
TV_PARENTAL_GUIDELINES = {
    'TV-Y': 0,
    'TV-Y7': 7,
    'TV-G': 0,
    'TV-PG': 0,
    'TV-14': 14,
    'TV-MA': 17,
}
2197
2198
def parse_age_limit(s):
    # Normalize an age limit: pass through ints in [0, 21], parse 'NN+'
    # strings, then translate US MPAA / TV ratings; otherwise None.
    # NOTE: type(s) == int (not isinstance) so bool input is rejected here.
    if type(s) == int:
        return s if 0 <= s <= 21 else None
    if not isinstance(s, compat_basestring):
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    if s in US_RATINGS:
        return US_RATINGS[s]
    return TV_PARENTAL_GUIDELINES.get(s)
146c80e2
S
2210
2211
def strip_jsonp(code):
    """Strip a JSONP wrapper, leaving only the callback payload."""
    jsonp_rex = r'''(?sx)^
            (?:window\.)?(?P<func_name>[a-zA-Z0-9_.$]+)
            (?:\s*&&\s*(?P=func_name))?
            \s*\(\s*(?P<callback_data>.*)\);?
            \s*?(?://[^\n]*)*$'''
    return re.sub(jsonp_rex, r'\g<callback_data>', code)
478c2c61
PH
2220
2221
def js_to_json(code):
    """Convert a JavaScript object/value literal into valid JSON text.

    Handles single-quoted strings, comments, trailing commas, unquoted
    keys, and hex/octal integers.
    """
    COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*'
    SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE)
    # (pattern, base) pairs for hex and octal integer literals
    INTEGER_TABLE = (
        (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16),
        (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8),
    )

    def fix_kv(m):
        # Rewrite one matched token (string, comment, comma, bare key,
        # or integer literal) as its JSON equivalent.
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        elif v.startswith('/*') or v.startswith('//') or v == ',':
            return ""

        if v[0] in ("'", '"'):
            # Re-escape the string body for JSON semantics
            v = re.sub(r'(?s)\\.|"', lambda m: {
                '"': '\\"',
                "\\'": "'",
                '\\\n': '',
                '\\x': '\\u00',
            }.get(m.group(0), m.group(0)), v[1:-1])

        for regex, base in INTEGER_TABLE:
            im = re.match(regex, v)
            if im:
                i = int(im.group(1), base)
                # A trailing ':' means the literal was used as an object key
                return '"%d":' % i if v.endswith(':') else '%d' % i

        return '"%s"' % v

    return re.sub(r'''(?sx)
        "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
        {comment}|,(?={skip}[\]}}])|
        [a-zA-Z_][.a-zA-Z_0-9]*|
        \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
        [0-9]+(?={skip}:)
        '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
e05f6939
PH
2261
2262
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
2271
acd69589
PH
2272
# Default output filename template: "<title>-<id>.<ext>"
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 2274
a020a0dc
PH
2275
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    return s if len(s) <= length else s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
2284
2285
def version_tuple(v):
    """Split a dotted/dashed version string into a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
48844745
PH
2288
2289
def is_outdated_version(version, limit, assume_new=True):
    """Return True when version is strictly older than limit.

    Empty or unparsable versions yield `not assume_new`.
    """
    if not version:
        return not assume_new
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
    return outdated
732ea2f0
PH
2297
2298
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Only zip-bundled (__loader__ is a zipimporter) or frozen builds
    # are self-updateable
    return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
7d4111ed
PH
2304
2305
def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
2309
2310
def error_to_compat_str(err):
    """Stringify an exception, decoding the Python 2 byte message with
    the preferred locale encoding instead of ASCII."""
    err_str = str(err)
    if sys.version_info[0] < 3:
        # Python 2: str(err) yields bytes in some locale encoding
        err_str = err_str.decode(preferredencoding())
    return err_str
2318
2319
def mimetype2ext(mt):
    """Map a MIME type to a conventional file extension.

    Returns None for None input; unknown subtypes are returned as-is.
    """
    if mt is None:
        return None

    # Full-type overrides checked before the generic subtype mapping
    FULL_TYPE_MAP = {
        'audio/mp4': 'm4a',
        # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
        # it's the most popular one
        'audio/mpeg': 'mp3',
    }
    ext = FULL_TYPE_MAP.get(mt)
    if ext is not None:
        return ext

    # Normalize the subtype: drop the major type and any ';' parameters
    subtype = mt.rpartition('/')[2]
    subtype = subtype.split(';')[0].strip().lower()

    SUBTYPE_MAP = {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
        'mpegurl': 'm3u8',
        'x-mpegurl': 'm3u8',
        'vnd.apple.mpegurl': 'm3u8',
        'dash+xml': 'mpd',
        'f4m+xml': 'f4m',
        'hds+xml': 'f4m',
        'vnd.ms-sstr+xml': 'ism',
        'quicktime': 'mov',
        'mp2t': 'ts',
    }
    return SUBTYPE_MAP.get(subtype, subtype)
2354
2355
def parse_codecs(codecs_str):
    """Split an RFC 6381 codecs string into 'vcodec'/'acodec' fields.

    http://tools.ietf.org/html/rfc6381
    """
    if not codecs_str:
        return {}
    split_codecs = [
        c.strip() for c in codecs_str.strip().strip(',').split(',') if c.strip()]
    vcodec = acodec = None
    for full_codec in split_codecs:
        codec = full_codec.split('.')[0]
        if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
            if not vcodec:
                vcodec = full_codec
        elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
            if not acodec:
                acodec = full_codec
        else:
            write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
    if vcodec or acodec:
        return {
            'vcodec': vcodec or 'none',
            'acodec': acodec or 'none',
        }
    # Nothing recognized: preserve historical fallbacks by element count
    if len(split_codecs) == 2:
        return {
            'vcodec': vcodec,
            'acodec': acodec,
        }
    elif len(split_codecs) == 1:
        return {
            'vcodec': 'none',
            'acodec': vcodec,
        }
    return {}
2390
2391
def urlhandle_detect_ext(url_handle):
    # Guess a file extension for a response: prefer the filename in the
    # Content-Disposition header, then fall back to the Content-Type.
    getheader = url_handle.headers.get

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))
05900629
PH
2404
2405
1e399778
YCH
def encode_data_uri(data, mime_type):
    """Build a base64 'data:' URI from bytes and a MIME type."""
    b64 = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, b64)
2408
2409
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    if age_limit is None or content_limit is None:
        # no viewer limit set / content available to everyone
        return False
    return age_limit < content_limit
61ca9a80
PH
2418
2419
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    decoded = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            decoded = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if decoded is None:
        decoded = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', decoded)
a055469f
PH
2438
2439
def determine_protocol(info_dict):
    """Infer the download protocol for an info dict.

    Prefers an explicit 'protocol' entry, then well-known URL prefixes,
    then extension-based detection (m3u8/f4m), then the URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2460
2461
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    widths = [max(len(compat_str(cell)) for cell in column) for column in zip(*table)]
    fmt = ' '.join('%-' + compat_str(w + 1) + 's' for w in widths[:-1]) + '%s'
    lines = [fmt % tuple(row) for row in table]
    return '\n'.join(lines)
347de493
PH
2468
2469
def _match_one(filter_part, dct):
    # Evaluate one filter expression (e.g. 'duration > 600', "uploader = 'x'",
    # or the unary forms 'key' / '!key') against the dict dct.
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    # key <op>[?] value, where value is a number (with optional SI/IEC
    # suffix), a quoted string, or a bare string
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<quote>["\'])(?P<quotedstrval>(?:\\.|(?!(?P=quote)|\\).)+?)(?P=quote)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        if (m.group('quotedstrval') is not None or
            m.group('strval') is not None or
                # If the original field is a string and matching comparisonvalue is
                # a number we should respect the origin of the original field
                # and process comparison value as a string (see
                # https://github.com/rg3/youtube-dl/issues/11082).
                actual_value is not None and m.group('intval') is not None and
                isinstance(actual_value, compat_str)):
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('quotedstrval') or m.group('strval') or m.group('intval')
            quote = m.group('quote')
            if quote is not None:
                # Unescape the quote character inside quoted strings
                comparison_value = comparison_value.replace(r'\%s' % quote, quote)
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                # Not a plain integer: try a filesize like '500K' / '1.2MiB'
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        if actual_value is None:
            # '?' after the operator makes a missing field pass the filter
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
2538
2539
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
    # '&'-separated parts are ANDed together; each is handled by _match_one
    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2545
2546
def match_filter_func(filter_str):
    # Build a --match-filter callback: returns None to accept the video,
    # or a human-readable skip message when it fails filter_str.
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
91410c9b
PH
2555
2556
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP/TTML time expression into seconds (float), or None."""
    if not time_expr:
        return

    # Plain offset in seconds, with an optional trailing 's'
    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    # Clock form HH:MM:SS with either '.fraction' or ':frames'
    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        hours, minutes, seconds = mobj.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))
bf6427d2
YCH
2568
2569
c1c924ab
YCH
def srt_subtitles_timecode(seconds):
    """Format a second count as an SRT timecode 'HH:MM:SS,mmm'."""
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
bf6427d2
YCH
2572
2573
def dfxp2srt(dfxp_data):
    '''
    @param dfxp_data A bytes-like object containing DFXP data
    @returns A unicode object containing converted SRT data
    '''
    # Legacy TTAF namespaces are byte-replaced with modern TTML ones before
    # parsing so one set of XPath expressions handles every input.
    LEGACY_NAMESPACES = (
        (b'http://www.w3.org/ns/ttml', [
            b'http://www.w3.org/2004/11/ttaf1',
            b'http://www.w3.org/2006/04/ttaf1',
            b'http://www.w3.org/2006/10/ttaf1',
        ]),
        (b'http://www.w3.org/ns/ttml#styling', [
            b'http://www.w3.org/ns/ttml#style',
        ]),
    )

    SUPPORTED_STYLING = [
        'color',
        'fontFamily',
        'fontSize',
        'fontStyle',
        'fontWeight',
        'textDecoration'
    ]

    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'tts': 'http://www.w3.org/ns/ttml#styling',
    })

    styles = {}
    default_style = {}

    class TTMLPElementParser(object):
        # SAX-style target that converts a <p> subtree into SRT-compatible
        # markup (<b>/<i>/<u>/<font>), tracking open tags and active styles.
        _out = ''
        _unclosed_elements = []
        _applied_styles = []

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), 'br'):
                self._out += '\n'
            else:
                unclosed_elements = []
                style = {}
                element_style_id = attrib.get('style')
                if default_style:
                    style.update(default_style)
                if element_style_id:
                    style.update(styles.get(element_style_id, {}))
                for prop in SUPPORTED_STYLING:
                    prop_val = attrib.get(_x('tts:' + prop))
                    if prop_val:
                        style[prop] = prop_val
                if style:
                    font = ''
                    for k, v in sorted(style.items()):
                        # Skip properties already applied by an ancestor
                        if self._applied_styles and self._applied_styles[-1].get(k) == v:
                            continue
                        if k == 'color':
                            font += ' color="%s"' % v
                        elif k == 'fontSize':
                            font += ' size="%s"' % v
                        elif k == 'fontFamily':
                            font += ' face="%s"' % v
                        elif k == 'fontWeight' and v == 'bold':
                            self._out += '<b>'
                            unclosed_elements.append('b')
                        elif k == 'fontStyle' and v == 'italic':
                            self._out += '<i>'
                            unclosed_elements.append('i')
                        elif k == 'textDecoration' and v == 'underline':
                            self._out += '<u>'
                            unclosed_elements.append('u')
                    if font:
                        self._out += '<font' + font + '>'
                        unclosed_elements.append('font')
                    applied_style = {}
                    if self._applied_styles:
                        applied_style.update(self._applied_styles[-1])
                    applied_style.update(style)
                    self._applied_styles.append(applied_style)
                self._unclosed_elements.append(unclosed_elements)

        def end(self, tag):
            if tag not in (_x('ttml:br'), 'br'):
                unclosed_elements = self._unclosed_elements.pop()
                for element in reversed(unclosed_elements):
                    self._out += '</%s>' % element
                if unclosed_elements and self._applied_styles:
                    self._applied_styles.pop()

        def data(self, data):
            self._out += data

        def close(self):
            return self._out.strip()

    def parse_node(node):
        # Re-serialize the node and run it through TTMLPElementParser
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    for k, v in LEGACY_NAMESPACES:
        for ns in v:
            dfxp_data = dfxp_data.replace(ns, k)

    dfxp = compat_etree_fromstring(dfxp_data)
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    # Resolve style inheritance; repeat while a parent style is defined
    # after its child in document order.
    repeat = False
    while True:
        for style in dfxp.findall(_x('.//ttml:style')):
            style_id = style.get('id')
            parent_style_id = style.get('style')
            if parent_style_id:
                if parent_style_id not in styles:
                    repeat = True
                    continue
                styles[style_id] = styles[parent_style_id].copy()
            for prop in SUPPORTED_STYLING:
                prop_val = style.get(_x('tts:' + prop))
                if prop_val:
                    styles.setdefault(style_id, {})[prop] = prop_val
        if repeat:
            repeat = False
        else:
            break

    # A style on <body> or <div> becomes the document default
    for p in ('body', 'div'):
        ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
        if ele is None:
            continue
        style = styles.get(ele.get('style'))
        if not style:
            continue
        default_style.update(style)

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            # Derive the end from the duration when no explicit end is given
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)
2733
2734
66e289ba
S
def cli_option(params, command_option, param):
    # Map an option value from params to ['--opt', value]; [] when unset.
    param = params.get(param)
    if param:
        param = compat_str(param)
    return [command_option, param] if param is not None else []
2740
2741
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """Translate a boolean param into CLI option tokens.

    Returns [] when the param is unset; with a separator, a single
    '--opt<sep>value' token; otherwise ['--opt', 'value'].
    """
    param = params.get(param)
    if param is None:
        return []
    assert isinstance(param, bool)
    value = true_value if param else false_value
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
2750
2751
def cli_valueless_option(params, command_option, param, expected_value=True):
    """Emit a bare CLI flag when the param equals expected_value, else []."""
    if params.get(param) == expected_value:
        return [command_option]
    return []
2755
2756
def cli_configuration_args(params, param, default=[]):
    """Return the user-supplied extra-args list for param, or default."""
    extra = params.get(param)
    if extra is None:
        return default
    assert isinstance(extra, list)
    return extra
2763
2764
39672624
YCH
class ISO639Utils(object):
    # Conversion table between ISO 639-1 (two-letter) and ISO 639-2/T
    # (three-letter) language codes.
    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are significant, so regional
        # variants like 'en-US' also resolve
        return cls._lang_map.get(code[:2])

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup over the map; returns None when no match is found
        for short_name, long_name in cls._lang_map.items():
            if long_name == code:
                return short_name
2965
2966
4eb10f66
YCH
class ISO3166Utils(object):
    """Mapping between two-letter ISO 3166-1 alpha-2 country codes and
    their full (official short) country names."""
    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert an ISO 3166-1 alpha-2 country code to the corresponding
        full name.  Lookup is case-insensitive (the code is upper-cased);
        returns None for unknown codes."""
        return cls._country_map.get(code.upper())
3225
3226
773f291d
S
class GeoUtils(object):
    """Helpers for faking a geographic location: picks random IPv4
    addresses out of large per-country address blocks."""
    # Major IPv4 address blocks per country
    _country_ip_map = {
        'AD': '85.94.160.0/19',
        'AE': '94.200.0.0/13',
        'AF': '149.54.0.0/17',
        'AG': '209.59.64.0/18',
        'AI': '204.14.248.0/21',
        'AL': '46.99.0.0/16',
        'AM': '46.70.0.0/15',
        'AO': '105.168.0.0/13',
        'AP': '159.117.192.0/21',
        'AR': '181.0.0.0/12',
        'AS': '202.70.112.0/20',
        'AT': '84.112.0.0/13',
        'AU': '1.128.0.0/11',
        'AW': '181.41.0.0/18',
        'AZ': '5.191.0.0/16',
        'BA': '31.176.128.0/17',
        'BB': '65.48.128.0/17',
        'BD': '114.130.0.0/16',
        'BE': '57.0.0.0/8',
        'BF': '129.45.128.0/17',
        'BG': '95.42.0.0/15',
        'BH': '37.131.0.0/17',
        'BI': '154.117.192.0/18',
        'BJ': '137.255.0.0/16',
        'BL': '192.131.134.0/24',
        'BM': '196.12.64.0/18',
        'BN': '156.31.0.0/16',
        'BO': '161.56.0.0/16',
        'BQ': '161.0.80.0/20',
        'BR': '152.240.0.0/12',
        'BS': '24.51.64.0/18',
        'BT': '119.2.96.0/19',
        'BW': '168.167.0.0/16',
        'BY': '178.120.0.0/13',
        'BZ': '179.42.192.0/18',
        'CA': '99.224.0.0/11',
        'CD': '41.243.0.0/16',
        'CF': '196.32.200.0/21',
        'CG': '197.214.128.0/17',
        'CH': '85.0.0.0/13',
        'CI': '154.232.0.0/14',
        'CK': '202.65.32.0/19',
        'CL': '152.172.0.0/14',
        'CM': '165.210.0.0/15',
        'CN': '36.128.0.0/10',
        'CO': '181.240.0.0/12',
        'CR': '201.192.0.0/12',
        'CU': '152.206.0.0/15',
        'CV': '165.90.96.0/19',
        'CW': '190.88.128.0/17',
        'CY': '46.198.0.0/15',
        'CZ': '88.100.0.0/14',
        'DE': '53.0.0.0/8',
        'DJ': '197.241.0.0/17',
        'DK': '87.48.0.0/12',
        'DM': '192.243.48.0/20',
        'DO': '152.166.0.0/15',
        'DZ': '41.96.0.0/12',
        'EC': '186.68.0.0/15',
        'EE': '90.190.0.0/15',
        'EG': '156.160.0.0/11',
        'ER': '196.200.96.0/20',
        'ES': '88.0.0.0/11',
        'ET': '196.188.0.0/14',
        'EU': '2.16.0.0/13',
        'FI': '91.152.0.0/13',
        'FJ': '144.120.0.0/16',
        'FM': '119.252.112.0/20',
        'FO': '88.85.32.0/19',
        'FR': '90.0.0.0/9',
        'GA': '41.158.0.0/15',
        'GB': '25.0.0.0/8',
        'GD': '74.122.88.0/21',
        'GE': '31.146.0.0/16',
        'GF': '161.22.64.0/18',
        'GG': '62.68.160.0/19',
        'GH': '45.208.0.0/14',
        'GI': '85.115.128.0/19',
        'GL': '88.83.0.0/19',
        'GM': '160.182.0.0/15',
        'GN': '197.149.192.0/18',
        'GP': '104.250.0.0/19',
        'GQ': '105.235.224.0/20',
        'GR': '94.64.0.0/13',
        'GT': '168.234.0.0/16',
        'GU': '168.123.0.0/16',
        'GW': '197.214.80.0/20',
        'GY': '181.41.64.0/18',
        'HK': '113.252.0.0/14',
        'HN': '181.210.0.0/16',
        'HR': '93.136.0.0/13',
        'HT': '148.102.128.0/17',
        'HU': '84.0.0.0/14',
        'ID': '39.192.0.0/10',
        'IE': '87.32.0.0/12',
        'IL': '79.176.0.0/13',
        'IM': '5.62.80.0/20',
        'IN': '117.192.0.0/10',
        'IO': '203.83.48.0/21',
        'IQ': '37.236.0.0/14',
        'IR': '2.176.0.0/12',
        'IS': '82.221.0.0/16',
        'IT': '79.0.0.0/10',
        'JE': '87.244.64.0/18',
        'JM': '72.27.0.0/17',
        'JO': '176.29.0.0/16',
        'JP': '126.0.0.0/8',
        'KE': '105.48.0.0/12',
        'KG': '158.181.128.0/17',
        'KH': '36.37.128.0/17',
        'KI': '103.25.140.0/22',
        'KM': '197.255.224.0/20',
        'KN': '198.32.32.0/19',
        'KP': '175.45.176.0/22',
        'KR': '175.192.0.0/10',
        'KW': '37.36.0.0/14',
        'KY': '64.96.0.0/15',
        'KZ': '2.72.0.0/13',
        'LA': '115.84.64.0/18',
        'LB': '178.135.0.0/16',
        'LC': '192.147.231.0/24',
        'LI': '82.117.0.0/19',
        'LK': '112.134.0.0/15',
        'LR': '41.86.0.0/19',
        'LS': '129.232.0.0/17',
        'LT': '78.56.0.0/13',
        'LU': '188.42.0.0/16',
        'LV': '46.109.0.0/16',
        'LY': '41.252.0.0/14',
        'MA': '105.128.0.0/11',
        'MC': '88.209.64.0/18',
        'MD': '37.246.0.0/16',
        'ME': '178.175.0.0/17',
        'MF': '74.112.232.0/21',
        'MG': '154.126.0.0/17',
        'MH': '117.103.88.0/21',
        'MK': '77.28.0.0/15',
        'ML': '154.118.128.0/18',
        'MM': '37.111.0.0/17',
        'MN': '49.0.128.0/17',
        'MO': '60.246.0.0/16',
        'MP': '202.88.64.0/20',
        'MQ': '109.203.224.0/19',
        'MR': '41.188.64.0/18',
        'MS': '208.90.112.0/22',
        'MT': '46.11.0.0/16',
        'MU': '105.16.0.0/12',
        'MV': '27.114.128.0/18',
        'MW': '105.234.0.0/16',
        'MX': '187.192.0.0/11',
        'MY': '175.136.0.0/13',
        'MZ': '197.218.0.0/15',
        'NA': '41.182.0.0/16',
        'NC': '101.101.0.0/18',
        'NE': '197.214.0.0/18',
        'NF': '203.17.240.0/22',
        'NG': '105.112.0.0/12',
        'NI': '186.76.0.0/15',
        'NL': '145.96.0.0/11',
        'NO': '84.208.0.0/13',
        'NP': '36.252.0.0/15',
        'NR': '203.98.224.0/19',
        'NU': '49.156.48.0/22',
        'NZ': '49.224.0.0/14',
        'OM': '5.36.0.0/15',
        'PA': '186.72.0.0/15',
        'PE': '186.160.0.0/14',
        'PF': '123.50.64.0/18',
        'PG': '124.240.192.0/19',
        'PH': '49.144.0.0/13',
        'PK': '39.32.0.0/11',
        'PL': '83.0.0.0/11',
        'PM': '70.36.0.0/20',
        'PR': '66.50.0.0/16',
        'PS': '188.161.0.0/16',
        'PT': '85.240.0.0/13',
        'PW': '202.124.224.0/20',
        'PY': '181.120.0.0/14',
        'QA': '37.210.0.0/15',
        'RE': '139.26.0.0/16',
        'RO': '79.112.0.0/13',
        'RS': '178.220.0.0/14',
        'RU': '5.136.0.0/13',
        'RW': '105.178.0.0/15',
        'SA': '188.48.0.0/13',
        'SB': '202.1.160.0/19',
        'SC': '154.192.0.0/11',
        'SD': '154.96.0.0/13',
        'SE': '78.64.0.0/12',
        'SG': '152.56.0.0/14',
        'SI': '188.196.0.0/14',
        'SK': '78.98.0.0/15',
        'SL': '197.215.0.0/17',
        'SM': '89.186.32.0/19',
        'SN': '41.82.0.0/15',
        'SO': '197.220.64.0/19',
        'SR': '186.179.128.0/17',
        'SS': '105.235.208.0/21',
        'ST': '197.159.160.0/19',
        'SV': '168.243.0.0/16',
        'SX': '190.102.0.0/20',
        'SY': '5.0.0.0/16',
        'SZ': '41.84.224.0/19',
        'TC': '65.255.48.0/20',
        'TD': '154.68.128.0/19',
        'TG': '196.168.0.0/14',
        'TH': '171.96.0.0/13',
        'TJ': '85.9.128.0/18',
        'TK': '27.96.24.0/21',
        'TL': '180.189.160.0/20',
        'TM': '95.85.96.0/19',
        'TN': '197.0.0.0/11',
        'TO': '175.176.144.0/21',
        'TR': '78.160.0.0/11',
        'TT': '186.44.0.0/15',
        'TV': '202.2.96.0/19',
        'TW': '120.96.0.0/11',
        'TZ': '156.156.0.0/14',
        'UA': '93.72.0.0/13',
        'UG': '154.224.0.0/13',
        'US': '3.0.0.0/8',
        'UY': '167.56.0.0/13',
        'UZ': '82.215.64.0/18',
        'VA': '212.77.0.0/19',
        'VC': '24.92.144.0/20',
        'VE': '186.88.0.0/13',
        'VG': '172.103.64.0/18',
        'VI': '146.226.0.0/16',
        'VN': '14.160.0.0/11',
        'VU': '202.80.32.0/20',
        'WF': '117.20.32.0/21',
        'WS': '202.4.32.0/19',
        'YE': '134.35.0.0/16',
        'YT': '41.242.116.0/22',
        'ZA': '41.0.0.0/11',
        'ZM': '165.56.0.0/13',
        'ZW': '41.85.192.0/19',
    }

    @classmethod
    def random_ipv4(cls, code):
        """Return a random IPv4 address (as a text string) inside the major
        address block registered for the given country code, or None when
        the code is unknown.  Lookup is case-insensitive."""
        block = cls._country_ip_map.get(code.upper())
        if not block:
            return None
        addr, preflen = block.split('/')
        # Lowest address of the block as a 32-bit big-endian integer
        addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0]
        # Highest address: set all host bits (the low 32 - preflen bits)
        addr_max = addr_min | (0xffffffff >> int(preflen))
        return compat_str(socket.inet_ntoa(
            compat_struct_pack('!L', random.randint(addr_min, addr_max))))
773f291d
S
3479
3480
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler variant that allows a per-request proxy override.

    A request may carry a 'Ytdl-request-proxy' header naming the proxy to
    use for that request only; the special value '__noproxy__' disables
    proxying.  socks* proxy URLs are not handled here — they are passed on
    to youtube-dl's http/https handlers via the 'Ytdl-socks-proxy' header.
    """

    def __init__(self, proxies=None):
        # Set default handlers: route every http/https request through
        # proxy_open with the sentinel '__noproxy__' (the keyword defaults
        # bind the loop variables at definition time on purpose).
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A per-request proxy header (if present) overrides the default,
        # and is removed so it is never sent over the wire.
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
            req.add_header('Ytdl-socks-proxy', proxy)
            # youtube-dl's http/https handlers do wrapping the socket with socks
            return None
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5bc880b9
YCH
3504
3505
0a5445dd
YCH
3506# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is
3507# released into Public Domain
3508# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387
3509
def long_to_bytes(n, blocksize=0):
    """long_to_bytes(n:long, blocksize:int) : string
    Convert a non-negative (long) integer to a big-endian byte string.

    If optional blocksize is given and greater than zero, the front of the
    byte string is padded with binary zeros so that its length is a
    multiple of blocksize.
    """
    # Collect 32-bit chunks from least to most significant, then join them
    # in the opposite order.
    chunks = []
    remaining = int(n)
    while remaining > 0:
        chunks.append(compat_struct_pack('>I', remaining & 0xffffffff))
        remaining >>= 32
    packed = b''.join(reversed(chunks))

    # Strip leading zero bytes; n == 0 yields a single zero byte.
    first_nonzero = 0
    for first_nonzero in range(len(packed)):
        if packed[first_nonzero] != b'\000'[0]:
            break
    else:
        packed = b'\000'
        first_nonzero = 0
    stripped = packed[first_nonzero:]

    # Re-add pad bytes up to a multiple of blocksize, if requested.
    if blocksize > 0 and len(stripped) % blocksize:
        stripped = (blocksize - len(stripped) % blocksize) * b'\000' + stripped
    return stripped
3538
3539
def bytes_to_long(s):
    """bytes_to_long(string) : long
    Convert a big-endian byte string to a (long) integer.

    This is (essentially) the inverse of long_to_bytes().
    """
    # Left-pad with zero bytes to a multiple of 4 so the input can be
    # consumed as whole 32-bit big-endian words.
    if len(s) % 4:
        s = b'\000' * (4 - len(s) % 4) + s
    result = 0
    for offset in range(0, len(s), 4):
        result = (result << 32) + compat_struct_unpack('>I', s[offset:offset + 4])[0]
    return result
3555
3556
5bc880b9
YCH
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''
    # The payload is the input interpreted as a little-endian integer
    # (bytes are reversed before being read as big-endian hex).
    payload = int(binascii.hexlify(data[::-1]), 16)
    return format(pow(payload, exponent, modulus), 'x')
81bdc8fd
YCH
3572
3573
f48409c7
YCH
def pkcs1pad(data, length):
    """
    Padding input data with PKCS#1 scheme

    @param {int[]} data input data
    @param {int} length target length
    @returns {int[]} padded data
    @raises ValueError when data does not fit into length bytes
    """
    if len(data) > length - 11:
        raise ValueError('Input data too long for PKCS#1 padding')

    # Encryption block (RFC 2313, section 8.1): 00 || 02 || PS || 00 || D.
    # The padding string PS must consist of *nonzero* pseudo-random octets;
    # a zero octet inside PS would be mistaken for the end-of-padding
    # marker during decryption, so draw from 1..255 (not 0..254).
    pseudo_random = [random.randint(1, 255) for _ in range(length - len(data) - 3)]
    return [0, 2] + pseudo_random + [0] + data
3587
3588
def encode_base_n(num, n, table=None):
    """Encode the non-negative integer *num* in base *n*.

    *table* supplies the digit characters; when omitted, the first *n*
    characters of 0-9a-zA-Z are used.  Raises ValueError when the base
    exceeds the table length.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    table = table or FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # Accumulate digits least-significant first, then reverse.
    digits = []
    while num:
        digits.append(table[num % n])
        num //= n
    return ''.join(reversed(digits))
f52354a8
YCH
3605
3606
def decode_packed_codes(code):
    """Unpack JavaScript that was obfuscated with the p.a.c.k.e.r. packer:
    every base-n token in the packed code is substituted by its entry from
    the symbol list (or left as-is when the entry is empty)."""
    mobj = re.search(PACKED_CODES_RE, code)
    obfucasted_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')

    # Build the token -> replacement table for indices count-1 .. 0.
    symbol_table = {}
    for index in range(count - 1, -1, -1):
        token = encode_base_n(index, base)
        symbol_table[token] = symbols[index] or token

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfucasted_code)
e154c651 3623
3624
def parse_m3u8_attributes(attrib):
    """Parse an M3U8 attribute list ('KEY=value,KEY="quoted,value",...')
    into a dict; surrounding double quotes are stripped from values."""
    info = {}
    for match in re.finditer(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
        value = match.group('val')
        if value.startswith('"'):
            value = value[1:-1]
        info[match.group('key')] = value
    return info
1143535d
YCH
3632
3633
def urshift(val, n):
    """Unsigned (logical) right shift, as JavaScript's ``>>>`` operator:
    a negative *val* is first reinterpreted as an unsigned 32-bit value."""
    if val < 0:
        val += 0x100000000
    return val >> n
d3f8e038
YCH
3636
3637
# Based on png2str() written by @gdkchan and improved by @yokrysty
# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
def decode_png(png_data):
    """Decode a PNG byte string into ``(width, height, pixels)``.

    ``pixels`` is a list of ``height`` rows, each a flat list of
    ``width * 3`` reconstructed byte values (the code assumes 3 bytes per
    pixel — presumably 8-bit RGB without alpha or interlacing; TODO
    confirm against callers).

    Raises IOError when the data is not a PNG or contains no IDAT chunks.
    """
    # Reference: https://www.w3.org/TR/PNG/
    header = png_data[8:]

    # Validate the 8-byte PNG signature and that the first chunk is IHDR.
    if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
        raise IOError('Not a valid PNG file.')

    # Unpack a 1-, 2- or 4-byte big-endian unsigned integer.
    int_map = {1: '>B', 2: '>H', 4: '>I'}
    unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]

    chunks = []

    # Walk the chunk stream: length (4) + type (4) + data + CRC (4).
    while header:
        length = unpack_integer(header[:4])
        header = header[4:]

        chunk_type = header[:4]
        header = header[4:]

        chunk_data = header[:length]
        header = header[length:]

        header = header[4:]  # Skip CRC

        chunks.append({
            'type': chunk_type,
            'length': length,
            'data': chunk_data
        })

    # IHDR is the first chunk; its first two fields are width and height.
    ihdr = chunks[0]['data']

    width = unpack_integer(ihdr[:4])
    height = unpack_integer(ihdr[4:8])

    # Concatenate all IDAT chunks into a single zlib stream.
    idat = b''

    for chunk in chunks:
        if chunk['type'] == b'IDAT':
            idat += chunk['data']

    if not idat:
        raise IOError('Unable to read PNG data.')

    decompressed_data = bytearray(zlib.decompress(idat))

    # Each scanline is one filter-type byte followed by `stride` data bytes
    # (3 bytes per pixel).
    stride = width * 3
    pixels = []

    # Look up an already-reconstructed byte by its flat (row * stride + col)
    # index.
    def _get_pixel(idx):
        x = idx % stride
        y = idx // stride
        return pixels[y][x]

    for y in range(height):
        basePos = y * (1 + stride)
        filter_type = decompressed_data[basePos]

        current_row = []

        pixels.append(current_row)

        for x in range(stride):
            color = decompressed_data[1 + basePos + x]
            basex = y * stride + x
            left = 0
            up = 0

            # Filter neighbours: `left` is the same byte of the previous
            # pixel (3 bytes back), `up` is the byte directly above.
            if x > 2:
                left = _get_pixel(basex - 3)
            if y > 0:
                up = _get_pixel(basex - stride)

            if filter_type == 1:  # Sub
                color = (color + left) & 0xff
            elif filter_type == 2:  # Up
                color = (color + up) & 0xff
            elif filter_type == 3:  # Average
                color = (color + ((left + up) >> 1)) & 0xff
            elif filter_type == 4:  # Paeth
                # Predict from left (a), above (b) and upper-left (c).
                a = left
                b = up
                c = 0

                if x > 2 and y > 0:
                    c = _get_pixel(basex - stride - 3)

                p = a + b - c

                pa = abs(p - a)
                pb = abs(p - b)
                pc = abs(p - c)

                if pa <= pb and pa <= pc:
                    color = (color + a) & 0xff
                elif pb <= pc:
                    color = (color + b) & 0xff
                else:
                    color = (color + c) & 0xff

            current_row.append(color)

    return width, height, pixels
efa97bdc
YCH
3743
3744
def write_xattr(path, key, value):
    """Write the extended attribute *key* with bytes *value* on file *path*.

    Tries, in order: the pyxattr/xattr Python modules, NTFS Alternate Data
    Streams on Windows, and finally the setfattr/xattr command-line tools.

    Raises XAttrUnavailableError when no usable backend is found, and
    XAttrMetadataError when the chosen backend fails.
    """
    # This mess below finds the best xattr tool for the job
    try:
        # try the pyxattr module...
        import xattr

        if hasattr(xattr, 'set'):  # pyxattr
            # Unicode arguments are not supported in python-pyxattr until
            # version 0.5.0
            # See https://github.com/rg3/youtube-dl/issues/5498
            pyxattr_required_version = '0.5.0'
            if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
                # TODO: fallback to CLI tools
                raise XAttrUnavailableError(
                    'python-pyxattr is detected but is too old. '
                    'youtube-dl requires %s or above while your version is %s. '
                    'Falling back to other xattr implementations' % (
                        pyxattr_required_version, xattr.__version__))

            setxattr = xattr.set
        else:  # xattr
            setxattr = xattr.setxattr

        try:
            setxattr(path, key, value)
        except EnvironmentError as e:
            raise XAttrMetadataError(e.errno, e.strerror)

    except ImportError:
        if compat_os_name == 'nt':
            # Write xattrs to NTFS Alternate Data Streams:
            # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
            assert ':' not in key
            assert os.path.exists(path)

            ads_fn = path + ':' + key
            try:
                with open(ads_fn, 'wb') as f:
                    f.write(value)
            except EnvironmentError as e:
                raise XAttrMetadataError(e.errno, e.strerror)
        else:
            user_has_setfattr = check_executable('setfattr', ['--version'])
            user_has_xattr = check_executable('xattr', ['-h'])

            if user_has_setfattr or user_has_xattr:

                # The CLI tools take the value as text, not bytes.
                value = value.decode('utf-8')
                if user_has_setfattr:
                    executable = 'setfattr'
                    opts = ['-n', key, '-v', value]
                elif user_has_xattr:
                    executable = 'xattr'
                    opts = ['-w', key, value]

                cmd = ([encodeFilename(executable, True)] +
                       [encodeArgument(o) for o in opts] +
                       [encodeFilename(path, True)])

                try:
                    p = subprocess.Popen(
                        cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
                except EnvironmentError as e:
                    raise XAttrMetadataError(e.errno, e.strerror)
                stdout, stderr = p.communicate()
                stderr = stderr.decode('utf-8', 'replace')
                if p.returncode != 0:
                    raise XAttrMetadataError(p.returncode, stderr)

            else:
                # On Unix, and can't find pyxattr, setfattr, or xattr.
                if sys.platform.startswith('linux'):
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'pyxattr' or 'xattr' "
                        "modules, or the GNU 'attr' package "
                        "(which contains the 'setfattr' tool).")
                else:
                    raise XAttrUnavailableError(
                        "Couldn't find a tool to set the xattrs. "
                        "Install either the python 'xattr' module, "
                        "or the 'xattr' binary.")
0c265486
YCH
3827
3828
def random_birthday(year_field, month_field, day_field):
    """Generate a random plausible birthday and return it as a dict whose
    keys are the given field names and whose values are decimal strings
    (year 1950-1995, month 1-12, day 1-31)."""
    ranges = ((year_field, 1950, 1995),
              (month_field, 1, 12),
              (day_field, 1, 31))
    return dict(
        (field, str(random.randint(low, high)))
        for field, low, high in ranges)