#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

import base64
import binascii
import calendar
import codecs
import contextlib
import ctypes
import datetime
import email.utils
import errno
import functools
import gzip
import io
import itertools
import json
import locale
import math
import operator
import os
import pipes
import platform
import re
import socket
import ssl
import struct
import subprocess
import sys
import tempfile
import traceback
import xml.etree.ElementTree
import zlib

from .compat import (
    compat_HTMLParser,
    compat_basestring,
    compat_chr,
    compat_etree_fromstring,
    compat_html_entities,
    compat_http_client,
    compat_kwargs,
    compat_parse_qs,
    compat_socket_create_connection,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
    shlex_quote,
)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s*>
         (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if not m:
        return None
    res = m.group('content')

    if res.startswith('"') or res.startswith("'"):
        res = res[1:-1]

    return unescapeHTML(res)


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    parser.feed(html_element)
    parser.close()
    return parser.attrs
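
# Illustrative usage (added comment, not part of the original module): per the
# docstring above, a simple element is expected to decode as follows:
#   extract_attributes('<a href="#" class="foo">')
#   == {'href': '#', 'class': 'foo'}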


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        accents = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ',
                           itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOUUUUYP', ['ss'],
                                           'aaaaaa', ['ae'], 'ceeeeiiiionoooooouuuuypy')))
        if restricted and char in accents:
            return accents[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
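
# Illustrative usage (added comment, not part of the original module): with
# restricted=True the path separator, colon and space are all mapped away:
#   sanitize_filename('A/B: C?', restricted=True) == 'A_B_-_C'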


def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)


# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitize_url(url):
    return 'http:%s' % url if url.startswith('//') else url


def sanitized_Request(url, *args, **kwargs):
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
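
# Illustrative usage (added comment, not part of the original module): only
# protocol-relative URLs are touched, everything else passes through:
#   sanitize_url('//example.com/x') == 'http://example.com/x'
#   sanitize_url('https://example.com/x') == 'https://example.com/x'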


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


def formatSeconds(secs):
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs


def make_HTTPS_handler(params, **kwargs):
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message():
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg


class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        self.msg = msg


class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
            req, **kwargs)


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        m = re.search(
            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if not m:
            timezone = datetime.timedelta()
        else:
            date_str = date_str[:-len(m.group(0))]
            if not m.group('sign'):
                timezone = datetime.timedelta()
            else:
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))
    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        pass
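
# Illustrative usage (added comment, not part of the original module): the
# UTC offset is folded into the returned UNIX timestamp, e.g.
#   parse_iso8601('2016-03-20T12:00:00+01:00') == 1458471600
# i.e. the timestamp of 2016-03-20 11:00:00 UTC.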


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M',
        '%b %dnd %Y %I:%M',
        '%b %dth %Y %I:%M',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions.extend([
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
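
# Illustrative usage (added comment, not part of the original module): the
# supported layouts collapse to YYYYMMDD, with day_first resolving ambiguity:
#   unified_strdate('2014/12/21') == '20141221'
#   unified_strdate('8.7.2010') == '20100708'
#   unified_strdate('8.7.2010', day_first=False) == '20100807'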


def determine_ext(url, default_ext='unknown_video'):
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
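
# Illustrative usage (added comment, not part of the original module):
#   determine_ext('http://example.com/video.mp4') == 'mp4'
#   determine_ext('http://example.com/foo/bar.mp4/?download') == 'mp4'
#   determine_ext('http://example.com/page') == 'unknown_video'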


def subtitles_filename(filename, sub_lang, sub_format):
    return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format


def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str


class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
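
# Illustrative usage (added comment, not part of the original module):
# membership accepts either date objects or YYYYMMDD strings:
#   '20160115' in DateRange('20160101', '20160131')   # True
#   '20160215' in DateRange('20160101', '20160131')   # False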


def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res


def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True


def write_string(s, out=None, encoding=None):
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()


def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return struct_pack('%dB' % len(xs), *xs)


# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)


class locked_file(object):
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)


def get_filesystem_encoding():
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'


def shell_quote(args):
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)


def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
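
# Illustrative usage (added comment, not part of the original module): the two
# helpers above are inverses of each other:
#   url = smuggle_url('http://example.com/video', {'referrer': 'http://example.com/'})
#   unsmuggle_url(url) == ('http://example.com/video', {'referrer': 'http://example.com/'})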


def format_bytes(bytes):
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)


def lookup_unit_table(unit_table, s):
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)


def parse_filesize(s):
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)


def parse_count(s):
    if s is None:
        return None

    s = s.strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
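
# Illustrative usage (added comments, not part of the original module): binary
# and decimal suffixes are distinguished by the unit tables above:
#   parse_filesize('1.2MiB') == 1258291   # int(1.2 * 1024 ** 2)
#   parse_count('1.5M') == 1500000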


def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        return None


def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None


def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)


def setproctitle(title):
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this


def remove_start(s, start):
    if s.startswith(start):
        return s[len(start):]
    return s


def remove_end(s, end):
    if s.endswith(end):
        return s[:-len(end)]
    return s


def remove_quotes(s):
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s


def url_basename(url):
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').split('/')[-1]


class HEADRequest(compat_urllib_request.Request):
    def get_method(self):
        return 'HEAD'


def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except ValueError:
        return default


def str_or_none(v, default=None):
    return default if v is None else compat_str(v)


def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)


def float_or_none(v, scale=1, invscale=1, default=None):
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except ValueError:
        return default


def parse_duration(s):
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms)
    return duration
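
# Illustrative usage (added comment, not part of the original module): both
# clock and textual notations are accepted and normalized to seconds:
#   parse_duration('1:30:05') == 5405.0
#   parse_duration('PT2M30S') == 150.0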


def prepend_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))


def replace_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)
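
# Illustrative usage (added comment, not part of the original module):
#   prepend_extension('video.mp4', 'temp') == 'video.temp.mp4'
#   replace_extension('video.mp4', 'mkv') == 'video.mkv'
#   replace_extension('video.unknown_video', 'mp4', 'mp4') == 'video.unknown_video.mp4'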
1603
1604
d70ad093
PH
1605def check_executable(exe, args=[]):
1606 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1607 args can be a list of arguments for a short output (like -version) """
1608 try:
1609 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1610 except OSError:
1611 return False
1612 return exe
b7ab0590
PH
1613
1614
95807118 1615def get_exe_version(exe, args=['--version'],
cae97f65 1616 version_re=None, unrecognized='present'):
95807118
PH
1617 """ Returns the version of the specified executable,
1618 or False if the executable is not present """
1619 try:
cae97f65 1620 out, _ = subprocess.Popen(
54116803 1621 [encodeArgument(exe)] + args,
95807118
PH
1622 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1623 except OSError:
1624 return False
cae97f65
PH
1625 if isinstance(out, bytes): # Python 2.x
1626 out = out.decode('ascii', 'ignore')
1627 return detect_exe_version(out, version_re, unrecognized)
1628
1629
1630def detect_exe_version(output, version_re=None, unrecognized='present'):
1631 assert isinstance(output, compat_str)
1632 if version_re is None:
1633 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1634 m = re.search(version_re, output)
95807118
PH
1635 if m:
1636 return m.group(1)
1637 else:
1638 return unrecognized
1639
1640
class PagedList(object):
    def __len__(self):
        # This is only useful for tests
        return len(self.getslice())


class OnDemandPagedList(PagedList):
    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = None
            if self._use_cache:
                page_results = self._cache.get(pagenum)
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
            if self._use_cache:
                self._cache[pagenum] = page_results

            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if the current page is not "full", i.e. does
            # not contain page_size videos, then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res


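# Illustrative usage sketch (not part of the original module): OnDemandPagedList
# only calls pagefunc for the pages a requested slice actually touches, and caches
# fetched pages when use_cache=True. The page size and video ids are made up.
def _example_on_demand_paged_list():
    def fetch_page(pagenum):
        # Pretend every page of the backing service holds exactly three ids.
        return ['video-%d' % (pagenum * 3 + i) for i in range(3)]

    pl = OnDemandPagedList(fetch_page, 3, use_cache=True)
    assert pl.getslice(2, 5) == ['video-2', 'video-3', 'video-4']

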
class InAdvancePagedList(PagedList):
    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        res = []
        start_page = start // self._pagesize
        end_page = (
            self._pagecount if end is None else (end // self._pagesize + 1))
        skip_elems = start - start_page * self._pagesize
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res


def uppercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\U[0-9a-fA-F]{8}',
        lambda m: unicode_escape(m.group(0))[0],
        s)


def lowercase_escape(s):
    unicode_escape = codecs.getdecoder('unicode_escape')
    return re.sub(
        r'\\u[0-9a-fA-F]{4}',
        lambda m: unicode_escape(m.group(0))[0],
        s)


def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    if sys.version_info < (3, 0) and isinstance(s, compat_str):
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")


def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    url_parsed = compat_urllib_parse_urlparse(url)
    return url_parsed._replace(
        netloc=url_parsed.netloc.encode('idna').decode('ascii'),
        path=escape_rfc3986(url_parsed.path),
        params=escape_rfc3986(url_parsed.params),
        query=escape_rfc3986(url_parsed.query),
        fragment=escape_rfc3986(url_parsed.fragment)
    ).geturl()

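
# Illustrative usage sketch (not part of the original module): escape_url() leaves
# the URL structure alone and only percent-encodes unsafe / non-ASCII characters in
# the individual components (the hostname goes through IDNA instead).
def _example_escape_url():
    assert escape_url('http://example.com/path with spaces?q=\u00fc') == (
        'http://example.com/path%20with%20spaces?q=%C3%BC')

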
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument
    # See https://bugs.python.org/issue19099
    def struct_pack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.pack(spec, *args)

    def struct_unpack(spec, *args):
        if isinstance(spec, compat_str):
            spec = spec.encode('ascii')
        return struct.unpack(spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack


def read_batch_urls(batch_fd):
    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        BOM_UTF8 = '\xef\xbb\xbf'
        if url.startswith(BOM_UTF8):
            url = url[len(BOM_UTF8):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]


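# Illustrative usage sketch (not part of the original module): read_batch_urls()
# strips a UTF-8 BOM, ignores blank lines and lines starting with '#', ';' or ']',
# and closes the file object it is given.
def _example_read_batch_urls():
    batch = io.StringIO('# a comment\nhttp://example.com/a\n\nhttp://example.com/b\n')
    assert read_batch_urls(batch) == ['http://example.com/a', 'http://example.com/b']

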
def urlencode_postdata(*args, **kargs):
    return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')


def update_url_query(url, query):
    if not query:
        return url
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    qs.update(query)
    return compat_urlparse.urlunparse(parsed_url._replace(
        query=compat_urllib_parse_urlencode(qs, True)))


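# Illustrative usage sketch (not part of the original module): update_url_query()
# merges new parameters into whatever query string the URL already carries
# (parameter order in the result may vary between Python versions).
def _example_update_url_query():
    assert update_url_query('http://example.com/path?quality=HD', {'format': 'mp4'}) in (
        'http://example.com/path?quality=HD&format=mp4',
        'http://example.com/path?format=mp4&quality=HD',
    )

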
def update_Request(req, url=None, data=None, headers={}, query={}):
    req_headers = req.headers.copy()
    req_headers.update(headers)
    req_data = data or req.data
    req_url = update_url_query(url or req.get_full_url(), query)
    req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
    new_req = req_type(
        req_url, data=req_data, headers=req_headers,
        origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
    if hasattr(req, 'timeout'):
        new_req.timeout = req.timeout
    return new_req


def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    if isinstance(key_or_keys, (list, tuple)):
        for key in key_or_keys:
            if key not in d or d[key] is None or skip_false_values and not d[key]:
                continue
            return d[key]
        return default
    return d.get(key_or_keys, default)


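# Illustrative usage sketch (not part of the original module): when given several
# candidate keys, dict_get() returns the first value that is not None (and, by
# default, not falsy), which is handy for metadata that may live under many names.
def _example_dict_get():
    meta = {'description': None, 'summary': '', 'synopsis': 'some text'}
    assert dict_get(meta, ('description', 'summary', 'synopsis')) == 'some text'
    assert dict_get(meta, ('description', 'summary'), skip_false_values=False) == ''

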
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)


US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    return int(m.group('age')) if m else US_RATINGS.get(s)


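# Illustrative usage sketch (not part of the original module): parse_age_limit()
# accepts both a bare "<age>+" string and one of the US_RATINGS keys defined above.
def _example_parse_age_limit():
    assert parse_age_limit('18+') == 18
    assert parse_age_limit('PG-13') == 13
    assert parse_age_limit('not a rating') is None

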
def strip_jsonp(code):
    return re.sub(
        r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)


def js_to_json(code):
    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null'):
            return v
        if v.startswith('"'):
            v = re.sub(r"\\'", "'", v[1:-1])
        elif v.startswith("'"):
            v = v[1:-1]
            v = re.sub(r"\\\\|\\'|\"", lambda m: {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }[m.group(0)], v)
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
    return res


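# Illustrative usage sketch (not part of the original module): js_to_json() turns a
# JavaScript-style object literal (bare keys, single quotes, trailing commas) into
# something json.loads() will accept.
def _example_js_to_json():
    assert json.loads(js_to_json("{abc: 'def', 'ghi': true,}")) == {'abc': 'def', 'ghi': True}

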
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q


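# Illustrative usage sketch (not part of the original module): qualities() builds a
# ranking function where a later position in the list means a better quality, and
# unknown identifiers rank below everything else.
def _example_qualities():
    q = qualities(['240p', '360p', '720p', '1080p'])
    assert q('1080p') > q('360p')
    assert q('something else') == -1

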
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'


def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) > length:
        return s[:length - len(ELLIPSES)] + ELLIPSES
    return s


def version_tuple(v):
    return tuple(int(e) for e in re.split(r'[-.]', v))


def is_outdated_version(version, limit, assume_new=True):
    if not version:
        return not assume_new
    try:
        return version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new


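# Illustrative usage sketch (not part of the original module): version comparison is
# done numerically on dot/dash separated components, not lexicographically.
def _example_is_outdated_version():
    assert version_tuple('1.2.10-1') == (1, 2, 10, 1)
    assert is_outdated_version('2016.03.14', '2016.04.01') is True
    assert is_outdated_version('2016.10.01', '2016.04.01') is False

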
def ytdl_is_updateable():
    """ Returns True if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')


def args_to_str(args):
    # Get a short string representation for a subprocess command
    return ' '.join(shlex_quote(a) for a in args)


def error_to_compat_str(err):
    err_str = str(err)
    # On Python 2, the byte string representation of the error must be decoded
    # with the proper encoding rather than ASCII
    if sys.version_info[0] < 3:
        err_str = err_str.decode(preferredencoding())
    return err_str


def mimetype2ext(mt):
    if mt is None:
        return None

    ext = {
        'audio/mp4': 'm4a',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
    }.get(res, res)


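# Illustrative usage sketch (not part of the original module): the full MIME type is
# checked first, then the subtype, and an unknown subtype is returned unchanged.
def _example_mimetype2ext():
    assert mimetype2ext('audio/mp4') == 'm4a'
    assert mimetype2ext('video/x-flv') == 'flv'
    assert mimetype2ext('video/webm') == 'webm'

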
def urlhandle_detect_ext(url_handle):
    try:
        url_handle.headers
        getheader = lambda h: url_handle.headers[h]
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            e = determine_ext(m.group('filename'), default_ext=None)
            if e:
                return e

    return mimetype2ext(getheader('Content-Type'))


def encode_data_uri(data, mime_type):
    return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))


def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    if age_limit is None:  # No limit set
        return False
    if content_limit is None:
        return False  # Content available for everyone
    return age_limit < content_limit


def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            s = first_bytes[len(bom):].decode(enc, 'replace')
            break
    else:
        s = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', s)


def determine_protocol(info_dict):
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    if url.startswith('rtmp'):
        return 'rtmp'
    elif url.startswith('mms'):
        return 'mms'
    elif url.startswith('rtsp'):
        return 'rtsp'

    ext = determine_ext(url)
    if ext == 'm3u8':
        return 'm3u8'
    elif ext == 'f4m':
        return 'f4m'

    return compat_urllib_parse_urlparse(url).scheme


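# Illustrative usage sketch (not part of the original module; relies on
# determine_ext(), which is defined earlier in this file): an explicit 'protocol'
# wins, then the URL scheme and extension are used to guess the download protocol.
def _example_determine_protocol():
    assert determine_protocol({'url': 'rtmp://example.com/live'}) == 'rtmp'
    assert determine_protocol({'url': 'http://example.com/master.m3u8'}) == 'm3u8'
    assert determine_protocol({'url': 'http://example.com/video.mp4'}) == 'http'
    assert determine_protocol({'url': 'http://x', 'protocol': 'f4m'}) == 'f4m'

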
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
    format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
    return '\n'.join(format_str % tuple(row) for row in table)


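# Illustrative usage sketch (not part of the original module): every column is padded
# to one more than the width of its longest cell, except the last one.
def _example_render_table():
    assert render_table(['format', 'note'], [['22', '720p'], ['137+141', '1080p']]) == (
        'format  note\n'
        '22      720p\n'
        '137+141 1080p')

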
def _match_one(filter_part, dct):
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)


def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or False """

    return all(
        _match_one(filter_part, dct) for filter_part in filter_str.split('&'))


def match_filter_func(filter_str):
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        else:
            video_title = info_dict.get('title', info_dict.get('id', 'video'))
            return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func


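# Illustrative usage sketch (not part of the original module): the filter
# mini-language combines comparisons, '?' (match even if the field is missing) and
# bare field names joined with '&'.
def _example_match_str():
    video = {'like_count': 190, 'description': 'some text'}
    assert match_str('like_count > 100 & dislike_count <? 50 & description', video)
    assert not match_str('like_count > 1000', video)

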
def parse_dfxp_time_expr(time_expr):
    if not time_expr:
        return

    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if mobj:
        return float(mobj.group('time_offset'))

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))


def srt_subtitles_timecode(seconds):
    return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)


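# Illustrative usage sketch (not part of the original module): SRT timecodes use a
# comma before the millisecond part.
def _example_srt_subtitles_timecode():
    assert srt_subtitles_timecode(123.456) == '00:02:03,456'

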
def dfxp2srt(dfxp_data):
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
        'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
    })

    class TTMLPElementParser(object):
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        target = TTMLPElementParser()
        parser = xml.etree.ElementTree.XMLParser(target=target)
        parser.feed(xml.etree.ElementTree.tostring(node))
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    out = []
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    for para, index in zip(paras, itertools.count(1)):
        begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
        end_time = parse_dfxp_time_expr(para.attrib.get('end'))
        dur = parse_dfxp_time_expr(para.attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        out.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(out)


def cli_option(params, command_option, param):
    param = params.get(param)
    return [command_option, param] if param is not None else []


def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    param = params.get(param)
    assert isinstance(param, bool)
    if separator:
        return [command_option + separator + (true_value if param else false_value)]
    return [command_option, true_value if param else false_value]


def cli_valueless_option(params, command_option, param, expected_value=True):
    param = params.get(param)
    return [command_option] if param == expected_value else []


def cli_configuration_args(params, param, default=[]):
    ex_args = params.get(param)
    if ex_args is None:
        return default
    assert isinstance(ex_args, list)
    return ex_args


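# Illustrative usage sketch (not part of the original module): these helpers turn
# youtube-dl option values into argument lists for external downloaders and
# postprocessors. The option names below are made up for the example.
def _example_cli_helpers():
    params = {'proxy': 'socks5://127.0.0.1:1080', 'verbose': True}
    assert cli_option(params, '--proxy', 'proxy') == ['--proxy', 'socks5://127.0.0.1:1080']
    assert cli_bool_option(params, '--verbose-flag', 'verbose', separator='=') == ['--verbose-flag=true']
    assert cli_valueless_option(params, '--verbose', 'verbose') == ['--verbose']

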
2223class ISO639Utils(object):
2224 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2225 _lang_map = {
2226 'aa': 'aar',
2227 'ab': 'abk',
2228 'ae': 'ave',
2229 'af': 'afr',
2230 'ak': 'aka',
2231 'am': 'amh',
2232 'an': 'arg',
2233 'ar': 'ara',
2234 'as': 'asm',
2235 'av': 'ava',
2236 'ay': 'aym',
2237 'az': 'aze',
2238 'ba': 'bak',
2239 'be': 'bel',
2240 'bg': 'bul',
2241 'bh': 'bih',
2242 'bi': 'bis',
2243 'bm': 'bam',
2244 'bn': 'ben',
2245 'bo': 'bod',
2246 'br': 'bre',
2247 'bs': 'bos',
2248 'ca': 'cat',
2249 'ce': 'che',
2250 'ch': 'cha',
2251 'co': 'cos',
2252 'cr': 'cre',
2253 'cs': 'ces',
2254 'cu': 'chu',
2255 'cv': 'chv',
2256 'cy': 'cym',
2257 'da': 'dan',
2258 'de': 'deu',
2259 'dv': 'div',
2260 'dz': 'dzo',
2261 'ee': 'ewe',
2262 'el': 'ell',
2263 'en': 'eng',
2264 'eo': 'epo',
2265 'es': 'spa',
2266 'et': 'est',
2267 'eu': 'eus',
2268 'fa': 'fas',
2269 'ff': 'ful',
2270 'fi': 'fin',
2271 'fj': 'fij',
2272 'fo': 'fao',
2273 'fr': 'fra',
2274 'fy': 'fry',
2275 'ga': 'gle',
2276 'gd': 'gla',
2277 'gl': 'glg',
2278 'gn': 'grn',
2279 'gu': 'guj',
2280 'gv': 'glv',
2281 'ha': 'hau',
2282 'he': 'heb',
2283 'hi': 'hin',
2284 'ho': 'hmo',
2285 'hr': 'hrv',
2286 'ht': 'hat',
2287 'hu': 'hun',
2288 'hy': 'hye',
2289 'hz': 'her',
2290 'ia': 'ina',
2291 'id': 'ind',
2292 'ie': 'ile',
2293 'ig': 'ibo',
2294 'ii': 'iii',
2295 'ik': 'ipk',
2296 'io': 'ido',
2297 'is': 'isl',
2298 'it': 'ita',
2299 'iu': 'iku',
2300 'ja': 'jpn',
2301 'jv': 'jav',
2302 'ka': 'kat',
2303 'kg': 'kon',
2304 'ki': 'kik',
2305 'kj': 'kua',
2306 'kk': 'kaz',
2307 'kl': 'kal',
2308 'km': 'khm',
2309 'kn': 'kan',
2310 'ko': 'kor',
2311 'kr': 'kau',
2312 'ks': 'kas',
2313 'ku': 'kur',
2314 'kv': 'kom',
2315 'kw': 'cor',
2316 'ky': 'kir',
2317 'la': 'lat',
2318 'lb': 'ltz',
2319 'lg': 'lug',
2320 'li': 'lim',
2321 'ln': 'lin',
2322 'lo': 'lao',
2323 'lt': 'lit',
2324 'lu': 'lub',
2325 'lv': 'lav',
2326 'mg': 'mlg',
2327 'mh': 'mah',
2328 'mi': 'mri',
2329 'mk': 'mkd',
2330 'ml': 'mal',
2331 'mn': 'mon',
2332 'mr': 'mar',
2333 'ms': 'msa',
2334 'mt': 'mlt',
2335 'my': 'mya',
2336 'na': 'nau',
2337 'nb': 'nob',
2338 'nd': 'nde',
2339 'ne': 'nep',
2340 'ng': 'ndo',
2341 'nl': 'nld',
2342 'nn': 'nno',
2343 'no': 'nor',
2344 'nr': 'nbl',
2345 'nv': 'nav',
2346 'ny': 'nya',
2347 'oc': 'oci',
2348 'oj': 'oji',
2349 'om': 'orm',
2350 'or': 'ori',
2351 'os': 'oss',
2352 'pa': 'pan',
2353 'pi': 'pli',
2354 'pl': 'pol',
2355 'ps': 'pus',
2356 'pt': 'por',
2357 'qu': 'que',
2358 'rm': 'roh',
2359 'rn': 'run',
2360 'ro': 'ron',
2361 'ru': 'rus',
2362 'rw': 'kin',
2363 'sa': 'san',
2364 'sc': 'srd',
2365 'sd': 'snd',
2366 'se': 'sme',
2367 'sg': 'sag',
2368 'si': 'sin',
2369 'sk': 'slk',
2370 'sl': 'slv',
2371 'sm': 'smo',
2372 'sn': 'sna',
2373 'so': 'som',
2374 'sq': 'sqi',
2375 'sr': 'srp',
2376 'ss': 'ssw',
2377 'st': 'sot',
2378 'su': 'sun',
2379 'sv': 'swe',
2380 'sw': 'swa',
2381 'ta': 'tam',
2382 'te': 'tel',
2383 'tg': 'tgk',
2384 'th': 'tha',
2385 'ti': 'tir',
2386 'tk': 'tuk',
2387 'tl': 'tgl',
2388 'tn': 'tsn',
2389 'to': 'ton',
2390 'tr': 'tur',
2391 'ts': 'tso',
2392 'tt': 'tat',
2393 'tw': 'twi',
2394 'ty': 'tah',
2395 'ug': 'uig',
2396 'uk': 'ukr',
2397 'ur': 'urd',
2398 'uz': 'uzb',
2399 've': 'ven',
2400 'vi': 'vie',
2401 'vo': 'vol',
2402 'wa': 'wln',
2403 'wo': 'wol',
2404 'xh': 'xho',
2405 'yi': 'yid',
2406 'yo': 'yor',
2407 'za': 'zha',
2408 'zh': 'zho',
2409 'zu': 'zul',
2410 }
2411
2412 @classmethod
2413 def short2long(cls, code):
2414 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2415 return cls._lang_map.get(code[:2])
2416
2417 @classmethod
2418 def long2short(cls, code):
2419 """Convert language code from ISO 639-2/T to ISO 639-1"""
2420 for short_name, long_name in cls._lang_map.items():
2421 if long_name == code:
2422 return short_name
2423
2424
2425class ISO3166Utils(object):
2426 # From http://data.okfn.org/data/core/country-list
2427 _country_map = {
2428 'AF': 'Afghanistan',
2429 'AX': 'Åland Islands',
2430 'AL': 'Albania',
2431 'DZ': 'Algeria',
2432 'AS': 'American Samoa',
2433 'AD': 'Andorra',
2434 'AO': 'Angola',
2435 'AI': 'Anguilla',
2436 'AQ': 'Antarctica',
2437 'AG': 'Antigua and Barbuda',
2438 'AR': 'Argentina',
2439 'AM': 'Armenia',
2440 'AW': 'Aruba',
2441 'AU': 'Australia',
2442 'AT': 'Austria',
2443 'AZ': 'Azerbaijan',
2444 'BS': 'Bahamas',
2445 'BH': 'Bahrain',
2446 'BD': 'Bangladesh',
2447 'BB': 'Barbados',
2448 'BY': 'Belarus',
2449 'BE': 'Belgium',
2450 'BZ': 'Belize',
2451 'BJ': 'Benin',
2452 'BM': 'Bermuda',
2453 'BT': 'Bhutan',
2454 'BO': 'Bolivia, Plurinational State of',
2455 'BQ': 'Bonaire, Sint Eustatius and Saba',
2456 'BA': 'Bosnia and Herzegovina',
2457 'BW': 'Botswana',
2458 'BV': 'Bouvet Island',
2459 'BR': 'Brazil',
2460 'IO': 'British Indian Ocean Territory',
2461 'BN': 'Brunei Darussalam',
2462 'BG': 'Bulgaria',
2463 'BF': 'Burkina Faso',
2464 'BI': 'Burundi',
2465 'KH': 'Cambodia',
2466 'CM': 'Cameroon',
2467 'CA': 'Canada',
2468 'CV': 'Cape Verde',
2469 'KY': 'Cayman Islands',
2470 'CF': 'Central African Republic',
2471 'TD': 'Chad',
2472 'CL': 'Chile',
2473 'CN': 'China',
2474 'CX': 'Christmas Island',
2475 'CC': 'Cocos (Keeling) Islands',
2476 'CO': 'Colombia',
2477 'KM': 'Comoros',
2478 'CG': 'Congo',
2479 'CD': 'Congo, the Democratic Republic of the',
2480 'CK': 'Cook Islands',
2481 'CR': 'Costa Rica',
2482 'CI': 'Côte d\'Ivoire',
2483 'HR': 'Croatia',
2484 'CU': 'Cuba',
2485 'CW': 'Curaçao',
2486 'CY': 'Cyprus',
2487 'CZ': 'Czech Republic',
2488 'DK': 'Denmark',
2489 'DJ': 'Djibouti',
2490 'DM': 'Dominica',
2491 'DO': 'Dominican Republic',
2492 'EC': 'Ecuador',
2493 'EG': 'Egypt',
2494 'SV': 'El Salvador',
2495 'GQ': 'Equatorial Guinea',
2496 'ER': 'Eritrea',
2497 'EE': 'Estonia',
2498 'ET': 'Ethiopia',
2499 'FK': 'Falkland Islands (Malvinas)',
2500 'FO': 'Faroe Islands',
2501 'FJ': 'Fiji',
2502 'FI': 'Finland',
2503 'FR': 'France',
2504 'GF': 'French Guiana',
2505 'PF': 'French Polynesia',
2506 'TF': 'French Southern Territories',
2507 'GA': 'Gabon',
2508 'GM': 'Gambia',
2509 'GE': 'Georgia',
2510 'DE': 'Germany',
2511 'GH': 'Ghana',
2512 'GI': 'Gibraltar',
2513 'GR': 'Greece',
2514 'GL': 'Greenland',
2515 'GD': 'Grenada',
2516 'GP': 'Guadeloupe',
2517 'GU': 'Guam',
2518 'GT': 'Guatemala',
2519 'GG': 'Guernsey',
2520 'GN': 'Guinea',
2521 'GW': 'Guinea-Bissau',
2522 'GY': 'Guyana',
2523 'HT': 'Haiti',
2524 'HM': 'Heard Island and McDonald Islands',
2525 'VA': 'Holy See (Vatican City State)',
2526 'HN': 'Honduras',
2527 'HK': 'Hong Kong',
2528 'HU': 'Hungary',
2529 'IS': 'Iceland',
2530 'IN': 'India',
2531 'ID': 'Indonesia',
2532 'IR': 'Iran, Islamic Republic of',
2533 'IQ': 'Iraq',
2534 'IE': 'Ireland',
2535 'IM': 'Isle of Man',
2536 'IL': 'Israel',
2537 'IT': 'Italy',
2538 'JM': 'Jamaica',
2539 'JP': 'Japan',
2540 'JE': 'Jersey',
2541 'JO': 'Jordan',
2542 'KZ': 'Kazakhstan',
2543 'KE': 'Kenya',
2544 'KI': 'Kiribati',
2545 'KP': 'Korea, Democratic People\'s Republic of',
2546 'KR': 'Korea, Republic of',
2547 'KW': 'Kuwait',
2548 'KG': 'Kyrgyzstan',
2549 'LA': 'Lao People\'s Democratic Republic',
2550 'LV': 'Latvia',
2551 'LB': 'Lebanon',
2552 'LS': 'Lesotho',
2553 'LR': 'Liberia',
2554 'LY': 'Libya',
2555 'LI': 'Liechtenstein',
2556 'LT': 'Lithuania',
2557 'LU': 'Luxembourg',
2558 'MO': 'Macao',
2559 'MK': 'Macedonia, the Former Yugoslav Republic of',
2560 'MG': 'Madagascar',
2561 'MW': 'Malawi',
2562 'MY': 'Malaysia',
2563 'MV': 'Maldives',
2564 'ML': 'Mali',
2565 'MT': 'Malta',
2566 'MH': 'Marshall Islands',
2567 'MQ': 'Martinique',
2568 'MR': 'Mauritania',
2569 'MU': 'Mauritius',
2570 'YT': 'Mayotte',
2571 'MX': 'Mexico',
2572 'FM': 'Micronesia, Federated States of',
2573 'MD': 'Moldova, Republic of',
2574 'MC': 'Monaco',
2575 'MN': 'Mongolia',
2576 'ME': 'Montenegro',
2577 'MS': 'Montserrat',
2578 'MA': 'Morocco',
2579 'MZ': 'Mozambique',
2580 'MM': 'Myanmar',
2581 'NA': 'Namibia',
2582 'NR': 'Nauru',
2583 'NP': 'Nepal',
2584 'NL': 'Netherlands',
2585 'NC': 'New Caledonia',
2586 'NZ': 'New Zealand',
2587 'NI': 'Nicaragua',
2588 'NE': 'Niger',
2589 'NG': 'Nigeria',
2590 'NU': 'Niue',
2591 'NF': 'Norfolk Island',
2592 'MP': 'Northern Mariana Islands',
2593 'NO': 'Norway',
2594 'OM': 'Oman',
2595 'PK': 'Pakistan',
2596 'PW': 'Palau',
2597 'PS': 'Palestine, State of',
2598 'PA': 'Panama',
2599 'PG': 'Papua New Guinea',
2600 'PY': 'Paraguay',
2601 'PE': 'Peru',
2602 'PH': 'Philippines',
2603 'PN': 'Pitcairn',
2604 'PL': 'Poland',
2605 'PT': 'Portugal',
2606 'PR': 'Puerto Rico',
2607 'QA': 'Qatar',
2608 'RE': 'Réunion',
2609 'RO': 'Romania',
2610 'RU': 'Russian Federation',
2611 'RW': 'Rwanda',
2612 'BL': 'Saint Barthélemy',
2613 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2614 'KN': 'Saint Kitts and Nevis',
2615 'LC': 'Saint Lucia',
2616 'MF': 'Saint Martin (French part)',
2617 'PM': 'Saint Pierre and Miquelon',
2618 'VC': 'Saint Vincent and the Grenadines',
2619 'WS': 'Samoa',
2620 'SM': 'San Marino',
2621 'ST': 'Sao Tome and Principe',
2622 'SA': 'Saudi Arabia',
2623 'SN': 'Senegal',
2624 'RS': 'Serbia',
2625 'SC': 'Seychelles',
2626 'SL': 'Sierra Leone',
2627 'SG': 'Singapore',
2628 'SX': 'Sint Maarten (Dutch part)',
2629 'SK': 'Slovakia',
2630 'SI': 'Slovenia',
2631 'SB': 'Solomon Islands',
2632 'SO': 'Somalia',
2633 'ZA': 'South Africa',
2634 'GS': 'South Georgia and the South Sandwich Islands',
2635 'SS': 'South Sudan',
2636 'ES': 'Spain',
2637 'LK': 'Sri Lanka',
2638 'SD': 'Sudan',
2639 'SR': 'Suriname',
2640 'SJ': 'Svalbard and Jan Mayen',
2641 'SZ': 'Swaziland',
2642 'SE': 'Sweden',
2643 'CH': 'Switzerland',
2644 'SY': 'Syrian Arab Republic',
2645 'TW': 'Taiwan, Province of China',
2646 'TJ': 'Tajikistan',
2647 'TZ': 'Tanzania, United Republic of',
2648 'TH': 'Thailand',
2649 'TL': 'Timor-Leste',
2650 'TG': 'Togo',
2651 'TK': 'Tokelau',
2652 'TO': 'Tonga',
2653 'TT': 'Trinidad and Tobago',
2654 'TN': 'Tunisia',
2655 'TR': 'Turkey',
2656 'TM': 'Turkmenistan',
2657 'TC': 'Turks and Caicos Islands',
2658 'TV': 'Tuvalu',
2659 'UG': 'Uganda',
2660 'UA': 'Ukraine',
2661 'AE': 'United Arab Emirates',
2662 'GB': 'United Kingdom',
2663 'US': 'United States',
2664 'UM': 'United States Minor Outlying Islands',
2665 'UY': 'Uruguay',
2666 'UZ': 'Uzbekistan',
2667 'VU': 'Vanuatu',
2668 'VE': 'Venezuela, Bolivarian Republic of',
2669 'VN': 'Viet Nam',
2670 'VG': 'Virgin Islands, British',
2671 'VI': 'Virgin Islands, U.S.',
2672 'WF': 'Wallis and Futuna',
2673 'EH': 'Western Sahara',
2674 'YE': 'Yemen',
2675 'ZM': 'Zambia',
2676 'ZW': 'Zimbabwe',
2677 }
2678
2679 @classmethod
2680 def short2full(cls, code):
2681 """Convert an ISO 3166-2 country code to the corresponding full name"""
2682 return cls._country_map.get(code.upper())
2683
2684
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    def __init__(self, proxies=None):
        # Set default handlers
        for type in ('http', 'https'):
            setattr(self, '%s_open' % type,
                    lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
                    meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)


def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integers
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    payload = int(binascii.hexlify(data[::-1]), 16)
    encrypted = pow(payload, exponent, modulus)
    return '%x' % encrypted


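# Worked example (not part of the original module), using deliberately tiny RSA
# parameters: b'\x02' reversed and hex-decoded is the integer 2, and
# pow(2, 3, 15) == 8, so the encrypted hex string is '8'.
def _example_ohdave_rsa_encrypt():
    assert ohdave_rsa_encrypt(b'\x02', 3, 15) == '8'

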
def encode_base_n(num, n, table=None):
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    ret = ''
    while num:
        ret = table[num % n] + ret
        num = num // n
    return ret


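# Illustrative usage sketch (not part of the original module): with the default
# table, base 16 gives ordinary lowercase hex digits.
def _example_encode_base_n():
    assert encode_base_n(255, 16) == 'ff'
    assert encode_base_n(0, 36) == '0'

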
def decode_packed_codes(code):
    mobj = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    count = int(count)
    symbols = symbols.split('|')
    symbol_table = {}

    while count:
        count -= 1
        base_n_count = encode_base_n(count, base)
        symbol_table[base_n_count] = symbols[count] or base_n_count

    return re.sub(
        r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
        obfuscated_code)
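

# Illustrative usage sketch (not part of the original module): decode_packed_codes()
# unpacks "P.A.C.K.E.R."-style eval strings by mapping base-N tokens back to the
# words in the trailing symbol list. The packed snippet below is made up.
def _example_decode_packed_codes():
    packed = "}('0 1',10,2,'hello|world'.split('|')"
    assert decode_packed_codes(packed) == 'hello world'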