#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

import base64
import binascii
import calendar
import codecs
import contextlib
import ctypes
import datetime
import email.utils
import errno
import functools
import gzip
import itertools
import io
import json
import locale
import math
import operator
import os
import pipes
import platform
import re
import ssl
import socket
import struct
import subprocess
import sys
import tempfile
import traceback
import xml.etree.ElementTree
import zlib

from .compat import (
    compat_HTMLParser,
    compat_basestring,
    compat_chr,
    compat_etree_fromstring,
    compat_html_entities,
    compat_http_client,
    compat_kwargs,
    compat_parse_qs,
    compat_socket_create_connection,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
    shlex_quote,
)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)

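# Illustrative usage of xpath_with_ns (a sketch, not part of the original
# module): prefixes in the path are expanded to ElementTree's Clark notation
# using the supplied mapping before the element is queried.
#
#     _NS = {'media': 'http://search.yahoo.com/mrss/'}
#     xpath_with_ns('media:group/media:title', _NS)
#     # -> '{http://search.yahoo.com/mrss/}group/{http://search.yahoo.com/mrss/}title'
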

def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if not m:
        return None
    res = m.group('content')

    if res.startswith('"') or res.startswith("'"):
        res = res[1:-1]

    return unescapeHTML(res)


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    parser.feed(html_element)
    parser.close()
    return parser.attrs


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result

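# Illustrative behaviour of sanitize_filename (a sketch, not part of the
# original module); the exact output depends on the flags passed above:
#
#     sanitize_filename('AC/DC: Live', restricted=True)  # -> 'AC_DC_-_Live'
#     sanitize_filename('AC/DC: Live')                    # -> 'AC_DC - Live'
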

def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)


# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitized_Request(url, *args, **kwargs):
    return compat_urllib_request.Request(
        'http:%s' % url if url.startswith('//') else url, *args, **kwargs)


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)

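# Illustrative behaviour of unescapeHTML/_htmlentity_transform (a sketch, not
# part of the original module): named, decimal and hexadecimal entities are
# decoded; unknown entities are returned literally.
#
#     unescapeHTML('&amp;')     # -> '&'
#     unescapeHTML('&#x2F;')    # -> '/'
#     unescapeHTML('&eacute;')  # -> 'é'
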

def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


def formatSeconds(secs):
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs


def make_HTTPS_handler(params, **kwargs):
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message():
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg


class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        self.msg = msg


class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
            new_req = req_type(
                url_escaped, data=req.data, headers=req.headers,
                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
            new_req.timeout = req.timeout
            req = new_req

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
            req, **kwargs)


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        m = re.search(
            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if not m:
            timezone = datetime.timedelta()
        else:
            date_str = date_str[:-len(m.group(0))]
            if not m.group('sign'):
                timezone = datetime.timedelta()
            else:
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))
    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        pass

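# Illustrative usage of parse_iso8601 (a sketch, not part of the original
# module): the optional timezone suffix is turned into an offset and the rest
# is parsed with the given delimiter.
#
#     parse_iso8601('2014-03-23T23:04:26+0100')  # -> 1395612266
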
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M',
        '%b %dnd %Y %I:%M',
        '%b %dth %Y %I:%M',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions.extend([
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)

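# Illustrative behaviour of unified_strdate (a sketch, not part of the
# original module):
#
#     unified_strdate('December 21, 2010')          # -> '20101221'
#     unified_strdate('8/7/2009')                    # -> '20090708' (day first)
#     unified_strdate('8/7/2009', day_first=False)   # -> '20090807'
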
def determine_ext(url, default_ext='unknown_video'):
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext


def subtitles_filename(filename, sub_lang, sub_format):
    return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format


def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str


class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())


def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res


def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True


def write_string(s, out=None, encoding=None):
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()


def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return struct_pack('%dB' % len(xs), *xs)


# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, lack fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)


class locked_file(object):
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)


def get_filesystem_encoding():
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'


def shell_quote(args):
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)


def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data

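# Illustrative round trip for smuggle_url/unsmuggle_url (a sketch, not part of
# the original module; the URL and dict are hypothetical): the extra dict is
# carried in the URL fragment and recovered unchanged on the other side.
#
#     smuggled = smuggle_url('http://example.com/video', {'extra': '42'})
#     url, data = unsmuggle_url(smuggled)
#     # url == 'http://example.com/video', data == {'extra': '42'}
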
def format_bytes(bytes):
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)


def lookup_unit_table(unit_table, s):
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)


def parse_filesize(s):
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)


def parse_count(s):
    if s is None:
        return None

    s = s.strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }

    return lookup_unit_table(_UNIT_TABLE, s)

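# Illustrative behaviour of parse_filesize and parse_count (a sketch, not part
# of the original module): both delegate to lookup_unit_table above.
#
#     parse_filesize('1.2MiB')  # -> 1258291  (int(1.2 * 1024 ** 2))
#     parse_count('1.8M')       # -> 1800000
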
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        return None


def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None


def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)


def setproctitle(title):
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this


def remove_start(s, start):
    if s.startswith(start):
        return s[len(start):]
    return s


def remove_end(s, end):
    if s.endswith(end):
        return s[:-len(end)]
    return s


def remove_quotes(s):
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s


def url_basename(url):
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').split('/')[-1]


class HEADRequest(compat_urllib_request.Request):
    def get_method(self):
        return 'HEAD'


def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except ValueError:
        return default


def str_or_none(v, default=None):
    return default if v is None else compat_str(v)


def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)


def float_or_none(v, scale=1, invscale=1, default=None):
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except ValueError:
        return default


def parse_duration(s):
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None
    res = 0
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)
    if m.group('secs'):
        res += int(m.group('secs'))
    if m.group('mins_reversed'):
        res += int(m.group('mins_reversed')) * 60
    if m.group('mins'):
        res += int(m.group('mins')) * 60
    if m.group('hours'):
        res += int(m.group('hours')) * 60 * 60
    if m.group('hours_reversed'):
        res += int(m.group('hours_reversed')) * 60 * 60
    if m.group('days'):
        res += int(m.group('days')) * 24 * 60 * 60
    if m.group('ms'):
        res += float(m.group('ms'))
    return res

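# Illustrative behaviour of parse_duration (a sketch, not part of the original
# module): the result is a number of seconds.
#
#     parse_duration('1:00:00')  # -> 3600
#     parse_duration('9:12:43')  # -> 33163
#     parse_duration('3 min')    # -> 180
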
def prepend_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return (
        '{0}.{1}{2}'.format(name, ext, real_ext)
        if not expected_real_ext or real_ext[1:] == expected_real_ext
        else '{0}.{1}'.format(filename, ext))


def replace_extension(filename, ext, expected_real_ext=None):
    name, real_ext = os.path.splitext(filename)
    return '{0}.{1}'.format(
        name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
        ext)

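# Illustrative behaviour of the two helpers above (a sketch, not part of the
# original module); the filenames are hypothetical:
#
#     prepend_extension('video.mp4', 'temp')               # -> 'video.temp.mp4'
#     replace_extension('video.mp4', 'mkv')                # -> 'video.mkv'
#     replace_extension('video.unexpected', 'mkv', 'mp4')  # -> 'video.unexpected.mkv'
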
d70ad093
PH
1600def check_executable(exe, args=[]):
1601 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1602 args can be a list of arguments for a short output (like -version) """
1603 try:
1604 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1605 except OSError:
1606 return False
1607 return exe
b7ab0590
PH
1608
1609
95807118 1610def get_exe_version(exe, args=['--version'],
cae97f65 1611 version_re=None, unrecognized='present'):
95807118
PH
1612 """ Returns the version of the specified executable,
1613 or False if the executable is not present """
1614 try:
cae97f65 1615 out, _ = subprocess.Popen(
54116803 1616 [encodeArgument(exe)] + args,
95807118
PH
1617 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1618 except OSError:
1619 return False
cae97f65
PH
1620 if isinstance(out, bytes): # Python 2.x
1621 out = out.decode('ascii', 'ignore')
1622 return detect_exe_version(out, version_re, unrecognized)
1623
1624
1625def detect_exe_version(output, version_re=None, unrecognized='present'):
1626 assert isinstance(output, compat_str)
1627 if version_re is None:
1628 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1629 m = re.search(version_re, output)
95807118
PH
1630 if m:
1631 return m.group(1)
1632 else:
1633 return unrecognized
1634
1635
b7ab0590 1636class PagedList(object):
dd26ced1
PH
1637 def __len__(self):
1638 # This is only useful for tests
1639 return len(self.getslice())
1640
9c44d242
PH
1641
1642class OnDemandPagedList(PagedList):
b95dc034 1643 def __init__(self, pagefunc, pagesize, use_cache=False):
9c44d242
PH
1644 self._pagefunc = pagefunc
1645 self._pagesize = pagesize
b95dc034
YCH
1646 self._use_cache = use_cache
1647 if use_cache:
1648 self._cache = {}
9c44d242 1649
b7ab0590
PH
1650 def getslice(self, start=0, end=None):
1651 res = []
1652 for pagenum in itertools.count(start // self._pagesize):
1653 firstid = pagenum * self._pagesize
1654 nextfirstid = pagenum * self._pagesize + self._pagesize
1655 if start >= nextfirstid:
1656 continue
1657
b95dc034
YCH
1658 page_results = None
1659 if self._use_cache:
1660 page_results = self._cache.get(pagenum)
1661 if page_results is None:
1662 page_results = list(self._pagefunc(pagenum))
1663 if self._use_cache:
1664 self._cache[pagenum] = page_results
b7ab0590
PH
1665
1666 startv = (
1667 start % self._pagesize
1668 if firstid <= start < nextfirstid
1669 else 0)
1670
1671 endv = (
1672 ((end - 1) % self._pagesize) + 1
1673 if (end is not None and firstid <= end <= nextfirstid)
1674 else None)
1675
1676 if startv != 0 or endv is not None:
1677 page_results = page_results[startv:endv]
1678 res.extend(page_results)
1679
1680 # A little optimization: if the current page is not "full", i.e. it
1681 # does not contain page_size videos, then we can assume that this
1682 # page is the last one - there are no more ids on further pages -
1683 # so there is no need to query again.
1684 if len(page_results) + startv < self._pagesize:
1685 break
1686
1687 # If we got the whole page, but the next page is not interesting,
1688 # break out early as well
1689 if end == nextfirstid:
1690 break
1691 return res
1692
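# Sketch of how OnDemandPagedList is typically driven (illustrative only):
# pagefunc returns the items of one zero-based page, here 10 items split
# into pages of 3, and getslice() stitches the requested range together.
from youtube_dl.utils import OnDemandPagedList

def pagefunc(pagenum):
    return range(pagenum * 3, min((pagenum + 1) * 3, 10))

pl = OnDemandPagedList(pagefunc, 3)
assert pl.getslice(2, 7) == [2, 3, 4, 5, 6]
assert len(pl) == 10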
1693
1694class InAdvancePagedList(PagedList):
1695 def __init__(self, pagefunc, pagecount, pagesize):
1696 self._pagefunc = pagefunc
1697 self._pagecount = pagecount
1698 self._pagesize = pagesize
1699
1700 def getslice(self, start=0, end=None):
1701 res = []
1702 start_page = start // self._pagesize
1703 end_page = (
1704 self._pagecount if end is None else (end // self._pagesize + 1))
1705 skip_elems = start - start_page * self._pagesize
1706 only_more = None if end is None else end - start
1707 for pagenum in range(start_page, end_page):
1708 page = list(self._pagefunc(pagenum))
1709 if skip_elems:
1710 page = page[skip_elems:]
1711 skip_elems = None
1712 if only_more is not None:
1713 if len(page) < only_more:
1714 only_more -= len(page)
1715 else:
1716 page = page[:only_more]
1717 res.extend(page)
1718 break
1719 res.extend(page)
1720 return res
1721
1722
81c2f20b 1723def uppercase_escape(s):
676eb3f2 1724 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 1725 return re.sub(
a612753d 1726 r'\\U[0-9a-fA-F]{8}',
1727 lambda m: unicode_escape(m.group(0))[0],
1728 s)
1729
1730
1731def lowercase_escape(s):
1732 unicode_escape = codecs.getdecoder('unicode_escape')
1733 return re.sub(
1734 r'\\u[0-9a-fA-F]{4}',
1735 lambda m: unicode_escape(m.group(0))[0],
1736 s)
b53466e1 1737
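# Example inputs for the two escape helpers above (made-up samples,
# run under Python 3):
from youtube_dl.utils import lowercase_escape, uppercase_escape

assert lowercase_escape('caf\\u00e9') == 'café'
assert uppercase_escape('\\U0001d550') == '𝕐'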
1738
1739def escape_rfc3986(s):
1740 """Escape non-ASCII characters as suggested by RFC 3986"""
8f9312c3 1741 if sys.version_info < (3, 0) and isinstance(s, compat_str):
d05cfe06 1742 s = s.encode('utf-8')
ecc0c5ee 1743 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
1744
1745
1746def escape_url(url):
1747 """Escape URL as suggested by RFC 3986"""
1748 url_parsed = compat_urllib_parse_urlparse(url)
1749 return url_parsed._replace(
efbed08d 1750 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
1751 path=escape_rfc3986(url_parsed.path),
1752 params=escape_rfc3986(url_parsed.params),
1753 query=escape_rfc3986(url_parsed.query),
1754 fragment=escape_rfc3986(url_parsed.fragment)
1755 ).geturl()
1756
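# escape_url() sketch on a made-up URL (Python 3): the path and query get
# percent-escaped per RFC 3986 while the URL structure is preserved.
from youtube_dl.utils import escape_url

assert escape_url('http://example.com/path with spaces?q=münchen') == \
    'http://example.com/path%20with%20spaces?q=m%C3%BCnchen'
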
b53466e1 1757try:
28e614de 1758 struct.pack('!I', 0)
b53466e1 1759except TypeError:
1760 # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument
1761 # See https://bugs.python.org/issue19099
1762 def struct_pack(spec, *args):
1763 if isinstance(spec, compat_str):
1764 spec = spec.encode('ascii')
1765 return struct.pack(spec, *args)
1766
1767 def struct_unpack(spec, *args):
1768 if isinstance(spec, compat_str):
1769 spec = spec.encode('ascii')
1770 return struct.unpack(spec, *args)
1771else:
1772 struct_pack = struct.pack
1773 struct_unpack = struct.unpack
1774
1775
1776def read_batch_urls(batch_fd):
1777 def fixup(url):
1778 if not isinstance(url, compat_str):
1779 url = url.decode('utf-8', 'replace')
28e614de 1780 BOM_UTF8 = '\xef\xbb\xbf'
1781 if url.startswith(BOM_UTF8):
1782 url = url[len(BOM_UTF8):]
1783 url = url.strip()
1784 if url.startswith(('#', ';', ']')):
1785 return False
1786 return url
1787
1788 with contextlib.closing(batch_fd) as fd:
1789 return [url for url in map(fixup, fd) if url]
1790
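# read_batch_urls() sketch with an in-memory batch file (Python 3):
# comment and blank lines are dropped, surrounding whitespace is stripped.
import io
from youtube_dl.utils import read_batch_urls

batch = io.StringIO('# a comment\nhttps://example.com/a\n\n  https://example.com/b  \n')
assert read_batch_urls(batch) == ['https://example.com/a', 'https://example.com/b']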
1791
1792def urlencode_postdata(*args, **kargs):
15707c7e 1793 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
1794
1795
38f9ef31 1796def update_url_query(url, query):
1797 parsed_url = compat_urlparse.urlparse(url)
1798 qs = compat_parse_qs(parsed_url.query)
1799 qs.update(query)
1800 return compat_urlparse.urlunparse(parsed_url._replace(
15707c7e 1801 query=compat_urllib_parse_urlencode(qs, True)))
16392824 1802
8e60dc75 1803
86296ad2 1804def dict_get(d, key_or_keys, default=None, skip_false_values=True):
1805 if isinstance(key_or_keys, (list, tuple)):
1806 for key in key_or_keys:
1807 if key not in d or d[key] is None or skip_false_values and not d[key]:
1808 continue
1809 return d[key]
1810 return default
1811 return d.get(key_or_keys, default)
1812
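# dict_get() sketch on a made-up metadata dict: falsy values ('' and 0
# here) are skipped by default when several candidate keys are given.
from youtube_dl.utils import dict_get

meta = {'title': '', 'fulltitle': 'Some video', 'duration': 0}
assert dict_get(meta, ('title', 'fulltitle')) == 'Some video'
assert dict_get(meta, ('title', 'fulltitle'), skip_false_values=False) == ''
assert dict_get(meta, 'uploader', default='unknown') == 'unknown'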
1813
1814def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
1815 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
1816
16392824 1817
1818US_RATINGS = {
1819 'G': 0,
1820 'PG': 10,
1821 'PG-13': 13,
1822 'R': 16,
1823 'NC': 18,
1824}
1825
1826
1827def parse_age_limit(s):
1828 if s is None:
d838b1bd 1829 return None
146c80e2 1830 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
d800609c 1831 return int(m.group('age')) if m else US_RATINGS.get(s)
1832
1833
fac55558 1834def strip_jsonp(code):
609a61e3 1835 return re.sub(
8411229b 1836 r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
1837
1838
1839def js_to_json(code):
1840 def fix_kv(m):
1841 v = m.group(0)
1842 if v in ('true', 'false', 'null'):
1843 return v
1844 if v.startswith('"'):
1845 v = re.sub(r"\\'", "'", v[1:-1])
1846 elif v.startswith("'"):
1847 v = v[1:-1]
1848 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1849 '\\\\': '\\\\',
1850 "\\'": "'",
1851 '"': '\\"',
1852 }[m.group(0)], v)
1853 return '"%s"' % v
1854
1855 res = re.sub(r'''(?x)
1856 "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
1857 '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
8f4b58d7 1858 [a-zA-Z_][.a-zA-Z_0-9]*
e05f6939 1859 ''', fix_kv, code)
ba9e68f4 1860 res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
1861 return res
1862
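# js_to_json() sketch on a small made-up JavaScript object literal: bare
# keys are quoted, single quotes become double quotes and the trailing
# comma is dropped, so the result parses as JSON.
import json
from youtube_dl.utils import js_to_json

js = "{abc: true, 'key': 'value', lst: [1, 2,]}"
assert json.loads(js_to_json(js)) == {'abc': True, 'key': 'value', 'lst': [1, 2]}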
1863
1864def qualities(quality_ids):
1865 """ Get a numeric quality value out of a list of possible values """
1866 def q(qid):
1867 try:
1868 return quality_ids.index(qid)
1869 except ValueError:
1870 return -1
1871 return q
1872
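# qualities() sketch: known quality ids are ranked by their position in
# the given list, unknown ones sort below everything.
from youtube_dl.utils import qualities

q = qualities(['240p', '360p', '720p', '1080p'])
assert q('720p') == 2
assert q('144p') == -1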
1873
1874DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1875
1876
1877def limit_length(s, length):
1878 """ Add ellipses to overly long strings """
1879 if s is None:
1880 return None
1881 ELLIPSES = '...'
1882 if len(s) > length:
1883 return s[:length - len(ELLIPSES)] + ELLIPSES
1884 return s
1885
1886
1887def version_tuple(v):
5f9b8394 1888 return tuple(int(e) for e in re.split(r'[-.]', v))
1889
1890
1891def is_outdated_version(version, limit, assume_new=True):
1892 if not version:
1893 return not assume_new
1894 try:
1895 return version_tuple(version) < version_tuple(limit)
1896 except ValueError:
1897 return not assume_new
1898
1899
1900def ytdl_is_updateable():
1901 """ Returns if youtube-dl can be updated with -U """
1902 from zipimport import zipimporter
1903
1904 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
1905
1906
1907def args_to_str(args):
1908 # Get a short string representation for a subprocess command
1909 return ' '.join(shlex_quote(a) for a in args)
1910
1911
9b9c5355 1912def error_to_compat_str(err):
1913 err_str = str(err)
1914 # On Python 2, error byte strings must be decoded with the proper
1915 # encoding rather than ASCII
1916 if sys.version_info[0] < 3:
1917 err_str = err_str.decode(preferredencoding())
1918 return err_str
1919
1920
c460bdd5 1921def mimetype2ext(mt):
1922 ext = {
1923 'audio/mp4': 'm4a',
1924 }.get(mt)
1925 if ext is not None:
1926 return ext
1927
1928 _, _, res = mt.rpartition('/')
1929
1930 return {
f6861ec9 1931 '3gpp': '3gp',
cafcf657 1932 'smptett+xml': 'tt',
1933 'srt': 'srt',
1934 'ttaf+xml': 'dfxp',
a0d8d704 1935 'ttml+xml': 'ttml',
cafcf657 1936 'vtt': 'vtt',
f6861ec9 1937 'x-flv': 'flv',
1938 'x-mp4-fragmented': 'mp4',
1939 'x-ms-wmv': 'wmv',
1940 }.get(res, res)
1941
1942
1943def urlhandle_detect_ext(url_handle):
1944 try:
1945 url_handle.headers
1946 getheader = lambda h: url_handle.headers[h]
1947 except AttributeError: # Python < 3
1948 getheader = url_handle.info().getheader
1949
1950 cd = getheader('Content-Disposition')
1951 if cd:
1952 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1953 if m:
1954 e = determine_ext(m.group('filename'), default_ext=None)
1955 if e:
1956 return e
1957
c460bdd5 1958 return mimetype2ext(getheader('Content-Type'))
1959
1960
1961def encode_data_uri(data, mime_type):
1962 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
1963
1964
05900629 1965def age_restricted(content_limit, age_limit):
6ec6cb4e 1966 """ Returns True iff the content should be blocked """
1967
1968 if age_limit is None: # No limit set
1969 return False
1970 if content_limit is None:
1971 return False # Content available for everyone
1972 return age_limit < content_limit
1973
1974
1975def is_html(first_bytes):
1976 """ Detect whether a file contains HTML by examining its first bytes. """
1977
1978 BOMS = [
1979 (b'\xef\xbb\xbf', 'utf-8'),
1980 (b'\x00\x00\xfe\xff', 'utf-32-be'),
1981 (b'\xff\xfe\x00\x00', 'utf-32-le'),
1982 (b'\xff\xfe', 'utf-16-le'),
1983 (b'\xfe\xff', 'utf-16-be'),
1984 ]
1985 for bom, enc in BOMS:
1986 if first_bytes.startswith(bom):
1987 s = first_bytes[len(bom):].decode(enc, 'replace')
1988 break
1989 else:
1990 s = first_bytes.decode('utf-8', 'replace')
1991
1992 return re.match(r'^\s*<', s)
1993
1994
1995def determine_protocol(info_dict):
1996 protocol = info_dict.get('protocol')
1997 if protocol is not None:
1998 return protocol
1999
2000 url = info_dict['url']
2001 if url.startswith('rtmp'):
2002 return 'rtmp'
2003 elif url.startswith('mms'):
2004 return 'mms'
2005 elif url.startswith('rtsp'):
2006 return 'rtsp'
2007
2008 ext = determine_ext(url)
2009 if ext == 'm3u8':
2010 return 'm3u8'
2011 elif ext == 'f4m':
2012 return 'f4m'
2013
2014 return compat_urllib_parse_urlparse(url).scheme
2015
2016
2017def render_table(header_row, data):
2018 """ Render a list of rows, each as a list of values """
2019 table = [header_row] + data
2020 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
2021 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
2022 return '\n'.join(format_str % tuple(row) for row in table)
2023
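# render_table() sketch (made-up rows): every column but the last is
# left-aligned and padded to its widest cell plus one space.
from youtube_dl.utils import render_table

print(render_table(
    ['format', 'note'],
    [['18', 'mp4 360p'], ['22', 'mp4 720p']]))
# format note
# 18     mp4 360p
# 22     mp4 720p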
2024
2025def _match_one(filter_part, dct):
2026 COMPARISON_OPERATORS = {
2027 '<': operator.lt,
2028 '<=': operator.le,
2029 '>': operator.gt,
2030 '>=': operator.ge,
2031 '=': operator.eq,
2032 '!=': operator.ne,
2033 }
2034 operator_rex = re.compile(r'''(?x)\s*
2035 (?P<key>[a-z_]+)
2036 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2037 (?:
2038 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2039 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2040 )
2041 \s*$
2042 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2043 m = operator_rex.search(filter_part)
2044 if m:
2045 op = COMPARISON_OPERATORS[m.group('op')]
2046 if m.group('strval') is not None:
2047 if m.group('op') not in ('=', '!='):
2048 raise ValueError(
2049 'Operator %s does not support string values!' % m.group('op'))
2050 comparison_value = m.group('strval')
2051 else:
2052 try:
2053 comparison_value = int(m.group('intval'))
2054 except ValueError:
2055 comparison_value = parse_filesize(m.group('intval'))
2056 if comparison_value is None:
2057 comparison_value = parse_filesize(m.group('intval') + 'B')
2058 if comparison_value is None:
2059 raise ValueError(
2060 'Invalid integer value %r in filter part %r' % (
2061 m.group('intval'), filter_part))
2062 actual_value = dct.get(m.group('key'))
2063 if actual_value is None:
2064 return m.group('none_inclusive')
2065 return op(actual_value, comparison_value)
2066
2067 UNARY_OPERATORS = {
2068 '': lambda v: v is not None,
2069 '!': lambda v: v is None,
2070 }
2071 operator_rex = re.compile(r'''(?x)\s*
2072 (?P<op>%s)\s*(?P<key>[a-z_]+)
2073 \s*$
2074 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2075 m = operator_rex.search(filter_part)
2076 if m:
2077 op = UNARY_OPERATORS[m.group('op')]
2078 actual_value = dct.get(m.group('key'))
2079 return op(actual_value)
2080
2081 raise ValueError('Invalid filter part %r' % filter_part)
2082
2083
2084def match_str(filter_str, dct):
2085 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2086
2087 return all(
2088 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2089
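# match_str() sketch with a made-up info dict: '&'-separated parts combine
# numeric comparisons, string equality and the unary '!' (is None) test.
from youtube_dl.utils import match_str

info = {'like_count': 1500, 'uploader': 'someone', 'is_live': None}
assert match_str('like_count > 100 & uploader = someone & !is_live', info)
assert not match_str('like_count > 10000', info)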
2090
2091def match_filter_func(filter_str):
2092 def _match_func(info_dict):
2093 if match_str(filter_str, info_dict):
2094 return None
2095 else:
2096 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2097 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2098 return _match_func
2099
2100
2101def parse_dfxp_time_expr(time_expr):
2102 if not time_expr:
d631d5f9 2103 return
2104
2105 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2106 if mobj:
2107 return float(mobj.group('time_offset'))
2108
db2fe38b 2109 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 2110 if mobj:
db2fe38b 2111 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
2112
2113
2114def srt_subtitles_timecode(seconds):
2115 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
2116
2117
2118def dfxp2srt(dfxp_data):
2119 _x = functools.partial(xpath_with_ns, ns_map={
2120 'ttml': 'http://www.w3.org/ns/ttml',
2121 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
2122 })
bf6427d2 2123
87de7069 2124 class TTMLPElementParser(object):
2b14cb56 2125 out = ''
bf6427d2 2126
2b14cb56 2127 def start(self, tag, attrib):
2128 if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2129 self.out += '\n'
bf6427d2 2130
2b14cb56 2131 def end(self, tag):
2132 pass
bf6427d2 2133
2b14cb56 2134 def data(self, data):
2135 self.out += data
2136
2137 def close(self):
2138 return self.out.strip()
2139
2140 def parse_node(node):
2141 target = TTMLPElementParser()
2142 parser = xml.etree.ElementTree.XMLParser(target=target)
2143 parser.feed(xml.etree.ElementTree.tostring(node))
2144 return parser.close()
bf6427d2 2145
36e6f62c 2146 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
bf6427d2 2147 out = []
4e335771 2148 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')
2149
2150 if not paras:
2151 raise ValueError('Invalid dfxp/TTML subtitle')
2152
2153 for para, index in zip(paras, itertools.count(1)):
d631d5f9 2154 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 2155 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
2156 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2157 if begin_time is None:
2158 continue
7dff0363 2159 if not end_time:
2160 if not dur:
2161 continue
2162 end_time = begin_time + dur
2163 out.append('%d\n%s --> %s\n%s\n\n' % (
2164 index,
2165 srt_subtitles_timecode(begin_time),
2166 srt_subtitles_timecode(end_time),
2167 parse_node(para)))
2168
2169 return ''.join(out)
2170
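# dfxp2srt() sketch on a minimal made-up TTML document: <br/> becomes a
# line break and a missing "end" attribute is derived from "dur".
from youtube_dl.utils import dfxp2srt

dfxp = '''<tt xmlns="http://www.w3.org/ns/ttml">
  <body><div>
    <p begin="0" end="1">The first line</p>
    <p begin="1" dur="2.5">The second<br/>line</p>
  </div></body>
</tt>'''

print(dfxp2srt(dfxp))
# 1
# 00:00:00,000 --> 00:00:01,000
# The first line
#
# 2
# 00:00:01,000 --> 00:00:03,500
# The second
# line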
2171
2172def cli_option(params, command_option, param):
2173 param = params.get(param)
2174 return [command_option, param] if param is not None else []
2175
2176
2177def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2178 param = params.get(param)
2179 assert isinstance(param, bool)
2180 if separator:
2181 return [command_option + separator + (true_value if param else false_value)]
2182 return [command_option, true_value if param else false_value]
2183
2184
2185def cli_valueless_option(params, command_option, param, expected_value=True):
2186 param = params.get(param)
2187 return [command_option] if param == expected_value else []
2188
2189
2190def cli_configuration_args(params, param, default=[]):
2191 ex_args = params.get(param)
2192 if ex_args is None:
2193 return default
2194 assert isinstance(ex_args, list)
2195 return ex_args
2196
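# Sketch of the cli_* helpers above translating a (made-up) params dict
# into external command-line arguments:
from youtube_dl.utils import cli_bool_option, cli_option, cli_valueless_option

params = {'proxy': 'http://127.0.0.1:3128', 'nocheckcertificate': True, 'continuedl': False}
assert cli_option(params, '--proxy', 'proxy') == ['--proxy', 'http://127.0.0.1:3128']
assert cli_bool_option(params, '--no-check-certificate', 'nocheckcertificate') == \
    ['--no-check-certificate', 'true']
assert cli_bool_option(params, '--check-certificate', 'nocheckcertificate', 'false', 'true', '=') == \
    ['--check-certificate=false']
assert cli_valueless_option(params, '--no-continue', 'continuedl', False) == ['--no-continue']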
2197
2198class ISO639Utils(object):
2199 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2200 _lang_map = {
2201 'aa': 'aar',
2202 'ab': 'abk',
2203 'ae': 'ave',
2204 'af': 'afr',
2205 'ak': 'aka',
2206 'am': 'amh',
2207 'an': 'arg',
2208 'ar': 'ara',
2209 'as': 'asm',
2210 'av': 'ava',
2211 'ay': 'aym',
2212 'az': 'aze',
2213 'ba': 'bak',
2214 'be': 'bel',
2215 'bg': 'bul',
2216 'bh': 'bih',
2217 'bi': 'bis',
2218 'bm': 'bam',
2219 'bn': 'ben',
2220 'bo': 'bod',
2221 'br': 'bre',
2222 'bs': 'bos',
2223 'ca': 'cat',
2224 'ce': 'che',
2225 'ch': 'cha',
2226 'co': 'cos',
2227 'cr': 'cre',
2228 'cs': 'ces',
2229 'cu': 'chu',
2230 'cv': 'chv',
2231 'cy': 'cym',
2232 'da': 'dan',
2233 'de': 'deu',
2234 'dv': 'div',
2235 'dz': 'dzo',
2236 'ee': 'ewe',
2237 'el': 'ell',
2238 'en': 'eng',
2239 'eo': 'epo',
2240 'es': 'spa',
2241 'et': 'est',
2242 'eu': 'eus',
2243 'fa': 'fas',
2244 'ff': 'ful',
2245 'fi': 'fin',
2246 'fj': 'fij',
2247 'fo': 'fao',
2248 'fr': 'fra',
2249 'fy': 'fry',
2250 'ga': 'gle',
2251 'gd': 'gla',
2252 'gl': 'glg',
2253 'gn': 'grn',
2254 'gu': 'guj',
2255 'gv': 'glv',
2256 'ha': 'hau',
2257 'he': 'heb',
2258 'hi': 'hin',
2259 'ho': 'hmo',
2260 'hr': 'hrv',
2261 'ht': 'hat',
2262 'hu': 'hun',
2263 'hy': 'hye',
2264 'hz': 'her',
2265 'ia': 'ina',
2266 'id': 'ind',
2267 'ie': 'ile',
2268 'ig': 'ibo',
2269 'ii': 'iii',
2270 'ik': 'ipk',
2271 'io': 'ido',
2272 'is': 'isl',
2273 'it': 'ita',
2274 'iu': 'iku',
2275 'ja': 'jpn',
2276 'jv': 'jav',
2277 'ka': 'kat',
2278 'kg': 'kon',
2279 'ki': 'kik',
2280 'kj': 'kua',
2281 'kk': 'kaz',
2282 'kl': 'kal',
2283 'km': 'khm',
2284 'kn': 'kan',
2285 'ko': 'kor',
2286 'kr': 'kau',
2287 'ks': 'kas',
2288 'ku': 'kur',
2289 'kv': 'kom',
2290 'kw': 'cor',
2291 'ky': 'kir',
2292 'la': 'lat',
2293 'lb': 'ltz',
2294 'lg': 'lug',
2295 'li': 'lim',
2296 'ln': 'lin',
2297 'lo': 'lao',
2298 'lt': 'lit',
2299 'lu': 'lub',
2300 'lv': 'lav',
2301 'mg': 'mlg',
2302 'mh': 'mah',
2303 'mi': 'mri',
2304 'mk': 'mkd',
2305 'ml': 'mal',
2306 'mn': 'mon',
2307 'mr': 'mar',
2308 'ms': 'msa',
2309 'mt': 'mlt',
2310 'my': 'mya',
2311 'na': 'nau',
2312 'nb': 'nob',
2313 'nd': 'nde',
2314 'ne': 'nep',
2315 'ng': 'ndo',
2316 'nl': 'nld',
2317 'nn': 'nno',
2318 'no': 'nor',
2319 'nr': 'nbl',
2320 'nv': 'nav',
2321 'ny': 'nya',
2322 'oc': 'oci',
2323 'oj': 'oji',
2324 'om': 'orm',
2325 'or': 'ori',
2326 'os': 'oss',
2327 'pa': 'pan',
2328 'pi': 'pli',
2329 'pl': 'pol',
2330 'ps': 'pus',
2331 'pt': 'por',
2332 'qu': 'que',
2333 'rm': 'roh',
2334 'rn': 'run',
2335 'ro': 'ron',
2336 'ru': 'rus',
2337 'rw': 'kin',
2338 'sa': 'san',
2339 'sc': 'srd',
2340 'sd': 'snd',
2341 'se': 'sme',
2342 'sg': 'sag',
2343 'si': 'sin',
2344 'sk': 'slk',
2345 'sl': 'slv',
2346 'sm': 'smo',
2347 'sn': 'sna',
2348 'so': 'som',
2349 'sq': 'sqi',
2350 'sr': 'srp',
2351 'ss': 'ssw',
2352 'st': 'sot',
2353 'su': 'sun',
2354 'sv': 'swe',
2355 'sw': 'swa',
2356 'ta': 'tam',
2357 'te': 'tel',
2358 'tg': 'tgk',
2359 'th': 'tha',
2360 'ti': 'tir',
2361 'tk': 'tuk',
2362 'tl': 'tgl',
2363 'tn': 'tsn',
2364 'to': 'ton',
2365 'tr': 'tur',
2366 'ts': 'tso',
2367 'tt': 'tat',
2368 'tw': 'twi',
2369 'ty': 'tah',
2370 'ug': 'uig',
2371 'uk': 'ukr',
2372 'ur': 'urd',
2373 'uz': 'uzb',
2374 've': 'ven',
2375 'vi': 'vie',
2376 'vo': 'vol',
2377 'wa': 'wln',
2378 'wo': 'wol',
2379 'xh': 'xho',
2380 'yi': 'yid',
2381 'yo': 'yor',
2382 'za': 'zha',
2383 'zh': 'zho',
2384 'zu': 'zul',
2385 }
2386
2387 @classmethod
2388 def short2long(cls, code):
2389 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2390 return cls._lang_map.get(code[:2])
2391
2392 @classmethod
2393 def long2short(cls, code):
2394 """Convert language code from ISO 639-2/T to ISO 639-1"""
2395 for short_name, long_name in cls._lang_map.items():
2396 if long_name == code:
2397 return short_name
2398
2399
2400class ISO3166Utils(object):
2401 # From http://data.okfn.org/data/core/country-list
2402 _country_map = {
2403 'AF': 'Afghanistan',
2404 'AX': 'Åland Islands',
2405 'AL': 'Albania',
2406 'DZ': 'Algeria',
2407 'AS': 'American Samoa',
2408 'AD': 'Andorra',
2409 'AO': 'Angola',
2410 'AI': 'Anguilla',
2411 'AQ': 'Antarctica',
2412 'AG': 'Antigua and Barbuda',
2413 'AR': 'Argentina',
2414 'AM': 'Armenia',
2415 'AW': 'Aruba',
2416 'AU': 'Australia',
2417 'AT': 'Austria',
2418 'AZ': 'Azerbaijan',
2419 'BS': 'Bahamas',
2420 'BH': 'Bahrain',
2421 'BD': 'Bangladesh',
2422 'BB': 'Barbados',
2423 'BY': 'Belarus',
2424 'BE': 'Belgium',
2425 'BZ': 'Belize',
2426 'BJ': 'Benin',
2427 'BM': 'Bermuda',
2428 'BT': 'Bhutan',
2429 'BO': 'Bolivia, Plurinational State of',
2430 'BQ': 'Bonaire, Sint Eustatius and Saba',
2431 'BA': 'Bosnia and Herzegovina',
2432 'BW': 'Botswana',
2433 'BV': 'Bouvet Island',
2434 'BR': 'Brazil',
2435 'IO': 'British Indian Ocean Territory',
2436 'BN': 'Brunei Darussalam',
2437 'BG': 'Bulgaria',
2438 'BF': 'Burkina Faso',
2439 'BI': 'Burundi',
2440 'KH': 'Cambodia',
2441 'CM': 'Cameroon',
2442 'CA': 'Canada',
2443 'CV': 'Cape Verde',
2444 'KY': 'Cayman Islands',
2445 'CF': 'Central African Republic',
2446 'TD': 'Chad',
2447 'CL': 'Chile',
2448 'CN': 'China',
2449 'CX': 'Christmas Island',
2450 'CC': 'Cocos (Keeling) Islands',
2451 'CO': 'Colombia',
2452 'KM': 'Comoros',
2453 'CG': 'Congo',
2454 'CD': 'Congo, the Democratic Republic of the',
2455 'CK': 'Cook Islands',
2456 'CR': 'Costa Rica',
2457 'CI': 'Côte d\'Ivoire',
2458 'HR': 'Croatia',
2459 'CU': 'Cuba',
2460 'CW': 'Curaçao',
2461 'CY': 'Cyprus',
2462 'CZ': 'Czech Republic',
2463 'DK': 'Denmark',
2464 'DJ': 'Djibouti',
2465 'DM': 'Dominica',
2466 'DO': 'Dominican Republic',
2467 'EC': 'Ecuador',
2468 'EG': 'Egypt',
2469 'SV': 'El Salvador',
2470 'GQ': 'Equatorial Guinea',
2471 'ER': 'Eritrea',
2472 'EE': 'Estonia',
2473 'ET': 'Ethiopia',
2474 'FK': 'Falkland Islands (Malvinas)',
2475 'FO': 'Faroe Islands',
2476 'FJ': 'Fiji',
2477 'FI': 'Finland',
2478 'FR': 'France',
2479 'GF': 'French Guiana',
2480 'PF': 'French Polynesia',
2481 'TF': 'French Southern Territories',
2482 'GA': 'Gabon',
2483 'GM': 'Gambia',
2484 'GE': 'Georgia',
2485 'DE': 'Germany',
2486 'GH': 'Ghana',
2487 'GI': 'Gibraltar',
2488 'GR': 'Greece',
2489 'GL': 'Greenland',
2490 'GD': 'Grenada',
2491 'GP': 'Guadeloupe',
2492 'GU': 'Guam',
2493 'GT': 'Guatemala',
2494 'GG': 'Guernsey',
2495 'GN': 'Guinea',
2496 'GW': 'Guinea-Bissau',
2497 'GY': 'Guyana',
2498 'HT': 'Haiti',
2499 'HM': 'Heard Island and McDonald Islands',
2500 'VA': 'Holy See (Vatican City State)',
2501 'HN': 'Honduras',
2502 'HK': 'Hong Kong',
2503 'HU': 'Hungary',
2504 'IS': 'Iceland',
2505 'IN': 'India',
2506 'ID': 'Indonesia',
2507 'IR': 'Iran, Islamic Republic of',
2508 'IQ': 'Iraq',
2509 'IE': 'Ireland',
2510 'IM': 'Isle of Man',
2511 'IL': 'Israel',
2512 'IT': 'Italy',
2513 'JM': 'Jamaica',
2514 'JP': 'Japan',
2515 'JE': 'Jersey',
2516 'JO': 'Jordan',
2517 'KZ': 'Kazakhstan',
2518 'KE': 'Kenya',
2519 'KI': 'Kiribati',
2520 'KP': 'Korea, Democratic People\'s Republic of',
2521 'KR': 'Korea, Republic of',
2522 'KW': 'Kuwait',
2523 'KG': 'Kyrgyzstan',
2524 'LA': 'Lao People\'s Democratic Republic',
2525 'LV': 'Latvia',
2526 'LB': 'Lebanon',
2527 'LS': 'Lesotho',
2528 'LR': 'Liberia',
2529 'LY': 'Libya',
2530 'LI': 'Liechtenstein',
2531 'LT': 'Lithuania',
2532 'LU': 'Luxembourg',
2533 'MO': 'Macao',
2534 'MK': 'Macedonia, the Former Yugoslav Republic of',
2535 'MG': 'Madagascar',
2536 'MW': 'Malawi',
2537 'MY': 'Malaysia',
2538 'MV': 'Maldives',
2539 'ML': 'Mali',
2540 'MT': 'Malta',
2541 'MH': 'Marshall Islands',
2542 'MQ': 'Martinique',
2543 'MR': 'Mauritania',
2544 'MU': 'Mauritius',
2545 'YT': 'Mayotte',
2546 'MX': 'Mexico',
2547 'FM': 'Micronesia, Federated States of',
2548 'MD': 'Moldova, Republic of',
2549 'MC': 'Monaco',
2550 'MN': 'Mongolia',
2551 'ME': 'Montenegro',
2552 'MS': 'Montserrat',
2553 'MA': 'Morocco',
2554 'MZ': 'Mozambique',
2555 'MM': 'Myanmar',
2556 'NA': 'Namibia',
2557 'NR': 'Nauru',
2558 'NP': 'Nepal',
2559 'NL': 'Netherlands',
2560 'NC': 'New Caledonia',
2561 'NZ': 'New Zealand',
2562 'NI': 'Nicaragua',
2563 'NE': 'Niger',
2564 'NG': 'Nigeria',
2565 'NU': 'Niue',
2566 'NF': 'Norfolk Island',
2567 'MP': 'Northern Mariana Islands',
2568 'NO': 'Norway',
2569 'OM': 'Oman',
2570 'PK': 'Pakistan',
2571 'PW': 'Palau',
2572 'PS': 'Palestine, State of',
2573 'PA': 'Panama',
2574 'PG': 'Papua New Guinea',
2575 'PY': 'Paraguay',
2576 'PE': 'Peru',
2577 'PH': 'Philippines',
2578 'PN': 'Pitcairn',
2579 'PL': 'Poland',
2580 'PT': 'Portugal',
2581 'PR': 'Puerto Rico',
2582 'QA': 'Qatar',
2583 'RE': 'Réunion',
2584 'RO': 'Romania',
2585 'RU': 'Russian Federation',
2586 'RW': 'Rwanda',
2587 'BL': 'Saint Barthélemy',
2588 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2589 'KN': 'Saint Kitts and Nevis',
2590 'LC': 'Saint Lucia',
2591 'MF': 'Saint Martin (French part)',
2592 'PM': 'Saint Pierre and Miquelon',
2593 'VC': 'Saint Vincent and the Grenadines',
2594 'WS': 'Samoa',
2595 'SM': 'San Marino',
2596 'ST': 'Sao Tome and Principe',
2597 'SA': 'Saudi Arabia',
2598 'SN': 'Senegal',
2599 'RS': 'Serbia',
2600 'SC': 'Seychelles',
2601 'SL': 'Sierra Leone',
2602 'SG': 'Singapore',
2603 'SX': 'Sint Maarten (Dutch part)',
2604 'SK': 'Slovakia',
2605 'SI': 'Slovenia',
2606 'SB': 'Solomon Islands',
2607 'SO': 'Somalia',
2608 'ZA': 'South Africa',
2609 'GS': 'South Georgia and the South Sandwich Islands',
2610 'SS': 'South Sudan',
2611 'ES': 'Spain',
2612 'LK': 'Sri Lanka',
2613 'SD': 'Sudan',
2614 'SR': 'Suriname',
2615 'SJ': 'Svalbard and Jan Mayen',
2616 'SZ': 'Swaziland',
2617 'SE': 'Sweden',
2618 'CH': 'Switzerland',
2619 'SY': 'Syrian Arab Republic',
2620 'TW': 'Taiwan, Province of China',
2621 'TJ': 'Tajikistan',
2622 'TZ': 'Tanzania, United Republic of',
2623 'TH': 'Thailand',
2624 'TL': 'Timor-Leste',
2625 'TG': 'Togo',
2626 'TK': 'Tokelau',
2627 'TO': 'Tonga',
2628 'TT': 'Trinidad and Tobago',
2629 'TN': 'Tunisia',
2630 'TR': 'Turkey',
2631 'TM': 'Turkmenistan',
2632 'TC': 'Turks and Caicos Islands',
2633 'TV': 'Tuvalu',
2634 'UG': 'Uganda',
2635 'UA': 'Ukraine',
2636 'AE': 'United Arab Emirates',
2637 'GB': 'United Kingdom',
2638 'US': 'United States',
2639 'UM': 'United States Minor Outlying Islands',
2640 'UY': 'Uruguay',
2641 'UZ': 'Uzbekistan',
2642 'VU': 'Vanuatu',
2643 'VE': 'Venezuela, Bolivarian Republic of',
2644 'VN': 'Viet Nam',
2645 'VG': 'Virgin Islands, British',
2646 'VI': 'Virgin Islands, U.S.',
2647 'WF': 'Wallis and Futuna',
2648 'EH': 'Western Sahara',
2649 'YE': 'Yemen',
2650 'ZM': 'Zambia',
2651 'ZW': 'Zimbabwe',
2652 }
2653
2654 @classmethod
2655 def short2full(cls, code):
2656 """Convert an ISO 3166-2 country code to the corresponding full name"""
2657 return cls._country_map.get(code.upper())
2658
2659
91410c9b 2660class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2661 def __init__(self, proxies=None):
2662 # Set default handlers
2663 for type in ('http', 'https'):
2664 setattr(self, '%s_open' % type,
2665 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
2666 meth(r, proxy, type))
2667 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2668
91410c9b 2669 def proxy_open(self, req, proxy, type):
2461f79d 2670 req_proxy = req.headers.get('Ytdl-request-proxy')
2671 if req_proxy is not None:
2672 proxy = req_proxy
2673 del req.headers['Ytdl-request-proxy']
2674
2675 if proxy == '__noproxy__':
2676 return None # No Proxy
2677 return compat_urllib_request.ProxyHandler.proxy_open(
2678 self, req, proxy, type)
2679
2680
2681def ohdave_rsa_encrypt(data, exponent, modulus):
2682 '''
2683 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
2684
2685 Input:
2686 data: data to encrypt, bytes-like object
2687 exponent, modulus: parameters e and N of the RSA algorithm, both integers
2688 Output: hex string of encrypted data
2689
2690 Limitation: supports one block encryption only
2691 '''
2692
2693 payload = int(binascii.hexlify(data[::-1]), 16)
2694 encrypted = pow(payload, exponent, modulus)
2695 return '%x' % encrypted
2696
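# Worked toy example for ohdave_rsa_encrypt() (tiny parameters, not a real
# RSA key): the payload is the little-endian integer of the input bytes,
# here 0x0102 == 258, and pow(258, 3, 1000) == 512 == 0x200.
from youtube_dl.utils import ohdave_rsa_encrypt

assert ohdave_rsa_encrypt(b'\x02\x01', 3, 1000) == '200'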
2697
5eb6bdce 2698def encode_base_n(num, n, table=None):
59f898b7 2699 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
2700 if not table:
2701 table = FULL_TABLE[:n]
2702
2703 if n > len(table):
2704 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
2705
2706 if num == 0:
2707 return table[0]
2708
2709 ret = ''
2710 while num:
2711 ret = table[num % n] + ret
2712 num = num // n
2713 return ret
2714
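# encode_base_n() sketch: with the default table, 80 in base 30 is '2k';
# a custom alphabet may also be supplied.
from youtube_dl.utils import encode_base_n

assert encode_base_n(0, 30) == '0'
assert encode_base_n(80, 30) == '2k'
assert encode_base_n(5, 2, table='ab') == 'bab'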
2715
2716def decode_packed_codes(code):
2717 mobj = re.search(
680079be 2718 r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
2719 code)
2720 obfuscated_code, base, count, symbols = mobj.groups()
2721 base = int(base)
2722 count = int(count)
2723 symbols = symbols.split('|')
2724 symbol_table = {}
2725
2726 while count:
2727 count -= 1
5eb6bdce 2728 base_n_count = encode_base_n(count, base)
2729 symbol_table[base_n_count] = symbols[count] or base_n_count
2730
2731 return re.sub(
2732 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
2733 obfuscated_code)