#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

import base64
import binascii
import calendar
import codecs
import contextlib
import ctypes
import datetime
import email.utils
import errno
import functools
import gzip
import itertools
import io
import json
import locale
import math
import operator
import os
import pipes
import platform
import re
import ssl
import socket
import struct
import subprocess
import sys
import tempfile
import traceback
import xml.etree.ElementTree
import zlib

from .compat import (
    compat_HTMLParser,
    compat_basestring,
    compat_chr,
    compat_etree_fromstring,
    compat_html_entities,
    compat_http_client,
    compat_kwargs,
    compat_parse_qs,
    compat_socket_create_connection,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
    shlex_quote,
)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if not m:
        return None
    res = m.group('content')

    if res.startswith('"') or res.startswith("'"):
        res = res[1:-1]

    return unescapeHTML(res)


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    parser.feed(html_element)
    parser.close()
    return parser.attrs
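# Illustrative usage (not part of the original module): the parser keeps the
# attributes of the last start tag it sees, so for a single element string
#   extract_attributes('<el a="foo" noval>') should yield {'a': 'foo', 'noval': None}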


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
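# Illustrative examples (not part of the original module), assuming default arguments:
#   sanitize_filename('abc') -> 'abc'
#   sanitize_filename('A/B: C?') -> 'A_B - C'
#   sanitize_filename('A/B: C?', restricted=True) -> 'A_B_-_C'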


def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)


# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitize_url(url):
    return 'http:%s' % url if url.startswith('//') else url


def sanitized_Request(url, *args, **kwargs):
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character."""
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s)


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


def formatSeconds(secs):
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs


def make_HTTPS_handler(params, **kwargs):
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message():
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg


class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        self.msg = msg


class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname
        return self.do_open(functools.partial(
            _create_http_connection, self, self._https_conn_class, True),
            req, **kwargs)


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        m = re.search(
            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if not m:
            timezone = datetime.timedelta()
        else:
            date_str = date_str[:-len(m.group(0))]
            if not m.group('sign'):
                timezone = datetime.timedelta()
            else:
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))
    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        pass
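# Illustrative example (not part of the original module):
#   parse_iso8601('2014-12-05T12:30:00Z') should return 1417782600 (a UTC timestamp);
#   a string that does not parse falls through and implicitly returns None.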


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M',
        '%b %dnd %Y %I:%M',
        '%b %dth %Y %I:%M',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions.extend([
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    if upload_date is not None:
        return compat_str(upload_date)
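# Illustrative examples (not part of the original module):
#   unified_strdate('December 21, 2010') -> '20101221'
#   unified_strdate('1968/12/10') -> '19681210'
#   unified_strdate('28/01/2014 21:00:00 +0100') -> '20140128'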


def determine_ext(url, default_ext='unknown_video'):
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
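# Illustrative examples (not part of the original module):
#   determine_ext('http://example.com/video.mp4') -> 'mp4'
#   determine_ext('http://example.com/foo/bar.mp4/?download') -> 'mp4'
#   determine_ext('http://example.com/play?id=42') -> 'unknown_video' (the default)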


def subtitles_filename(filename, sub_lang, sub_format):
    return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format


def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str


class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
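# Illustrative usage (not part of the original module):
#   '20141126' in DateRange('20141101', '20141201') -> True
#   '20141210' in DateRange('20141101', '20141201') -> False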


def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res


def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True


def write_string(s, out=None, encoding=None):
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()


def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return struct_pack('%dB' % len(xs), *xs)


# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)


class locked_file(object):
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)


def get_filesystem_encoding():
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'


def shell_quote(args):
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)


def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
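# Illustrative round trip (not part of the original module):
#   url = smuggle_url('http://example.com/video', {'source': 'web'})
#   unsmuggle_url(url) -> ('http://example.com/video', {'source': 'web'})
# A URL without the smuggle fragment is returned unchanged along with the supplied default.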


def format_bytes(bytes):
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)


def lookup_unit_table(unit_table, s):
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)


def parse_filesize(s):
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)


def parse_count(s):
    if s is None:
        return None

    s = s.strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
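# Illustrative examples (not part of the original module), both built on lookup_unit_table():
#   parse_filesize('5 MiB') -> 5242880        (binary multiplier)
#   parse_filesize('1.2Tb') -> 1200000000000  (decimal multiplier)
#   parse_count('1.1k') -> 1100
#   parse_count('12,345') -> 12345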


def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        return None


def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None


def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)


def setproctitle(title):
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this


def remove_start(s, start):
    if s.startswith(start):
        return s[len(start):]
    return s


def remove_end(s, end):
    if s.endswith(end):
        return s[:-len(end)]
    return s


def remove_quotes(s):
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s


def url_basename(url):
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').split('/')[-1]
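# Illustrative examples (not part of the original module):
#   url_basename('http://media.w3.org/2010/05/sintel/trailer.mp4') -> 'trailer.mp4'
#   url_basename('http://example.com/path/?a=b#frag') -> 'path' (query and fragment are ignored)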


class HEADRequest(compat_urllib_request.Request):
    def get_method(self):
        return 'HEAD'


def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except ValueError:
        return default


def str_or_none(v, default=None):
    return default if v is None else compat_str(v)


def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    int_str = re.sub(r'[,\.\+]', '', int_str)
    return int(int_str)


def float_or_none(v, scale=1, invscale=1, default=None):
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except ValueError:
        return default


def parse_duration(s):
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    days, hours, mins, secs, ms = [None] * 5
    m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
    if m:
        days, hours, mins, secs, ms = m.groups()
    else:
        m = re.match(
            r'''(?ix)(?:P?T)?
                (?:
                    (?P<days>[0-9]+)\s*d(?:ays?)?\s*
                )?
                (?:
                    (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
                )?
                (?:
                    (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
                )?
                (?:
                    (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
                )?$''', s)
        if m:
            days, hours, mins, secs, ms = m.groups()
        else:
            m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
            if m:
                hours, mins = m.groups()
            else:
                return None

    duration = 0
    if secs:
        duration += float(secs)
    if mins:
        duration += float(mins) * 60
    if hours:
        duration += float(hours) * 60 * 60
    if days:
        duration += float(days) * 24 * 60 * 60
    if ms:
        duration += float(ms)
    return duration
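# Illustrative examples (not part of the original module):
#   parse_duration('1:02:03') -> 3723.0
#   parse_duration('PT1H30M') -> 5400.0
#   parse_duration('3 min') -> 180.0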
91d7d0b3
JMF
1583
1584
e65e4c88 1585def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 1586 name, real_ext = os.path.splitext(filename)
e65e4c88
S
1587 return (
1588 '{0}.{1}{2}'.format(name, ext, real_ext)
1589 if not expected_real_ext or real_ext[1:] == expected_real_ext
1590 else '{0}.{1}'.format(filename, ext))
d70ad093
PH
1591
1592
b3ed15b7
S
1593def replace_extension(filename, ext, expected_real_ext=None):
1594 name, real_ext = os.path.splitext(filename)
1595 return '{0}.{1}'.format(
1596 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1597 ext)
1598
1599
d70ad093
PH
1600def check_executable(exe, args=[]):
1601 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1602 args can be a list of arguments for a short output (like -version) """
1603 try:
1604 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1605 except OSError:
1606 return False
1607 return exe
b7ab0590
PH
1608
1609
95807118 1610def get_exe_version(exe, args=['--version'],
cae97f65 1611 version_re=None, unrecognized='present'):
95807118
PH
1612 """ Returns the version of the specified executable,
1613 or False if the executable is not present """
1614 try:
cae97f65 1615 out, _ = subprocess.Popen(
54116803 1616 [encodeArgument(exe)] + args,
95807118
PH
1617 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1618 except OSError:
1619 return False
cae97f65
PH
1620 if isinstance(out, bytes): # Python 2.x
1621 out = out.decode('ascii', 'ignore')
1622 return detect_exe_version(out, version_re, unrecognized)
1623
1624
1625def detect_exe_version(output, version_re=None, unrecognized='present'):
1626 assert isinstance(output, compat_str)
1627 if version_re is None:
1628 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1629 m = re.search(version_re, output)
95807118
PH
1630 if m:
1631 return m.group(1)
1632 else:
1633 return unrecognized
1634
1635
b7ab0590 1636class PagedList(object):
dd26ced1
PH
1637 def __len__(self):
1638 # This is only useful for tests
1639 return len(self.getslice())
1640
9c44d242
PH
1641
1642class OnDemandPagedList(PagedList):
b95dc034 1643 def __init__(self, pagefunc, pagesize, use_cache=False):
9c44d242
PH
1644 self._pagefunc = pagefunc
1645 self._pagesize = pagesize
b95dc034
YCH
1646 self._use_cache = use_cache
1647 if use_cache:
1648 self._cache = {}
9c44d242 1649
b7ab0590
PH
1650 def getslice(self, start=0, end=None):
1651 res = []
1652 for pagenum in itertools.count(start // self._pagesize):
1653 firstid = pagenum * self._pagesize
1654 nextfirstid = pagenum * self._pagesize + self._pagesize
1655 if start >= nextfirstid:
1656 continue
1657
b95dc034
YCH
1658 page_results = None
1659 if self._use_cache:
1660 page_results = self._cache.get(pagenum)
1661 if page_results is None:
1662 page_results = list(self._pagefunc(pagenum))
1663 if self._use_cache:
1664 self._cache[pagenum] = page_results
b7ab0590
PH
1665
1666 startv = (
1667 start % self._pagesize
1668 if firstid <= start < nextfirstid
1669 else 0)
1670
1671 endv = (
1672 ((end - 1) % self._pagesize) + 1
1673 if (end is not None and firstid <= end <= nextfirstid)
1674 else None)
1675
1676 if startv != 0 or endv is not None:
1677 page_results = page_results[startv:endv]
1678 res.extend(page_results)
1679
1680 # A little optimization - if the current page is not "full", i.e. does
1681 # not contain page_size videos, then we can assume that this page is
1682 # the last one - there are no more ids on further pages - so there is
1683 # no need to query again.
1684 if len(page_results) + startv < self._pagesize:
1685 break
1686
1687 # If we got the whole page, but the next page is not interesting,
1688 # break out early as well
1689 if end == nextfirstid:
1690 break
1691 return res
81c2f20b
PH
1692
1693
9c44d242
PH
1694class InAdvancePagedList(PagedList):
1695 def __init__(self, pagefunc, pagecount, pagesize):
1696 self._pagefunc = pagefunc
1697 self._pagecount = pagecount
1698 self._pagesize = pagesize
1699
1700 def getslice(self, start=0, end=None):
1701 res = []
1702 start_page = start // self._pagesize
1703 end_page = (
1704 self._pagecount if end is None else (end // self._pagesize + 1))
1705 skip_elems = start - start_page * self._pagesize
1706 only_more = None if end is None else end - start
1707 for pagenum in range(start_page, end_page):
1708 page = list(self._pagefunc(pagenum))
1709 if skip_elems:
1710 page = page[skip_elems:]
1711 skip_elems = None
1712 if only_more is not None:
1713 if len(page) < only_more:
1714 only_more -= len(page)
1715 else:
1716 page = page[:only_more]
1717 res.extend(page)
1718 break
1719 res.extend(page)
1720 return res
1721
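# Usage sketch (editor's illustration; the page function is hypothetical):
# both classes expose the same getslice() interface and only differ in
# whether the total number of pages is known in advance.
#   def _example_page(pagenum):   # 10 fake ids per page, 35 ids in total
#       first = pagenum * 10
#       return list(range(first, min(first + 10, 35)))
#   OnDemandPagedList(_example_page, 10).getslice(5, 25)   -> [5, 6, ..., 24]
#   InAdvancePagedList(_example_page, 4, 10).getslice(30)  -> [30, 31, 32, 33, 34]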
1722
81c2f20b 1723def uppercase_escape(s):
676eb3f2 1724 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 1725 return re.sub(
a612753d 1726 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
1727 lambda m: unicode_escape(m.group(0))[0],
1728 s)
0fe2ff78
YCH
1729
1730
1731def lowercase_escape(s):
1732 unicode_escape = codecs.getdecoder('unicode_escape')
1733 return re.sub(
1734 r'\\u[0-9a-fA-F]{4}',
1735 lambda m: unicode_escape(m.group(0))[0],
1736 s)
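# Usage sketch (editor's illustration): these undo literal \uXXXX / \UXXXXXXXX
# escapes as they appear in scraped JavaScript or JSON-ish blobs:
#   lowercase_escape('caf\\u00e9')   -> 'café'
#   uppercase_escape('\\U0001F600')  -> the single character U+1F600 (😀)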
b53466e1 1737
d05cfe06
S
1738
1739def escape_rfc3986(s):
1740 """Escape non-ASCII characters as suggested by RFC 3986"""
8f9312c3 1741 if sys.version_info < (3, 0) and isinstance(s, compat_str):
d05cfe06 1742 s = s.encode('utf-8')
ecc0c5ee 1743 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1744
1745
1746def escape_url(url):
1747 """Escape URL as suggested by RFC 3986"""
1748 url_parsed = compat_urllib_parse_urlparse(url)
1749 return url_parsed._replace(
efbed08d 1750 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
1751 path=escape_rfc3986(url_parsed.path),
1752 params=escape_rfc3986(url_parsed.params),
1753 query=escape_rfc3986(url_parsed.query),
1754 fragment=escape_rfc3986(url_parsed.fragment)
1755 ).geturl()
1756
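# Usage sketch (editor's illustration; the URL is made up): non-ASCII URLs are
# made request-safe by IDNA-encoding the host and percent-encoding the rest:
#   escape_url('http://example.com/café?q=söng')
#   -> 'http://example.com/caf%C3%A9?q=s%C3%B6ng'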
b53466e1 1757try:
28e614de 1758 struct.pack('!I', 0)
b53466e1 1759except TypeError:
622d1916
YCH
1760 # In Python 2.6 and 2.7.x < 2.7.7, struct requires a bytes argument
1761 # See https://bugs.python.org/issue19099
b53466e1
PH
1762 def struct_pack(spec, *args):
1763 if isinstance(spec, compat_str):
1764 spec = spec.encode('ascii')
1765 return struct.pack(spec, *args)
1766
1767 def struct_unpack(spec, *args):
1768 if isinstance(spec, compat_str):
1769 spec = spec.encode('ascii')
1770 return struct.unpack(spec, *args)
1771else:
1772 struct_pack = struct.pack
1773 struct_unpack = struct.unpack
62e609ab
PH
1774
1775
1776def read_batch_urls(batch_fd):
1777 def fixup(url):
1778 if not isinstance(url, compat_str):
1779 url = url.decode('utf-8', 'replace')
28e614de 1780 BOM_UTF8 = '\xef\xbb\xbf'
62e609ab
PH
1781 if url.startswith(BOM_UTF8):
1782 url = url[len(BOM_UTF8):]
1783 url = url.strip()
1784 if url.startswith(('#', ';', ']')):
1785 return False
1786 return url
1787
1788 with contextlib.closing(batch_fd) as fd:
1789 return [url for url in map(fixup, fd) if url]
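# Usage sketch (editor's illustration): any iterable of lines works, e.g. an
# io.StringIO standing in for a --batch-file handle; comment lines and blanks
# are dropped:
#   read_batch_urls(io.StringIO('# a comment\nhttps://example.com/a\n\n'))
#   -> ['https://example.com/a']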
b74fa8cd
JMF
1790
1791
1792def urlencode_postdata(*args, **kargs):
15707c7e 1793 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
1794
1795
38f9ef31 1796def update_url_query(url, query):
cacd9966
YCH
1797 if not query:
1798 return url
38f9ef31 1799 parsed_url = compat_urlparse.urlparse(url)
1800 qs = compat_parse_qs(parsed_url.query)
1801 qs.update(query)
1802 return compat_urlparse.urlunparse(parsed_url._replace(
15707c7e 1803 query=compat_urllib_parse_urlencode(qs, True)))
16392824 1804
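# Usage sketch (editor's illustration; parameter order in the result may vary
# between Python versions):
#   update_url_query('http://example.com/path?a=1', {'b': 'two'})
#   -> 'http://example.com/path?a=1&b=two'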
8e60dc75 1805
ed0291d1
S
1806def update_Request(req, url=None, data=None, headers={}, query={}):
1807 req_headers = req.headers.copy()
1808 req_headers.update(headers)
1809 req_data = data or req.data
1810 req_url = update_url_query(url or req.get_full_url(), query)
1811 req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
1812 new_req = req_type(
1813 req_url, data=req_data, headers=req_headers,
1814 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
1815 if hasattr(req, 'timeout'):
1816 new_req.timeout = req.timeout
1817 return new_req
1818
1819
86296ad2 1820def dict_get(d, key_or_keys, default=None, skip_false_values=True):
cbecc9b9
S
1821 if isinstance(key_or_keys, (list, tuple)):
1822 for key in key_or_keys:
86296ad2
S
1823 if key not in d or d[key] is None or skip_false_values and not d[key]:
1824 continue
1825 return d[key]
cbecc9b9
S
1826 return default
1827 return d.get(key_or_keys, default)
1828
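# Usage sketch (editor's illustration; the metadata dict is made up):
# dict_get() returns the first usable value among several candidate keys,
# skipping None and - by default - other falsy values:
#   meta = {'title': '', 'fulltitle': 'Some video'}
#   dict_get(meta, ('title', 'fulltitle'))                           -> 'Some video'
#   dict_get(meta, ('title', 'fulltitle'), skip_false_values=False)  -> ''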
1829
8e60dc75
S
1830def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
1831 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
1832
16392824 1833
a1a530b0
PH
1834US_RATINGS = {
1835 'G': 0,
1836 'PG': 10,
1837 'PG-13': 13,
1838 'R': 16,
1839 'NC': 18,
1840}
fac55558
PH
1841
1842
146c80e2
S
1843def parse_age_limit(s):
1844 if s is None:
d838b1bd 1845 return None
146c80e2 1846 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
d800609c 1847 return int(m.group('age')) if m else US_RATINGS.get(s)
146c80e2
S
1848
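# Usage sketch (editor's illustration):
#   parse_age_limit('18+')    -> 18
#   parse_age_limit('PG-13')  -> 13  (via US_RATINGS)
#   parse_age_limit(None)     -> None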
1849
fac55558 1850def strip_jsonp(code):
609a61e3 1851 return re.sub(
8411229b 1852 r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
478c2c61
PH
1853
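# Usage sketch (editor's illustration; the callback name is made up): strips
# the JSONP wrapper so the payload can be fed to json.loads():
#   strip_jsonp('callback({"id": "x"});')  -> '{"id": "x"}'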
1854
e05f6939
PH
1855def js_to_json(code):
1856 def fix_kv(m):
e7b6d122
PH
1857 v = m.group(0)
1858 if v in ('true', 'false', 'null'):
1859 return v
1860 if v.startswith('"'):
d01949dc
S
1861 v = re.sub(r"\\'", "'", v[1:-1])
1862 elif v.startswith("'"):
e7b6d122
PH
1863 v = v[1:-1]
1864 v = re.sub(r"\\\\|\\'|\"", lambda m: {
1865 '\\\\': '\\\\',
1866 "\\'": "'",
1867 '"': '\\"',
1868 }[m.group(0)], v)
1869 return '"%s"' % v
e05f6939
PH
1870
1871 res = re.sub(r'''(?x)
d305dd73
PH
1872 "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
1873 '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
8f4b58d7 1874 [a-zA-Z_][.a-zA-Z_0-9]*
e05f6939 1875 ''', fix_kv, code)
ba9e68f4 1876 res = re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
e05f6939
PH
1877 return res
1878
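# Usage sketch (editor's illustration): relaxed JavaScript object literals, as
# commonly found in embedded player configs, become strict JSON:
#   js_to_json("{foo: 'bar', baz: [1, 2,]}")
#   -> '{"foo": "bar", "baz": [1, 2]}'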
1879
478c2c61
PH
1880def qualities(quality_ids):
1881 """ Get a numeric quality value out of a list of possible values """
1882 def q(qid):
1883 try:
1884 return quality_ids.index(qid)
1885 except ValueError:
1886 return -1
1887 return q
1888
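# Usage sketch (editor's illustration):
#   q = qualities(['240p', '360p', '720p', '1080p'])
#   q('720p')  -> 2
#   q('480p')  -> -1   (unknown qualities sort below everything else)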
acd69589
PH
1889
1890DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1891
a020a0dc
PH
1892
1893def limit_length(s, length):
1894 """ Add ellipses to overly long strings """
1895 if s is None:
1896 return None
1897 ELLIPSES = '...'
1898 if len(s) > length:
1899 return s[:length - len(ELLIPSES)] + ELLIPSES
1900 return s
48844745
PH
1901
1902
1903def version_tuple(v):
5f9b8394 1904 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
1905
1906
1907def is_outdated_version(version, limit, assume_new=True):
1908 if not version:
1909 return not assume_new
1910 try:
1911 return version_tuple(version) < version_tuple(limit)
1912 except ValueError:
1913 return not assume_new
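# Usage sketch (editor's illustration):
#   version_tuple('2016.04.24')                      -> (2016, 4, 24)
#   is_outdated_version('2016.04.01', '2016.04.24')  -> True
#   is_outdated_version('n/a', '2016.04.24')         -> False  (unparsable versions are assumed new)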
732ea2f0
PH
1914
1915
1916def ytdl_is_updateable():
1917 """ Returns whether youtube-dl can be updated with -U """
1918 from zipimport import zipimporter
1919
1920 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
7d4111ed
PH
1921
1922
1923def args_to_str(args):
1924 # Get a short string representation for a subprocess command
1925 return ' '.join(shlex_quote(a) for a in args)
2ccd1b10
PH
1926
1927
9b9c5355 1928def error_to_compat_str(err):
fdae2358
S
1929 err_str = str(err)
1930 # On Python 2 the error's byte string must be decoded with the proper
1931 # encoding rather than ASCII
1932 if sys.version_info[0] < 3:
1933 err_str = err_str.decode(preferredencoding())
1934 return err_str
1935
1936
c460bdd5 1937def mimetype2ext(mt):
765ac263
JMF
1938 ext = {
1939 'audio/mp4': 'm4a',
1940 }.get(mt)
1941 if ext is not None:
1942 return ext
1943
c460bdd5
PH
1944 _, _, res = mt.rpartition('/')
1945
1946 return {
f6861ec9 1947 '3gpp': '3gp',
cafcf657 1948 'smptett+xml': 'tt',
1949 'srt': 'srt',
1950 'ttaf+xml': 'dfxp',
a0d8d704 1951 'ttml+xml': 'ttml',
cafcf657 1952 'vtt': 'vtt',
f6861ec9 1953 'x-flv': 'flv',
a0d8d704
YCH
1954 'x-mp4-fragmented': 'mp4',
1955 'x-ms-wmv': 'wmv',
c460bdd5
PH
1956 }.get(res, res)
1957
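# Usage sketch (editor's illustration): known MIME types map to the preferred
# extensions above, anything else falls through to the bare subtype:
#   mimetype2ext('audio/mp4')            -> 'm4a'
#   mimetype2ext('video/x-flv')          -> 'flv'
#   mimetype2ext('application/ttml+xml') -> 'ttml'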
1958
2ccd1b10
PH
1959def urlhandle_detect_ext(url_handle):
1960 try:
1961 url_handle.headers
1962 getheader = lambda h: url_handle.headers[h]
1963 except AttributeError: # Python < 3
1964 getheader = url_handle.info().getheader
1965
b55ee18f
PH
1966 cd = getheader('Content-Disposition')
1967 if cd:
1968 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
1969 if m:
1970 e = determine_ext(m.group('filename'), default_ext=None)
1971 if e:
1972 return e
1973
c460bdd5 1974 return mimetype2ext(getheader('Content-Type'))
05900629
PH
1975
1976
1e399778
YCH
1977def encode_data_uri(data, mime_type):
1978 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
1979
1980
05900629 1981def age_restricted(content_limit, age_limit):
6ec6cb4e 1982 """ Returns True iff the content should be blocked """
05900629
PH
1983
1984 if age_limit is None: # No limit set
1985 return False
1986 if content_limit is None:
1987 return False # Content available for everyone
1988 return age_limit < content_limit
61ca9a80
PH
1989
1990
1991def is_html(first_bytes):
1992 """ Detect whether a file contains HTML by examining its first bytes. """
1993
1994 BOMS = [
1995 (b'\xef\xbb\xbf', 'utf-8'),
1996 (b'\x00\x00\xfe\xff', 'utf-32-be'),
1997 (b'\xff\xfe\x00\x00', 'utf-32-le'),
1998 (b'\xff\xfe', 'utf-16-le'),
1999 (b'\xfe\xff', 'utf-16-be'),
2000 ]
2001 for bom, enc in BOMS:
2002 if first_bytes.startswith(bom):
2003 s = first_bytes[len(bom):].decode(enc, 'replace')
2004 break
2005 else:
2006 s = first_bytes.decode('utf-8', 'replace')
2007
2008 return re.match(r'^\s*<', s)
a055469f
PH
2009
2010
2011def determine_protocol(info_dict):
2012 protocol = info_dict.get('protocol')
2013 if protocol is not None:
2014 return protocol
2015
2016 url = info_dict['url']
2017 if url.startswith('rtmp'):
2018 return 'rtmp'
2019 elif url.startswith('mms'):
2020 return 'mms'
2021 elif url.startswith('rtsp'):
2022 return 'rtsp'
2023
2024 ext = determine_ext(url)
2025 if ext == 'm3u8':
2026 return 'm3u8'
2027 elif ext == 'f4m':
2028 return 'f4m'
2029
2030 return compat_urllib_parse_urlparse(url).scheme
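# Usage sketch (editor's illustration; the URLs are made up):
#   determine_protocol({'url': 'rtmp://cdn.example.com/app/stream'})    -> 'rtmp'
#   determine_protocol({'url': 'https://cdn.example.com/master.m3u8'})  -> 'm3u8'
#   determine_protocol({'url': 'https://cdn.example.com/clip.mp4'})     -> 'https'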
cfb56d1a
PH
2031
2032
2033def render_table(header_row, data):
2034 """ Render a list of rows, each as a list of values """
2035 table = [header_row] + data
2036 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
2037 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
2038 return '\n'.join(format_str % tuple(row) for row in table)
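# Usage sketch (editor's illustration): each column is left-aligned to its
# widest entry, so
#   render_table(['format', 'ext'], [['137', 'mp4'], ['22', 'mp4']])
# produces roughly
#   format ext
#   137    mp4
#   22     mp4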
347de493
PH
2039
2040
2041def _match_one(filter_part, dct):
2042 COMPARISON_OPERATORS = {
2043 '<': operator.lt,
2044 '<=': operator.le,
2045 '>': operator.gt,
2046 '>=': operator.ge,
2047 '=': operator.eq,
2048 '!=': operator.ne,
2049 }
2050 operator_rex = re.compile(r'''(?x)\s*
2051 (?P<key>[a-z_]+)
2052 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2053 (?:
2054 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2055 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2056 )
2057 \s*$
2058 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2059 m = operator_rex.search(filter_part)
2060 if m:
2061 op = COMPARISON_OPERATORS[m.group('op')]
2062 if m.group('strval') is not None:
2063 if m.group('op') not in ('=', '!='):
2064 raise ValueError(
2065 'Operator %s does not support string values!' % m.group('op'))
2066 comparison_value = m.group('strval')
2067 else:
2068 try:
2069 comparison_value = int(m.group('intval'))
2070 except ValueError:
2071 comparison_value = parse_filesize(m.group('intval'))
2072 if comparison_value is None:
2073 comparison_value = parse_filesize(m.group('intval') + 'B')
2074 if comparison_value is None:
2075 raise ValueError(
2076 'Invalid integer value %r in filter part %r' % (
2077 m.group('intval'), filter_part))
2078 actual_value = dct.get(m.group('key'))
2079 if actual_value is None:
2080 return m.group('none_inclusive')
2081 return op(actual_value, comparison_value)
2082
2083 UNARY_OPERATORS = {
2084 '': lambda v: v is not None,
2085 '!': lambda v: v is None,
2086 }
2087 operator_rex = re.compile(r'''(?x)\s*
2088 (?P<op>%s)\s*(?P<key>[a-z_]+)
2089 \s*$
2090 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2091 m = operator_rex.search(filter_part)
2092 if m:
2093 op = UNARY_OPERATORS[m.group('op')]
2094 actual_value = dct.get(m.group('key'))
2095 return op(actual_value)
2096
2097 raise ValueError('Invalid filter part %r' % filter_part)
2098
2099
2100def match_str(filter_str, dct):
2101 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or False """
2102
2103 return all(
2104 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2105
2106
2107def match_filter_func(filter_str):
2108 def _match_func(info_dict):
2109 if match_str(filter_str, info_dict):
2110 return None
2111 else:
2112 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2113 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2114 return _match_func
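# Usage sketch (editor's illustration; the field values are made up): '&'
# joins clauses, '?' makes a comparison pass when the field is missing, and a
# bare key just tests presence:
#   match_str('like_count > 100 & dislike_count <? 50 & description',
#             {'like_count': 190, 'description': 'foo'})   -> True
# match_filter_func() wraps the same test for use as a --match-filter callback.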
91410c9b
PH
2115
2116
bf6427d2
YCH
2117def parse_dfxp_time_expr(time_expr):
2118 if not time_expr:
d631d5f9 2119 return
bf6427d2
YCH
2120
2121 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2122 if mobj:
2123 return float(mobj.group('time_offset'))
2124
db2fe38b 2125 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 2126 if mobj:
db2fe38b 2127 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
2128
2129
c1c924ab
YCH
2130def srt_subtitles_timecode(seconds):
2131 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
bf6427d2
YCH
2132
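# Usage sketch (editor's illustration):
#   parse_dfxp_time_expr('0.1')         -> 0.1
#   parse_dfxp_time_expr('00:01:08.5')  -> 68.5
#   srt_subtitles_timecode(3661.5)      -> '01:01:01,500'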
2133
2134def dfxp2srt(dfxp_data):
4e335771
YCH
2135 _x = functools.partial(xpath_with_ns, ns_map={
2136 'ttml': 'http://www.w3.org/ns/ttml',
2137 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
5bf28d78 2138 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
4e335771 2139 })
bf6427d2 2140
87de7069 2141 class TTMLPElementParser(object):
2b14cb56 2142 out = ''
bf6427d2 2143
2b14cb56 2144 def start(self, tag, attrib):
2145 if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2146 self.out += '\n'
bf6427d2 2147
2b14cb56 2148 def end(self, tag):
2149 pass
bf6427d2 2150
2b14cb56 2151 def data(self, data):
2152 self.out += data
2153
2154 def close(self):
2155 return self.out.strip()
2156
2157 def parse_node(node):
2158 target = TTMLPElementParser()
2159 parser = xml.etree.ElementTree.XMLParser(target=target)
2160 parser.feed(xml.etree.ElementTree.tostring(node))
2161 return parser.close()
bf6427d2 2162
36e6f62c 2163 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
bf6427d2 2164 out = []
5bf28d78 2165 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
1b0427e6
YCH
2166
2167 if not paras:
2168 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2
YCH
2169
2170 for para, index in zip(paras, itertools.count(1)):
d631d5f9 2171 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 2172 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
2173 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2174 if begin_time is None:
2175 continue
7dff0363 2176 if not end_time:
d631d5f9
YCH
2177 if not dur:
2178 continue
2179 end_time = begin_time + dur
bf6427d2
YCH
2180 out.append('%d\n%s --> %s\n%s\n\n' % (
2181 index,
c1c924ab
YCH
2182 srt_subtitles_timecode(begin_time),
2183 srt_subtitles_timecode(end_time),
bf6427d2
YCH
2184 parse_node(para)))
2185
2186 return ''.join(out)
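# Usage sketch with a minimal TTML document (editor's illustration; the
# subtitle text is made up):
#   dfxp2srt('<tt xmlns="http://www.w3.org/ns/ttml"><body><div>'
#            '<p begin="0" end="1">The first line</p></div></body></tt>')
#   -> '1\n00:00:00,000 --> 00:00:01,000\nThe first line\n\n'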
2187
2188
66e289ba
S
2189def cli_option(params, command_option, param):
2190 param = params.get(param)
2191 return [command_option, param] if param is not None else []
2192
2193
2194def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2195 param = params.get(param)
2196 assert isinstance(param, bool)
2197 if separator:
2198 return [command_option + separator + (true_value if param else false_value)]
2199 return [command_option, true_value if param else false_value]
2200
2201
2202def cli_valueless_option(params, command_option, param, expected_value=True):
2203 param = params.get(param)
2204 return [command_option] if param == expected_value else []
2205
2206
2207def cli_configuration_args(params, param, default=[]):
2208 ex_args = params.get(param)
2209 if ex_args is None:
2210 return default
2211 assert isinstance(ex_args, list)
2212 return ex_args
2213
2214
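# Usage sketch (editor's illustration; the params dict is made up), e.g. for
# building an external downloader command line. Note the deliberately swapped
# true/false values for the inverted 'nocheckcertificate' flag:
#   params = {'username': 'joe', 'proxy': None, 'nocheckcertificate': True}
#   cli_option(params, '--user', 'username')   -> ['--user', 'joe']
#   cli_option(params, '--proxy', 'proxy')     -> []
#   cli_bool_option(params, '--check-certificate', 'nocheckcertificate',
#                   'false', 'true', '=')      -> ['--check-certificate=false']
#   cli_valueless_option(params, '--no-check-certificate', 'nocheckcertificate')
#                                              -> ['--no-check-certificate']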
39672624
YCH
2215class ISO639Utils(object):
2216 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2217 _lang_map = {
2218 'aa': 'aar',
2219 'ab': 'abk',
2220 'ae': 'ave',
2221 'af': 'afr',
2222 'ak': 'aka',
2223 'am': 'amh',
2224 'an': 'arg',
2225 'ar': 'ara',
2226 'as': 'asm',
2227 'av': 'ava',
2228 'ay': 'aym',
2229 'az': 'aze',
2230 'ba': 'bak',
2231 'be': 'bel',
2232 'bg': 'bul',
2233 'bh': 'bih',
2234 'bi': 'bis',
2235 'bm': 'bam',
2236 'bn': 'ben',
2237 'bo': 'bod',
2238 'br': 'bre',
2239 'bs': 'bos',
2240 'ca': 'cat',
2241 'ce': 'che',
2242 'ch': 'cha',
2243 'co': 'cos',
2244 'cr': 'cre',
2245 'cs': 'ces',
2246 'cu': 'chu',
2247 'cv': 'chv',
2248 'cy': 'cym',
2249 'da': 'dan',
2250 'de': 'deu',
2251 'dv': 'div',
2252 'dz': 'dzo',
2253 'ee': 'ewe',
2254 'el': 'ell',
2255 'en': 'eng',
2256 'eo': 'epo',
2257 'es': 'spa',
2258 'et': 'est',
2259 'eu': 'eus',
2260 'fa': 'fas',
2261 'ff': 'ful',
2262 'fi': 'fin',
2263 'fj': 'fij',
2264 'fo': 'fao',
2265 'fr': 'fra',
2266 'fy': 'fry',
2267 'ga': 'gle',
2268 'gd': 'gla',
2269 'gl': 'glg',
2270 'gn': 'grn',
2271 'gu': 'guj',
2272 'gv': 'glv',
2273 'ha': 'hau',
2274 'he': 'heb',
2275 'hi': 'hin',
2276 'ho': 'hmo',
2277 'hr': 'hrv',
2278 'ht': 'hat',
2279 'hu': 'hun',
2280 'hy': 'hye',
2281 'hz': 'her',
2282 'ia': 'ina',
2283 'id': 'ind',
2284 'ie': 'ile',
2285 'ig': 'ibo',
2286 'ii': 'iii',
2287 'ik': 'ipk',
2288 'io': 'ido',
2289 'is': 'isl',
2290 'it': 'ita',
2291 'iu': 'iku',
2292 'ja': 'jpn',
2293 'jv': 'jav',
2294 'ka': 'kat',
2295 'kg': 'kon',
2296 'ki': 'kik',
2297 'kj': 'kua',
2298 'kk': 'kaz',
2299 'kl': 'kal',
2300 'km': 'khm',
2301 'kn': 'kan',
2302 'ko': 'kor',
2303 'kr': 'kau',
2304 'ks': 'kas',
2305 'ku': 'kur',
2306 'kv': 'kom',
2307 'kw': 'cor',
2308 'ky': 'kir',
2309 'la': 'lat',
2310 'lb': 'ltz',
2311 'lg': 'lug',
2312 'li': 'lim',
2313 'ln': 'lin',
2314 'lo': 'lao',
2315 'lt': 'lit',
2316 'lu': 'lub',
2317 'lv': 'lav',
2318 'mg': 'mlg',
2319 'mh': 'mah',
2320 'mi': 'mri',
2321 'mk': 'mkd',
2322 'ml': 'mal',
2323 'mn': 'mon',
2324 'mr': 'mar',
2325 'ms': 'msa',
2326 'mt': 'mlt',
2327 'my': 'mya',
2328 'na': 'nau',
2329 'nb': 'nob',
2330 'nd': 'nde',
2331 'ne': 'nep',
2332 'ng': 'ndo',
2333 'nl': 'nld',
2334 'nn': 'nno',
2335 'no': 'nor',
2336 'nr': 'nbl',
2337 'nv': 'nav',
2338 'ny': 'nya',
2339 'oc': 'oci',
2340 'oj': 'oji',
2341 'om': 'orm',
2342 'or': 'ori',
2343 'os': 'oss',
2344 'pa': 'pan',
2345 'pi': 'pli',
2346 'pl': 'pol',
2347 'ps': 'pus',
2348 'pt': 'por',
2349 'qu': 'que',
2350 'rm': 'roh',
2351 'rn': 'run',
2352 'ro': 'ron',
2353 'ru': 'rus',
2354 'rw': 'kin',
2355 'sa': 'san',
2356 'sc': 'srd',
2357 'sd': 'snd',
2358 'se': 'sme',
2359 'sg': 'sag',
2360 'si': 'sin',
2361 'sk': 'slk',
2362 'sl': 'slv',
2363 'sm': 'smo',
2364 'sn': 'sna',
2365 'so': 'som',
2366 'sq': 'sqi',
2367 'sr': 'srp',
2368 'ss': 'ssw',
2369 'st': 'sot',
2370 'su': 'sun',
2371 'sv': 'swe',
2372 'sw': 'swa',
2373 'ta': 'tam',
2374 'te': 'tel',
2375 'tg': 'tgk',
2376 'th': 'tha',
2377 'ti': 'tir',
2378 'tk': 'tuk',
2379 'tl': 'tgl',
2380 'tn': 'tsn',
2381 'to': 'ton',
2382 'tr': 'tur',
2383 'ts': 'tso',
2384 'tt': 'tat',
2385 'tw': 'twi',
2386 'ty': 'tah',
2387 'ug': 'uig',
2388 'uk': 'ukr',
2389 'ur': 'urd',
2390 'uz': 'uzb',
2391 've': 'ven',
2392 'vi': 'vie',
2393 'vo': 'vol',
2394 'wa': 'wln',
2395 'wo': 'wol',
2396 'xh': 'xho',
2397 'yi': 'yid',
2398 'yo': 'yor',
2399 'za': 'zha',
2400 'zh': 'zho',
2401 'zu': 'zul',
2402 }
2403
2404 @classmethod
2405 def short2long(cls, code):
2406 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2407 return cls._lang_map.get(code[:2])
2408
2409 @classmethod
2410 def long2short(cls, code):
2411 """Convert language code from ISO 639-2/T to ISO 639-1"""
2412 for short_name, long_name in cls._lang_map.items():
2413 if long_name == code:
2414 return short_name
2415
2416
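# Usage sketch (editor's illustration):
#   ISO639Utils.short2long('en')   -> 'eng'
#   ISO639Utils.long2short('deu')  -> 'de'
#   ISO639Utils.long2short('xxx')  -> None  (unknown codes fall through)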
4eb10f66
YCH
2417class ISO3166Utils(object):
2418 # From http://data.okfn.org/data/core/country-list
2419 _country_map = {
2420 'AF': 'Afghanistan',
2421 'AX': 'Åland Islands',
2422 'AL': 'Albania',
2423 'DZ': 'Algeria',
2424 'AS': 'American Samoa',
2425 'AD': 'Andorra',
2426 'AO': 'Angola',
2427 'AI': 'Anguilla',
2428 'AQ': 'Antarctica',
2429 'AG': 'Antigua and Barbuda',
2430 'AR': 'Argentina',
2431 'AM': 'Armenia',
2432 'AW': 'Aruba',
2433 'AU': 'Australia',
2434 'AT': 'Austria',
2435 'AZ': 'Azerbaijan',
2436 'BS': 'Bahamas',
2437 'BH': 'Bahrain',
2438 'BD': 'Bangladesh',
2439 'BB': 'Barbados',
2440 'BY': 'Belarus',
2441 'BE': 'Belgium',
2442 'BZ': 'Belize',
2443 'BJ': 'Benin',
2444 'BM': 'Bermuda',
2445 'BT': 'Bhutan',
2446 'BO': 'Bolivia, Plurinational State of',
2447 'BQ': 'Bonaire, Sint Eustatius and Saba',
2448 'BA': 'Bosnia and Herzegovina',
2449 'BW': 'Botswana',
2450 'BV': 'Bouvet Island',
2451 'BR': 'Brazil',
2452 'IO': 'British Indian Ocean Territory',
2453 'BN': 'Brunei Darussalam',
2454 'BG': 'Bulgaria',
2455 'BF': 'Burkina Faso',
2456 'BI': 'Burundi',
2457 'KH': 'Cambodia',
2458 'CM': 'Cameroon',
2459 'CA': 'Canada',
2460 'CV': 'Cape Verde',
2461 'KY': 'Cayman Islands',
2462 'CF': 'Central African Republic',
2463 'TD': 'Chad',
2464 'CL': 'Chile',
2465 'CN': 'China',
2466 'CX': 'Christmas Island',
2467 'CC': 'Cocos (Keeling) Islands',
2468 'CO': 'Colombia',
2469 'KM': 'Comoros',
2470 'CG': 'Congo',
2471 'CD': 'Congo, the Democratic Republic of the',
2472 'CK': 'Cook Islands',
2473 'CR': 'Costa Rica',
2474 'CI': 'Côte d\'Ivoire',
2475 'HR': 'Croatia',
2476 'CU': 'Cuba',
2477 'CW': 'Curaçao',
2478 'CY': 'Cyprus',
2479 'CZ': 'Czech Republic',
2480 'DK': 'Denmark',
2481 'DJ': 'Djibouti',
2482 'DM': 'Dominica',
2483 'DO': 'Dominican Republic',
2484 'EC': 'Ecuador',
2485 'EG': 'Egypt',
2486 'SV': 'El Salvador',
2487 'GQ': 'Equatorial Guinea',
2488 'ER': 'Eritrea',
2489 'EE': 'Estonia',
2490 'ET': 'Ethiopia',
2491 'FK': 'Falkland Islands (Malvinas)',
2492 'FO': 'Faroe Islands',
2493 'FJ': 'Fiji',
2494 'FI': 'Finland',
2495 'FR': 'France',
2496 'GF': 'French Guiana',
2497 'PF': 'French Polynesia',
2498 'TF': 'French Southern Territories',
2499 'GA': 'Gabon',
2500 'GM': 'Gambia',
2501 'GE': 'Georgia',
2502 'DE': 'Germany',
2503 'GH': 'Ghana',
2504 'GI': 'Gibraltar',
2505 'GR': 'Greece',
2506 'GL': 'Greenland',
2507 'GD': 'Grenada',
2508 'GP': 'Guadeloupe',
2509 'GU': 'Guam',
2510 'GT': 'Guatemala',
2511 'GG': 'Guernsey',
2512 'GN': 'Guinea',
2513 'GW': 'Guinea-Bissau',
2514 'GY': 'Guyana',
2515 'HT': 'Haiti',
2516 'HM': 'Heard Island and McDonald Islands',
2517 'VA': 'Holy See (Vatican City State)',
2518 'HN': 'Honduras',
2519 'HK': 'Hong Kong',
2520 'HU': 'Hungary',
2521 'IS': 'Iceland',
2522 'IN': 'India',
2523 'ID': 'Indonesia',
2524 'IR': 'Iran, Islamic Republic of',
2525 'IQ': 'Iraq',
2526 'IE': 'Ireland',
2527 'IM': 'Isle of Man',
2528 'IL': 'Israel',
2529 'IT': 'Italy',
2530 'JM': 'Jamaica',
2531 'JP': 'Japan',
2532 'JE': 'Jersey',
2533 'JO': 'Jordan',
2534 'KZ': 'Kazakhstan',
2535 'KE': 'Kenya',
2536 'KI': 'Kiribati',
2537 'KP': 'Korea, Democratic People\'s Republic of',
2538 'KR': 'Korea, Republic of',
2539 'KW': 'Kuwait',
2540 'KG': 'Kyrgyzstan',
2541 'LA': 'Lao People\'s Democratic Republic',
2542 'LV': 'Latvia',
2543 'LB': 'Lebanon',
2544 'LS': 'Lesotho',
2545 'LR': 'Liberia',
2546 'LY': 'Libya',
2547 'LI': 'Liechtenstein',
2548 'LT': 'Lithuania',
2549 'LU': 'Luxembourg',
2550 'MO': 'Macao',
2551 'MK': 'Macedonia, the Former Yugoslav Republic of',
2552 'MG': 'Madagascar',
2553 'MW': 'Malawi',
2554 'MY': 'Malaysia',
2555 'MV': 'Maldives',
2556 'ML': 'Mali',
2557 'MT': 'Malta',
2558 'MH': 'Marshall Islands',
2559 'MQ': 'Martinique',
2560 'MR': 'Mauritania',
2561 'MU': 'Mauritius',
2562 'YT': 'Mayotte',
2563 'MX': 'Mexico',
2564 'FM': 'Micronesia, Federated States of',
2565 'MD': 'Moldova, Republic of',
2566 'MC': 'Monaco',
2567 'MN': 'Mongolia',
2568 'ME': 'Montenegro',
2569 'MS': 'Montserrat',
2570 'MA': 'Morocco',
2571 'MZ': 'Mozambique',
2572 'MM': 'Myanmar',
2573 'NA': 'Namibia',
2574 'NR': 'Nauru',
2575 'NP': 'Nepal',
2576 'NL': 'Netherlands',
2577 'NC': 'New Caledonia',
2578 'NZ': 'New Zealand',
2579 'NI': 'Nicaragua',
2580 'NE': 'Niger',
2581 'NG': 'Nigeria',
2582 'NU': 'Niue',
2583 'NF': 'Norfolk Island',
2584 'MP': 'Northern Mariana Islands',
2585 'NO': 'Norway',
2586 'OM': 'Oman',
2587 'PK': 'Pakistan',
2588 'PW': 'Palau',
2589 'PS': 'Palestine, State of',
2590 'PA': 'Panama',
2591 'PG': 'Papua New Guinea',
2592 'PY': 'Paraguay',
2593 'PE': 'Peru',
2594 'PH': 'Philippines',
2595 'PN': 'Pitcairn',
2596 'PL': 'Poland',
2597 'PT': 'Portugal',
2598 'PR': 'Puerto Rico',
2599 'QA': 'Qatar',
2600 'RE': 'Réunion',
2601 'RO': 'Romania',
2602 'RU': 'Russian Federation',
2603 'RW': 'Rwanda',
2604 'BL': 'Saint Barthélemy',
2605 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2606 'KN': 'Saint Kitts and Nevis',
2607 'LC': 'Saint Lucia',
2608 'MF': 'Saint Martin (French part)',
2609 'PM': 'Saint Pierre and Miquelon',
2610 'VC': 'Saint Vincent and the Grenadines',
2611 'WS': 'Samoa',
2612 'SM': 'San Marino',
2613 'ST': 'Sao Tome and Principe',
2614 'SA': 'Saudi Arabia',
2615 'SN': 'Senegal',
2616 'RS': 'Serbia',
2617 'SC': 'Seychelles',
2618 'SL': 'Sierra Leone',
2619 'SG': 'Singapore',
2620 'SX': 'Sint Maarten (Dutch part)',
2621 'SK': 'Slovakia',
2622 'SI': 'Slovenia',
2623 'SB': 'Solomon Islands',
2624 'SO': 'Somalia',
2625 'ZA': 'South Africa',
2626 'GS': 'South Georgia and the South Sandwich Islands',
2627 'SS': 'South Sudan',
2628 'ES': 'Spain',
2629 'LK': 'Sri Lanka',
2630 'SD': 'Sudan',
2631 'SR': 'Suriname',
2632 'SJ': 'Svalbard and Jan Mayen',
2633 'SZ': 'Swaziland',
2634 'SE': 'Sweden',
2635 'CH': 'Switzerland',
2636 'SY': 'Syrian Arab Republic',
2637 'TW': 'Taiwan, Province of China',
2638 'TJ': 'Tajikistan',
2639 'TZ': 'Tanzania, United Republic of',
2640 'TH': 'Thailand',
2641 'TL': 'Timor-Leste',
2642 'TG': 'Togo',
2643 'TK': 'Tokelau',
2644 'TO': 'Tonga',
2645 'TT': 'Trinidad and Tobago',
2646 'TN': 'Tunisia',
2647 'TR': 'Turkey',
2648 'TM': 'Turkmenistan',
2649 'TC': 'Turks and Caicos Islands',
2650 'TV': 'Tuvalu',
2651 'UG': 'Uganda',
2652 'UA': 'Ukraine',
2653 'AE': 'United Arab Emirates',
2654 'GB': 'United Kingdom',
2655 'US': 'United States',
2656 'UM': 'United States Minor Outlying Islands',
2657 'UY': 'Uruguay',
2658 'UZ': 'Uzbekistan',
2659 'VU': 'Vanuatu',
2660 'VE': 'Venezuela, Bolivarian Republic of',
2661 'VN': 'Viet Nam',
2662 'VG': 'Virgin Islands, British',
2663 'VI': 'Virgin Islands, U.S.',
2664 'WF': 'Wallis and Futuna',
2665 'EH': 'Western Sahara',
2666 'YE': 'Yemen',
2667 'ZM': 'Zambia',
2668 'ZW': 'Zimbabwe',
2669 }
2670
2671 @classmethod
2672 def short2full(cls, code):
2673         """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name"""
2674 return cls._country_map.get(code.upper())
2675
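# Usage sketch (editor's illustration):
#   ISO3166Utils.short2full('DE')  -> 'Germany'
#   ISO3166Utils.short2full('nl')  -> 'Netherlands'  (input is upper-cased)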
2676
91410c9b 2677class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2461f79d
PH
2678 def __init__(self, proxies=None):
2679 # Set default handlers
2680 for type in ('http', 'https'):
2681 setattr(self, '%s_open' % type,
2682 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
2683 meth(r, proxy, type))
2684 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2685
91410c9b 2686 def proxy_open(self, req, proxy, type):
2461f79d 2687 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
2688 if req_proxy is not None:
2689 proxy = req_proxy
2461f79d
PH
2690 del req.headers['Ytdl-request-proxy']
2691
2692 if proxy == '__noproxy__':
2693 return None # No Proxy
91410c9b
PH
2694 return compat_urllib_request.ProxyHandler.proxy_open(
2695 self, req, proxy, type)
5bc880b9
YCH
2696
2697
2698def ohdave_rsa_encrypt(data, exponent, modulus):
2699 '''
2700 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
2701
2702 Input:
2703 data: data to encrypt, bytes-like object
2704 exponent, modulus: parameter e and N of RSA algorithm, both integer
2705 Output: hex string of encrypted data
2706
2707 Limitation: supports one block encryption only
2708 '''
2709
2710 payload = int(binascii.hexlify(data[::-1]), 16)
2711 encrypted = pow(payload, exponent, modulus)
2712 return '%x' % encrypted
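# Usage sketch with deliberately tiny, insecure toy parameters (n = 33, e = 3),
# just to show the plumbing; real callers pass the site-provided exponent and
# modulus:
#   ohdave_rsa_encrypt(b'\x02', 3, 33)  -> '8'   (2 ** 3 mod 33, as hex)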
81bdc8fd
YCH
2713
2714
5eb6bdce 2715def encode_base_n(num, n, table=None):
59f898b7 2716 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
59f898b7
YCH
2717 if not table:
2718 table = FULL_TABLE[:n]
2719
5eb6bdce
YCH
2720 if n > len(table):
2721 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
2722
2723 if num == 0:
2724 return table[0]
2725
81bdc8fd
YCH
2726 ret = ''
2727 while num:
2728 ret = table[num % n] + ret
2729 num = num // n
2730 return ret
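# Usage sketch (editor's illustration):
#   encode_base_n(255, 16)  -> 'ff'
#   encode_base_n(0, 30)    -> '0'
# With the default table, bases up to 62 are supported (digits, lowercase,
# then uppercase letters).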
f52354a8
YCH
2731
2732
2733def decode_packed_codes(code):
2734 mobj = re.search(
680079be 2735 r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
f52354a8
YCH
2736 code)
2737 obfuscated_code, base, count, symbols = mobj.groups()
2738 base = int(base)
2739 count = int(count)
2740 symbols = symbols.split('|')
2741 symbol_table = {}
2742
2743 while count:
2744 count -= 1
5eb6bdce 2745 base_n_count = encode_base_n(count, base)
f52354a8
YCH
2746 symbol_table[base_n_count] = symbols[count] or base_n_count
2747
2748 return re.sub(
2749 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
2750         obfuscated_code)
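# Usage sketch (editor's illustration): unpacks "p,a,c,k,e,d"-style packed
# JavaScript. A minimal, hand-made packed snippet (real-world input is a full
# eval(function(...)) blob):
#   decode_packed_codes(
#       "eval(function(p,a,c,k,e,d){}('0(1)',10,2,'alert|hi'.split('|'),0,{}))")
#   -> 'alert(hi)'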