]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[extractor/generic] Fix missing byte literal prefix
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
1e399778 6import base64
5bc880b9 7import binascii
912b38b4 8import calendar
676eb3f2 9import codecs
62e609ab 10import contextlib
e3946f98 11import ctypes
c496ca96
PH
12import datetime
13import email.utils
f45c185f 14import errno
be4a824d 15import functools
d77c3dfd 16import gzip
b7ab0590 17import itertools
03f9daab 18import io
f4bfd65f 19import json
d77c3dfd 20import locale
02dbf93f 21import math
347de493 22import operator
d77c3dfd 23import os
4eb7f1d1 24import pipes
c496ca96 25import platform
d77c3dfd 26import re
13ebea79 27import ssl
c496ca96 28import socket
b53466e1 29import struct
1c088fa8 30import subprocess
d77c3dfd 31import sys
181c8655 32import tempfile
01951dda 33import traceback
bcf89ce6 34import xml.etree.ElementTree
d77c3dfd 35import zlib
d77c3dfd 36
8c25f81b 37from .compat import (
8bb56eee 38 compat_HTMLParser,
8f9312c3 39 compat_basestring,
8c25f81b 40 compat_chr,
36e6f62c 41 compat_etree_fromstring,
8c25f81b 42 compat_html_entities,
be4a824d 43 compat_http_client,
c86b6142 44 compat_kwargs,
8c25f81b 45 compat_parse_qs,
be4a824d 46 compat_socket_create_connection,
8c25f81b
PH
47 compat_str,
48 compat_urllib_error,
49 compat_urllib_parse,
50 compat_urllib_parse_urlparse,
51 compat_urllib_request,
52 compat_urlparse,
810c10ba 53 compat_xpath,
7d4111ed 54 shlex_quote,
8c25f81b 55)
4644ac55
S
56
57
468e2e92
FV
# The type of a compiled regular expression has no stable public name across
# the supported Python versions, so it is derived from a throwaway pattern.
compiled_regex_type = type(re.compile(''))

# Default HTTP headers. YoutubeDLHandler.http_request() adds each of them to
# an outgoing request unless the request already carries that header.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/44.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# Unique sentinel meaning "no default was supplied"; it lets None be passed
# as a legitimate default value (see the xpath_* helpers).
NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

# File extensions (without the leading dot), grouped by container family
KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')
90
7105440c 91
def preferredencoding():
    """Get preferred encoding.

    Returns the encoding reported by locale.getpreferredencoding() when it
    can actually be used to encode text, and 'UTF-8' in every other case.
    """
    try:
        encoding = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable name raises here
        'TEST'.encode(encoding)
    except Exception:
        return 'UTF-8'
    return encoding
d77c3dfd 105
f4bfd65f 106
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is first written to a temporary file created next to fn, which
    is then renamed over fn. If anything fails, the temporary file is removed
    (best effort) and the exception is re-raised.
    """

    fn = encodeFilename(fn)
    tmp_prefix = os.path.basename(fn)
    tmp_dir = os.path.dirname(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        # os.path.basename/dirname return bytes objects here, but
        # NamedTemporaryFile will fail if the filename contains non ascii
        # characters unless we use unicode objects
        fs_encoding = get_filesystem_encoding()
        tmp_prefix = tmp_prefix.decode(fs_encoding)
        tmp_dir = tmp_dir.decode(fs_encoding)

    tmp_options = {
        'suffix': '.tmp',
        'prefix': tmp_prefix + '.',
        'dir': tmp_dir,
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        tmp_options['mode'] = 'wb'
    else:
        tmp_options['mode'] = 'w'
        tmp_options['encoding'] = 'utf-8'

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(tmp_options))

    try:
        # The with block closes the temporary file before it is renamed
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Do not leave the temporary file behind; the original error wins
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
159
160
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val]

        Returns the first element matching xpath that has the attribute key
        (with value val, if val is given), or None if there is none.
        """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        if val is None:
            return node.find(xpath + '[@%s]' % key)
        wanted = '%s' % (val,)
        if "'" in wanted:
            # A single quote cannot be escaped inside a quoted ElementPath
            # predicate (the expression would be rejected as invalid), so
            # filter the candidate elements by hand instead.
            for el in node.findall(xpath):
                if el.attrib.get(key) == wanted:
                    return el
            return None
        return node.find(xpath + "[@%s='%s']" % (key, wanted))
else:
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] without attribute predicates

        Returns the first element matching xpath that has the attribute key
        (with value val, if val is given), or None if there is none.
        """
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None
175
d7e66d39
JMF
176# On python2.6 the xml.etree.ElementTree.Element methods don't support
177# the namespace parameter
5f6a1245
JW
178
179
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand namespace prefixes in a '/'-separated path.

    Every step of the form 'prefix:tag' becomes '{uri}tag', where uri is
    ns_map[prefix]; steps without a colon are kept unchanged.
    """
    def _expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(_expand(step) for step in path.split('/'))
190
d77c3dfd 191
def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the first element under node matching xpath.

    xpath is either a single expression or an iterable of expressions that
    are tried in order until one matches.

    If nothing matches: default is returned when one was supplied; otherwise
    ExtractorError is raised when fatal is true (the message uses name, or
    xpath when name is None); otherwise None is returned.
    """
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    # Start from "not found" so that an empty iterable of expressions is
    # handled like a failed lookup instead of leaving n unbound.
    n = None
    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n
213
214
def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    """Return the text of the element found by xpath_element().

    Whatever xpath_element() yields for a failed lookup (None or default) is
    passed through. If the element exists but has no text: default is
    returned when one was supplied; otherwise ExtractorError is raised when
    fatal is true; otherwise None is returned.
    """
    element = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if element is None or element == default:
        return element
    text = element.text
    if text is not None:
        return text
    if default is not NO_DEFAULT:
        return default
    if fatal:
        label = xpath if name is None else name
        raise ExtractorError('Could not find XML element\'s text %s' % label)
    return None
a41fb80c
S
228
229
def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    """Return attribute key of the first element matching xpath that has it.

    If there is no such element: default is returned when one was supplied;
    otherwise ExtractorError is raised when fatal is true (the message uses
    name, or 'xpath[@key]' when name is None); otherwise None is returned.
    """
    element = find_xpath_attr(node, xpath, key)
    if element is not None:
        return element.attrib[key]
    if default is not NO_DEFAULT:
        return default
    if fatal:
        label = '%s[@%s]' % (xpath, key) if name is None else name
        raise ExtractorError('Could not find XML attribute %s' % label)
    return None
bf0ff932
PH
241
242
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document

    Shorthand for get_element_by_attribute() on the 'id' attribute.
    """
    return get_element_by_attribute(attribute='id', value=id, html=html)
43e8fafd 246
12ea2f30 247
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document

    The first tag carrying attribute=value (quoted or unquoted) is located
    with a regular expression; its inner content is returned with HTML
    entities unescaped, or None if no such tag is found.
    """
    # attribute and value are escaped so they are matched literally
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    found = re.search(pattern, html)
    if found is None:
        return None

    content = found.group('content')
    # Content starting with a quote character loses its first and last character
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 269
c5229f39 270
8bb56eee
BF
class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element

    After feeding markup, attrs holds the attributes of the most recently
    seen start tag as a dict (empty if no start tag was seen).
    """

    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        # attrs arrives as (name, value) pairs; each start tag replaces the
        # previously stored attributes
        self.attrs = dict(attrs)
279
c5229f39 280
8bb56eee
BF
def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&#98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    attr_parser = HTMLAttributeParser()
    attr_parser.feed(html_element)
    attr_parser.close()
    return attr_parser.attrs
9e6dd238 301
c5229f39 302
def clean_html(html):
    """Clean an HTML snippet into a readable string

    Literal newlines become spaces, <br> tags and </p><p> boundaries become
    newlines, all remaining tags are dropped, HTML entities are unescaped
    and surrounding whitespace is stripped. None is passed through.
    """

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    text = html.replace('\n', ' ')
    substitutions = (
        # Newline vs <br />
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),
        # Strip html tags
        ('<.*?>', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
318
319
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it retries once with
    the name processed by sanitize_path(); if that changes nothing, or the
    failure was a permission error, the exception is raised like the
    standard open() function would.

    The filename '-' stands for standard output (its binary buffer when
    there is one).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (getattr(sys.stdout, 'buffer', sys.stdout), filename)
        return (open(encodeFilename(filename), open_mode), filename)
    except (IOError, OSError) as err:
        # Renaming cannot help with a permission problem
        if err.errno == errno.EACCES:
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller
        return (open(encodeFilename(alt_filename), open_mode), alt_filename)
d77c3dfd
FV
350
351
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 359
5f6a1245 360
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # '?' and control characters are dropped
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        # Restricted mode also rejects shell-sensitive punctuation,
        # whitespace and everything outside ASCII
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(char) for char in s)
    if is_id:
        return result

    # Collapse runs of underscores and trim them from both ends
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    # A leading dash is replaced by an underscore
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
    return result if result else '_'
d77c3dfd 397
5f6a1245 398
a2aaf4db
S
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On other platforms the path is returned unchanged. On Windows the path
    is normalized and, in every component except '.' and '..', characters
    from the set /<>:"|?*, backslashes and a trailing dot or whitespace
    character are replaced with '#'. A drive or UNC prefix is preserved.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the empty component left in front of the first separator
        norm_path.pop(0)
    # Raw string: the pattern contains regex escapes that are not valid
    # string escapes in an ordinary literal
    forbidden = r'(?:[/<>:"\|\\?\*]|[\s.]$)'
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(forbidden, '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
415
416
67dda517
S
417# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
418# unwanted failures due to missing protocol
def sanitized_Request(url, *args, **kwargs):
    """Build a urllib Request, prefixing scheme-relative ('//host/...') URLs with 'http:'."""
    if url.startswith('//'):
        url = 'http:%s' % url
    return compat_urllib_request.Request(url, *args, **kwargs)
422
423
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list holding the first occurrence of every element, in the
    original order. Elements are compared by equality, so they do not need
    to be hashable.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 431
912b38b4 432
4e408e47
PH
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities and numeric
    references ('#123', '#x7b') are decoded; anything else, including a
    numeric reference that does not denote a valid character, is returned
    in its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        # Out-of-range code points raise ValueError; numbers too large for
        # the character conversion raise OverflowError instead.
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity
4e408e47
PH
455
456
def unescapeHTML(s):
    """Replace every '&...;' HTML entity in s with the character it denotes.

    None is passed through; any other input must be exactly of the text
    type (compat_str).
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def _decode_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', _decode_entity, s)
d77c3dfd 464
8bf48f23 465
aa49acd1
S
def get_subprocess_encoding():
    """Return the encoding to use for file names and subprocess arguments.

    Falls back to 'utf-8' when the platform reports no encoding.
    """
    on_modern_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if on_modern_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
476
477
def encodeFilename(s, for_subprocess=False):
    """
    Return the file name s in the form expected by the platform's file APIs.

    @param s The name of the file
    @param for_subprocess Whether the name will be passed to a subprocess
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    unicode_windows_api = (
        not for_subprocess and
        sys.platform == 'win32' and
        sys.getwindowsversion()[0] >= 5)

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if unicode_windows_api or sys.platform.startswith('java'):
        return s

    # Characters the target encoding cannot represent are dropped
    return s.encode(get_subprocess_encoding(), 'ignore')
500
501
def decodeFilename(b, for_subprocess=False):
    """Inverse of encodeFilename(): turn a byte-string file name into text.

    On Python 3, and for anything that is not a bytes object, the value is
    returned unchanged. Undecodable bytes are dropped.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 511
f07b74fc
PH
512
def encodeArgument(s):
    """Encode a command-line argument for a subprocess call (see encodeFilename)."""
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, for_subprocess=True)
520
521
aa49acd1
S
def decodeArgument(b):
    """Decode a subprocess argument; counterpart of encodeArgument()."""
    return decodeFilename(b, for_subprocess=True)
524
525
8271226a
PH
def decodeOption(optval):
    """Return an option value as text.

    None is passed through; byte strings are decoded with the preferred
    encoding; any other value must already be text.
    """
    if optval is None:
        return optval
    decoded = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 534
5f6a1245 535
4539dd30
PH
def formatSeconds(secs):
    """Format a duration given in seconds.

    Returns 'H:MM:SS' from one hour upwards, 'M:SS' from one minute upwards
    and the plain number of seconds below that. The boundaries are
    inclusive, so exactly one hour is '1:00:00' and exactly one minute is
    '1:00'.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
543
a0ddb8a2 544
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler configured from params.

    Certificate verification is disabled when params['nocheckcertificate']
    is true. Extra keyword arguments are forwarded to the handler.
    """
    skip_verification = params.get('nocheckcertificate', False)

    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 568
732ea2f0 569
08f2a92c
JMF
def bug_reports_message():
    """Return the standard "please report this issue" suffix for error messages."""
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return ''.join((
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ))
579
580
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is prepended to it.
        """

        # Network errors and unavailable videos being handled right now are
        # never treated as bugs
        active_exc_type = sys.exc_info()[0]
        if active_exc_type in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True

        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None if there is none."""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
01951dda 608
1c256f70 609
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected extraction error for a URL that is not supported; url keeps the offending URL."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
615
616
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
620
621
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
634
635
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
643
644
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # The message is exposed as the msg attribute
        self.msg = msg
d77c3dfd 654
5f6a1245 655
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
659
660
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
d77c3dfd
FV
668
669
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both in bytes
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 682
5f6a1245 683
c5a59d93 684def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
e5e78797
S
685 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
686 # expected HTTP responses to meet HTTP/1.0 or later (see also
687 # https://github.com/rg3/youtube-dl/issues/6727)
688 if sys.version_info < (3, 0):
5a1a2e94 689 kwargs[b'strict'] = True
be4a824d
PH
690 hc = http_class(*args, **kwargs)
691 source_address = ydl_handler._params.get('source_address')
692 if source_address is not None:
693 sa = (source_address, 0)
694 if hasattr(hc, 'source_address'): # Python 2.7+
695 hc.source_address = sa
696 else: # Python 2.6
697 def _hc_connect(self, *args, **kwargs):
698 sock = compat_socket_create_connection(
699 (self.host, self.port), self.timeout, sa)
700 if is_https:
d7932313
PH
701 self.sock = ssl.wrap_socket(
702 sock, self.key_file, self.cert_file,
703 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
704 else:
705 self.sock = sock
706 hc.connect = functools.partial(_hc_connect, hc)
707
708 return hc
709
710
def handle_youtubedl_headers(headers):
    """Apply the internal 'Youtubedl-no-compression' pseudo-header.

    When that header is present, a new dict is returned without it and
    without any Accept-Encoding header (matched case-insensitively).
    Otherwise the given mapping is returned as is.
    """
    if 'Youtubedl-no-compression' not in headers:
        return headers

    return dict(
        (name, value) for name, value in headers.items()
        if name.lower() != 'accept-encoding' and name != 'Youtubedl-no-compression')
87f0e62d
YCH
719
720
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        connection_factory = functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False)
        return self.do_open(connection_factory, req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a zlib-wrapped one second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying code, whether or not the constructor accepts it."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            if req.get_method() == 'HEAD':
                req_type = HEADRequest
            else:
                req_type = compat_urllib_request.Request
            new_req = req_type(
                url_escaped, data=req.data, headers=req.headers,
                origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
            new_req.timeout = req.timeout
            req = new_req

        for header, value in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if header.capitalize() not in req.headers:
                req.add_header(header, value)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off; if none of the
                # attempts works, report the original failure
                for trim in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-trim]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    resp.headers['Location'] = location_escaped
        return resp

    # The same processing applies to HTTPS traffic
    https_request = http_request
    https_response = http_response
bf50b038 844
5de90176 845
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that opens its connections through _create_http_connection().

    https_conn_class selects the connection class and defaults to
    compat_http_client.HTTPSConnection.
    """

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # Forward only the settings the base handler actually stored:
        # _context exists on python > 2.6, _check_hostname on python 3.x
        extra = dict(
            (option, getattr(self, '_' + option))
            for option in ('context', 'check_hostname')
            if hasattr(self, '_' + option))
        connection_factory = functools.partial(
            _create_http_connection, self, self._https_conn_class, True)
        return self.do_open(connection_factory, req, **extra)
be4a824d
PH
861
862
a6420bf5
S
class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    """Cookie processor whose request/response hooks are also used for HTTPS."""

    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # (The workaround below is currently disabled.)
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        base = compat_urllib_request.HTTPCookieProcessor
        return base.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response
885
886
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    date_str has the form YYYY-MM-DD<delimiter>HH:MM:SS, optionally followed
    by fractional seconds (ignored) and by 'Z' or a +HH:MM / +HHMM offset.
    timezone, if given, is a datetime.timedelta used as the UTC offset; the
    offset is then not parsed from the string. Returns None when date_str is
    None or cannot be parsed.
    """

    if date_str is None:
        return None

    # Fractional seconds are dropped
    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        # UTC unless the string ends with an explicit offset
        timezone = datetime.timedelta()
        tz_match = re.search(
            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if tz_match:
            date_str = date_str[:-len(tz_match.group(0))]
            if tz_match.group('sign'):
                direction = 1 if tz_match.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=direction * int(tz_match.group('hours')),
                    minutes=direction * int(tz_match.group('minutes')))

    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    try:
        utc_time = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(utc_time.timetuple())
    except ValueError:
        return None
912b38b4
PH
916
917
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    day_first selects the day-before-month formats for purely numeric
    dates; otherwise the month-before-day formats are tried. Returns None
    when date_str is None or no known format matches.
    """

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M',
        '%b %dnd %Y %I:%M',
        '%b %dth %Y %I:%M',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions.extend([
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    # Every format is tried; a later matching format overrides an earlier one
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        # Fall back to the RFC 2822 parser
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                # parsedate_tz does not range-check its fields, so an
                # impossible date (e.g. 31 Feb) only fails here
                pass
    if upload_date is not None:
        return compat_str(upload_date)
bf50b038 982
5f6a1245 983
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from a URL, falling back to default_ext

    The text after the last dot of the part before '?' is used when it is
    purely alphanumeric, or when it is a known extension followed by
    slashes.
    """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    trimmed = candidate.rstrip('/')
    if trimmed in KNOWN_EXTENSIONS:
        return trimmed
    return default_ext
73e79f2a 995
5f6a1245 996
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the part after the last dot of filename with '<sub_lang>.<sub_format>'"""
    stem = filename.rsplit('.', 1)[0]
    return stem + '.' + sub_lang + '.' + sub_format
d4051a8e 999
5f6a1245 1000
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    'now' and 'today' give today's date and 'yesterday' the day before.
    Raises ValueError (from strptime) for any other unparsable string."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        # timedelta has no month/year units, so they become 30/365 days
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()
5f6a1245
JW
1028
1029
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format

    Strings that are not exactly eight digits are returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
1038
5f6a1245 1039
bd558525
JMF
class DateRange(object):
    """Represents an inclusive time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by
        date_from_str; a side left as None is unbounded"""
        self.start = (
            datetime.datetime.min.date() if start is None
            else date_from_str(start))
        self.end = (
            datetime.datetime.max.date() if end is None
            else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed as a date string
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
1069
1070
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # A byte string result is decoded with the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
1079
1080
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the WriteConsoleW function of kernel32, used only
    when out has file descriptor 1 or 2 and its handle is a console.
    Raises OSError if WriteConsoleW reports failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> value passed to GetStdHandle
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    # Receives the count that WriteConsoleW reports as written
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # True for an invalid handle, a handle whose file type (ignoring the
        # remote flag) is not a character device, or one for which
        # GetConsoleMode fails
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters per call, stopping before the next
        # non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # written on its own with a length of 2
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True
1154
1155
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr by default) and flush it

    When encoding is None, the stream's own encoding or the preferred
    encoding is used wherever bytes have to be produced.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    try_windows_console = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if try_windows_console and _windows_write_string(s, out):
        return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        # Text stream over a binary buffer: encode here and bypass the text layer
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1176
1177
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the items of a byte string as a list of integers"""
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    return list(map(ord, bs))
1185
c257baff 1186
def intlist_to_bytes(xs):
    """Pack a list of integers into a byte string, one unsigned byte each"""
    return struct_pack('%dB' % len(xs), *xs) if xs else b''
c38b1e77
PH
1191
1192
c1c9a79c
PH
# Cross-platform file locking
# Each branch defines _lock_file(f, exclusive) and _unlock_file(f)
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high parts of the byte count passed to LockFileEx/UnlockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so _unlock_file can pass the same structure
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags is 0x2 for an exclusive lock and 0x0 otherwise
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        # Without fcntl both operations fail with IOError
        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
1266
1267
class locked_file(object):
    """File object wrapper that holds a file lock while used in a with block

    The file is opened in the constructor, locked on entry and unlocked
    and closed on exit.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        # Only read mode asks for a non-exclusive lock
        try:
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leave the file open when locking fails
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        # Close the file even when unlocking raises
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1297
1298
4644ac55
S
def get_filesystem_encoding():
    """Return the filesystem encoding reported by sys, or 'utf-8' if it is None"""
    encoding = sys.getfilesystemencoding()
    if encoding is None:
        return 'utf-8'
    return encoding
1302
1303
def shell_quote(args):
    """Return args joined into one shell-quoted command line string

    Byte string arguments are decoded with the filesystem encoding before
    quoting.
    """
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        # shlex_quote is the helper this module already imports from
        # .compat; pipes.quote lives in a module that newer Python
        # versions no longer ship
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1313
1314
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # The data travels JSON-encoded in the fragment under a fixed key
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    return url + '#' + compat_urllib_parse.urlencode(payload)
9d4660ca
PH
1321
1322
def unsmuggle_url(smug_url, default=None):
    """Split a URL made by smuggle_url into (url, data)

    URLs without smuggled data are returned as (smug_url, default).
    """
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    pieces = smug_url.rpartition('#')
    url, sdata = pieces[0], pieces[2]
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    return url, json.loads(jsond)
02dbf93f
PH
1330
1331
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count with a binary unit suffix, e.g. '1.50KiB'

    Returns 'N/A' for None. A str argument is converted with float().
    """
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
        # Keep the index inside the suffix list: values below 1/1024 would
        # otherwise index from the end, and values of 1024**9 or more
        # would raise IndexError
        exponent = max(0, min(exponent, len(suffixes) - 1))
    suffix = suffixes[exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
f53c966a 1344
1c088fa8 1345
fb47597b
S
def lookup_unit_table(unit_table, s):
    """Parse '<number><unit>' using the unit -> multiplier mapping unit_table

    The number may use ',' or '.' as decimal separator. Returns the scaled
    value as an int, or None if s does not match.
    """
    alternatives = '|'.join(map(re.escape, unit_table))
    found = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)$' % alternatives, s)
    if found is None:
        return None
    number = float(found.group('num').replace(',', '.'))
    return int(number * unit_table[found.group('unit')])
1355
1356
be64b5b0
PH
def parse_filesize(s):
    """Parse a size such as '5 MiB' or '1.2Tb' into a number of bytes

    Returns None for None input or an unrecognized string.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        # 'XiB' and 'xB' are binary multiples, 'XB' and 'Xb' decimal ones
        _UNIT_TABLE[prefix + 'iB'] = 1024 ** power
        _UNIT_TABLE[prefix + 'B'] = 1000 ** power
        _UNIT_TABLE[prefix.lower() + 'B'] = 1024 ** power
        _UNIT_TABLE[prefix + 'b'] = 1000 ** power

    return lookup_unit_table(_UNIT_TABLE, s)
1401
1402
def parse_count(s):
    """Parse a count such as '1,000' or '1.5M' into an int

    Returns None for None input or an unrecognized string.
    """
    if s is None:
        return None

    s = s.strip()

    # Only digits and separators: no unit suffix to resolve
    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    thousand = 1000
    million = 1000 ** 2
    _UNIT_TABLE = dict(
        k=thousand, K=thousand,
        m=million, M=million,
        kk=million, KK=million)

    return lookup_unit_table(_UNIT_TABLE, s)
be64b5b0 1422
2f7ae819 1423
caefb1de
PH
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name

    Returns None if name is not in ENGLISH_MONTH_NAMES """

    try:
        position = ENGLISH_MONTH_NAMES.index(name)
    except ValueError:
        return None
    return position + 1
1431
1432
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations

    An abbreviation is the first three letters of the month name; None is
    returned when abbrev matches none of them """

    abbreviations = [month[:3] for month in ENGLISH_MONTH_NAMES]
    try:
        position = abbreviations.index(abbrev)
    except ValueError:
        return None
    return position + 1
18258362
JMF
1441
1442
def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML

    Ampersands that already start one of the five predefined entities or a
    numeric character reference are left alone.
    """
    # Numeric references may run up to 6 hex / 7 decimal digits
    # (U+10FFFF == 1114111); a 4-digit limit escaped valid references
    # such as '&#x1F600;'
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,6};|#[0-9]{,7};)',
        '&amp;',
        xml_str)
e3946f98
PH
1449
1450
def setproctitle(title):
    """Try to set the process title via prctl of libc.so.6

    Silently does nothing on Jython, when the library cannot be loaded or
    when it has no prctl.
    """
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes allocates one extra byte, so
    # the string handed to prctl is NUL-terminated; a buffer of exactly
    # len(title_bytes) has no terminator
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1470
1471
def remove_start(s, start):
    """Return s without the prefix start, or s unchanged if it lacks it"""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1476
1477
2b9faf55
PH
def remove_end(s, end):
    """Return s without the suffix end, or s unchanged if it lacks it"""
    # An empty suffix must be skipped: s[:-0] is s[:0], i.e. empty
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1482
1483
31b2051e
S
def remove_quotes(s):
    """Strip one pair of matching single or double quotes around s

    None, strings shorter than two characters and strings without matching
    outer quotes are returned unchanged.
    """
    if s is None or len(s) < 2:
        return s
    first, last = s[0], s[-1]
    if first == last and first in ('"', "'"):
        return s[1:-1]
    return s
1491
1492
def url_basename(url):
    """Return the last '/'-separated segment of the URL path"""
    segments = compat_urlparse.urlparse(url).path.strip('/').split('/')
    return segments[-1]
aa94a6d3
PH
1496
1497
class HEADRequest(compat_urllib_request.Request):
    """Request whose method is always HEAD"""

    def get_method(self):
        return 'HEAD'
7217e148
PH
1501
1502
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to int, returning default when that is not possible

    If get_attr is given, the attribute of that name is read from v first.
    The result is int(v) * invscale // scale. None, the empty string and
    values int() rejects all give default.
    """
    if get_attr:
        if v is not None:
            v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    try:
        return int(v) * invscale // scale
    except (ValueError, TypeError):
        # int() raises TypeError, not ValueError, for objects such as
        # lists or dicts
        return default
9732d77e 1515
9572013d 1516
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted to compat_str, or default if v is None"""
    if v is None:
        return default
    return compat_str(v)
1519
9732d77e
PH
1520
def str_to_int(int_str):
    """ A more relaxed version of int_or_none

    ',', '.' and '+' characters are dropped before conversion """
    if int_str is None:
        return None
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1527
1528
def float_or_none(v, scale=1, invscale=1, default=None):
    """Convert v to float, returning default when that is not possible

    The result is float(v) * invscale / scale. None and values float()
    rejects give default.
    """
    if v is None:
        return default
    try:
        return float(v) * invscale / scale
    except (ValueError, TypeError):
        # float() raises TypeError, not ValueError, for objects such as
        # lists or dicts
        return default
43f775e4
PH
1536
1537
def parse_duration(s):
    """Parse a duration such as '1:02:03', '3 min' or 'PT1H2M3S' into seconds

    Returns None if s is not a string or is not recognized.
    """
    if not isinstance(s, compat_basestring):
        return None

    s = s.strip()

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?\.?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s)
    if not m:
        return None

    # A lone minutes or hours figure may be fractional and is returned at once
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    # Integer components and the number of seconds each unit stands for
    seconds_per_unit = (
        ('secs', 1),
        ('mins_reversed', 60),
        ('mins', 60),
        ('hours', 60 * 60),
        ('hours_reversed', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    total = 0
    for group_name, factor in seconds_per_unit:
        digits = m.group(group_name)
        if digits:
            total += int(digits) * factor
    # The fractional part of the seconds is added last
    if m.group('ms'):
        total += float(m.group('ms'))
    return total
91d7d0b3
JMF
1582
1583
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext before the real extension of filename

    If expected_real_ext is given and differs from the real extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        return '{0}.{1}{2}'.format(name, ext, real_ext)
    return '{0}.{1}'.format(filename, ext)
d70ad093
PH
1590
1591
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the real extension of filename with ext

    If expected_real_ext is given and differs from the real extension,
    ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if not expected_real_ext or real_ext[1:] == expected_real_ext:
        base = name
    else:
        base = filename
    return '{0}.{1}'.format(base, ext)
1597
1598
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version)

    Returns False when the binary cannot be started """
    try:
        process = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1607
1608
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present

    The combined stdout/stderr of the command is passed to
    detect_exe_version together with version_re and unrecognized """
    try:
        process = subprocess.Popen(
            [encodeArgument(exe)] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = process.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1622
1623
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version from program output

    Returns group 1 of the first match of version_re (by default the token
    after 'version'), or unrecognized if nothing matches.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    found = re.search(pattern, output)
    return found.group(1) if found else unrecognized
1633
1634
class PagedList(object):
    """Base class of the paged lists; subclasses provide getslice()"""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1639
9c44d242
PH
1640
class OnDemandPagedList(PagedList):
    """Paged list that calls pagefunc for each page as getslice reaches it

    With use_cache, fetched pages are kept and reused by later calls.
    """

    def __init__(self, pagefunc, pagesize, use_cache=False):
        self._pagefunc = pagefunc
        self._pagesize = pagesize
        self._use_cache = use_cache
        if use_cache:
            self._cache = {}

    def getslice(self, start=0, end=None):
        collected = []
        pagesize = self._pagesize
        for pagenum in itertools.count(start // pagesize):
            firstid = pagenum * pagesize
            nextfirstid = firstid + pagesize
            if start >= nextfirstid:
                continue

            page_results = self._cache.get(pagenum) if self._use_cache else None
            if page_results is None:
                page_results = list(self._pagefunc(pagenum))
                if self._use_cache:
                    self._cache[pagenum] = page_results

            # Offsets of the requested range within this page
            if firstid <= start < nextfirstid:
                startv = start % pagesize
            else:
                startv = 0

            if end is not None and firstid <= end <= nextfirstid:
                endv = ((end - 1) % pagesize) + 1
            else:
                endv = None

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            collected.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return collected
81c2f20b
PH
1691
1692
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list for which the number of pages is given up front"""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        collected = []
        pagesize = self._pagesize
        start_page = start // pagesize
        if end is None:
            end_page = self._pagecount
            only_more = None
        else:
            end_page = end // pagesize + 1
            # Number of elements still to be returned
            only_more = end - start
        # Elements to drop from the first fetched page
        skip_elems = start - start_page * pagesize
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) >= only_more:
                    # This page completes the request
                    collected.extend(page[:only_more])
                    break
                only_more -= len(page)
            collected.extend(page)
        return collected
1720
1721
def uppercase_escape(s):
    """Decode every \\UXXXXXXXX escape sequence found in s"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', expand, s)
0fe2ff78
YCH
1728
1729
def lowercase_escape(s):
    """Decode every \\uXXXX escape sequence found in s"""
    decode = codecs.getdecoder('unicode_escape')

    def expand(match):
        return decode(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', expand, s)
b53466e1 1736
d05cfe06
S
1737
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 a text string is UTF-8 encoded before quoting
    needs_encoding = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if needs_encoding:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1743
1744
def escape_url(url):
    """Escape URL as suggested by RFC 3986

    Only the path, params, query and fragment components are escaped"""
    parts = compat_urllib_parse_urlparse(url)
    escaped_path = escape_rfc3986(parts.path)
    escaped_params = escape_rfc3986(parts.params)
    escaped_query = escape_rfc3986(parts.query)
    escaped_fragment = escape_rfc3986(parts.fragment)
    return parts._replace(
        path=escaped_path,
        params=escaped_params,
        query=escaped_query,
        fragment=escaped_fragment
    ).geturl()
1754
# Probe whether struct accepts the text format strings used in this module
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        """struct.pack that also accepts a text format string"""
        bytes_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(bytes_spec, *args)

    def struct_unpack(spec, *args):
        """struct.unpack that also accepts a text format string"""
        bytes_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(bytes_spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1771
1772
def read_batch_urls(batch_fd):
    """Read the URLs listed in batch_fd, one per line, and close it

    Byte lines are decoded as UTF-8, a leading byte order mark and
    surrounding whitespace are removed, and empty lines as well as lines
    starting with '#', ';' or ']' are skipped.
    """
    # A byte order mark decoded as UTF-8 is the single character '\ufeff';
    # the three-character form only shows up when the raw bytes were
    # decoded one character per byte, so both are stripped
    BOM_FORMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if not isinstance(url, compat_str):
            url = url.decode('utf-8', 'replace')
        for bom in BOM_FORMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1787
1788
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments and return the result as ASCII bytes"""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1791
1792
def update_url_query(url, query):
    """Return url with its query string updated from the mapping query"""
    parsed_url = compat_urlparse.urlparse(url)
    qs = compat_parse_qs(parsed_url.query)
    qs.update(query)
    new_query = compat_urllib_parse.urlencode(encode_dict(qs), True)
    return compat_urlparse.urlunparse(parsed_url._replace(query=new_query))
1800
1801
def encode_dict(d, encoding='utf-8'):
    """Return a copy of d with every string key and value encoded to bytes"""
    def encode(v):
        if isinstance(v, compat_basestring):
            return v.encode(encoding)
        return v

    encoded = {}
    for k, v in d.items():
        key = encode(k)
        encoded[key] = encode(v)
    return encoded
16392824 1806
8e60dc75 1807
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
    """Look up one key, or the first usable key of a list/tuple, in d

    For a list or tuple, keys that are missing or map to None are skipped,
    as are false values unless skip_false_values is False; default is
    returned when none is usable. A single key is a plain d.get().
    """
    if not isinstance(key_or_keys, (list, tuple)):
        return d.get(key_or_keys, default)
    for key in key_or_keys:
        if key not in d:
            continue
        value = d[key]
        if value is None or (skip_false_values and not value):
            continue
        return value
    return default
1816
1817
8e60dc75
S
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
    """Return string as a compat_str, decoding it with encoding if it is not one"""
    if isinstance(string, compat_str):
        return string
    return compat_str(string, encoding, errors)
1820
16392824 1821
a1a530b0
PH
# Rating label -> age limit in years
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """Parse an age limit such as '18+' or a US_RATINGS label into an int

    Returns None for None input or an unrecognized string.
    """
    if s is None:
        return None
    numeric = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if numeric:
        return int(numeric.group('age'))
    return US_RATINGS.get(s)
146c80e2
S
1836
1837
def strip_jsonp(code):
    """Remove a JSONP 'callback( ... );' wrapper, keeping what it encloses

    Text that does not look like such a call is returned unchanged.
    """
    wrapper = re.compile(
        r'(?s)^[a-zA-Z0-9_.]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$')
    return wrapper.sub(r'\1', code)
478c2c61
PH
1841
1842
e05f6939
PH
def js_to_json(code):
    """Rewrite a JavaScript object literal as JSON

    Identifiers and single-quoted strings become double-quoted strings,
    and a comma directly before a closing ']' or '}' is dropped.
    """
    # What each special sequence inside a single-quoted string turns into
    single_quoted_fixes = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        token = m.group(0)
        if token in ('true', 'false', 'null'):
            return token
        if token.startswith('"'):
            token = re.sub(r"\\'", "'", token[1:-1])
        elif token.startswith("'"):
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: single_quoted_fixes[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    requoted = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    return re.sub(r',(\s*[\]}])', r'\1', requoted)
1866
1867
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values

    The returned function maps an id to its index in quality_ids, or -1
    if the id is not listed """
    def q(qid):
        try:
            rank = quality_ids.index(qid)
        except ValueError:
            rank = -1
        return rank
    return q
1876
acd69589
PH
1877
# Default output template: '<title>-<id>.<ext>'
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1879
a020a0dc
PH
1880
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Strings longer than length are cut so that the result, including the
    trailing '...', is length characters long """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
1889
1890
def version_tuple(v):
    """Split a version string on '-' and '.' into a tuple of ints"""
    components = re.split(r'[-.]', v)
    return tuple(map(int, components))
48844745
PH
1893
1894
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is lower than limit

    When version is empty or either string cannot be parsed, the answer
    is the opposite of assume_new.
    """
    if not version:
        return not assume_new
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return not assume_new
    return outdated
732ea2f0
PH
1902
1903
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # Frozen executable
    if hasattr(sys, 'frozen'):
        return True
    # Module loaded from a zip archive
    return isinstance(globals().get('__loader__'), zipimporter)
7d4111ed
PH
1909
1910
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted_args = [shlex_quote(arg) for arg in args]
    return ' '.join(quoted_args)
2ccd1b10
PH
1914
1915
def error_to_compat_str(err):
    """ Return the message of err as a text string """
    err_str = str(err)
    if sys.version_info[0] >= 3:
        return err_str
    # On python 2 error byte string must be decoded with proper
    # encoding rather than ascii
    return err_str.decode(preferredencoding())
1923
1924
def mimetype2ext(mt):
    """
    Guess a file extension from a MIME type.

    Media type parameters (everything from the first ';', e.g.
    '; charset=utf-8') and surrounding whitespace are ignored.
    Returns None if mt is None, a known extension if the type is listed
    below, and otherwise the MIME subtype itself.
    """
    if mt is None:
        # A missing Content-Type header is passed through as None
        return None

    mt = mt.split(';')[0].strip()

    # Types that need the full type/subtype to be told apart
    ext = {
        'audio/mp4': 'm4a',
    }.get(mt)
    if ext is not None:
        return ext

    _, _, res = mt.rpartition('/')

    return {
        '3gpp': '3gp',
        'smptett+xml': 'tt',
        'srt': 'srt',
        'ttaf+xml': 'dfxp',
        'ttml+xml': 'ttml',
        'vtt': 'vtt',
        'x-flv': 'flv',
        'x-mp4-fragmented': 'mp4',
        'x-ms-wmv': 'wmv',
    }.get(res, res)
1945
1946
2ccd1b10
PH
def urlhandle_detect_ext(url_handle):
    """ Guess a file extension from the Content-Disposition or Content-Type header of a response """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(h):
            return url_handle.headers[h]

    # A filename announced by the server takes precedence over the MIME type
    content_disposition = getheader('Content-Disposition')
    if content_disposition:
        filename_match = re.match(
            r'attachment;\s*filename="(?P<filename>[^"]+)"', content_disposition)
        if filename_match:
            ext = determine_ext(filename_match.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(getheader('Content-Type'))
05900629
PH
1963
1964
1e399778
YCH
def encode_data_uri(data, mime_type):
    """ Build a base64 data: URI from the bytes in data and the given MIME type """
    payload = base64.b64encode(data).decode('ascii')
    return 'data:%s;base64,%s' % (mime_type, payload)
1967
1968
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """
    # No limit set by the user, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
1977
1978
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Without a byte order mark the data is treated as UTF-8
    encoding, offset = 'utf-8', 0
    for bom, bom_encoding in BOMS:
        if first_bytes.startswith(bom):
            encoding, offset = bom_encoding, len(bom)
            break
    s = first_bytes[offset:].decode(encoding, 'replace')

    # HTML if the first non-whitespace character opens a tag
    return re.match(r'^\s*<', s)
a055469f
PH
1997
1998
def determine_protocol(info_dict):
    """ Return the explicit 'protocol' of info_dict, or derive one from its 'url' """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    # Manifest formats are recognized by their file extension
    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2019
2020
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    table = [header_row] + data
    columns = zip(*table)
    max_lens = [
        max(len(compat_str(cell)) for cell in column)
        for column in columns]
    # Every column but the last is left-aligned and padded to its widest cell plus one
    padded_columns = ['%-' + compat_str(width + 1) + 's' for width in max_lens[:-1]]
    format_str = ' '.join(padded_columns) + '%s'
    lines = [format_str % tuple(row) for row in table]
    return '\n'.join(lines)
347de493
PH
2027
2028
2029def _match_one(filter_part, dct):
2030 COMPARISON_OPERATORS = {
2031 '<': operator.lt,
2032 '<=': operator.le,
2033 '>': operator.gt,
2034 '>=': operator.ge,
2035 '=': operator.eq,
2036 '!=': operator.ne,
2037 }
2038 operator_rex = re.compile(r'''(?x)\s*
2039 (?P<key>[a-z_]+)
2040 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2041 (?:
2042 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2043 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2044 )
2045 \s*$
2046 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2047 m = operator_rex.search(filter_part)
2048 if m:
2049 op = COMPARISON_OPERATORS[m.group('op')]
2050 if m.group('strval') is not None:
2051 if m.group('op') not in ('=', '!='):
2052 raise ValueError(
2053 'Operator %s does not support string values!' % m.group('op'))
2054 comparison_value = m.group('strval')
2055 else:
2056 try:
2057 comparison_value = int(m.group('intval'))
2058 except ValueError:
2059 comparison_value = parse_filesize(m.group('intval'))
2060 if comparison_value is None:
2061 comparison_value = parse_filesize(m.group('intval') + 'B')
2062 if comparison_value is None:
2063 raise ValueError(
2064 'Invalid integer value %r in filter part %r' % (
2065 m.group('intval'), filter_part))
2066 actual_value = dct.get(m.group('key'))
2067 if actual_value is None:
2068 return m.group('none_inclusive')
2069 return op(actual_value, comparison_value)
2070
2071 UNARY_OPERATORS = {
2072 '': lambda v: v is not None,
2073 '!': lambda v: v is None,
2074 }
2075 operator_rex = re.compile(r'''(?x)\s*
2076 (?P<op>%s)\s*(?P<key>[a-z_]+)
2077 \s*$
2078 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2079 m = operator_rex.search(filter_part)
2080 if m:
2081 op = UNARY_OPERATORS[m.group('op')]
2082 actual_value = dct.get(m.group('key'))
2083 return op(actual_value)
2084
2085 raise ValueError('Invalid filter part %r' % filter_part)
2086
2087
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # All '&'-separated parts have to match; stop at the first one that does not
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
2093
2094
def match_filter_func(filter_str):
    """ Build a function that returns None for matching info dicts and a skip message otherwise """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        # Name the video by title, falling back to its id
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
91410c9b
PH
2103
2104
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """ Parse a DFXP time expression into seconds; returns None if it is empty or not understood """
    if not time_expr:
        return None

    # Plain offset such as "12.5" or "12.5s"
    offset_match = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset_match:
        return float(offset_match.group('time_offset'))

    # Clock time "H:MM:SS", with an optional fraction after "." or ":"
    clock_match = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
    if clock_match:
        hours, minutes, seconds = clock_match.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds.replace(':', '.'))

    return None
bf6427d2
YCH
2116
2117
c1c924ab
YCH
def srt_subtitles_timecode(seconds):
    """ Format a number of seconds as an SRT timecode (HH:MM:SS,mmm) """
    hours = seconds / 3600
    minutes = (seconds % 3600) / 60
    secs = seconds % 60
    millis = (seconds % 1) * 1000
    # %d drops the fractional part of each component
    return '%02d:%02d:%02d,%03d' % (hours, minutes, secs, millis)
bf6427d2
YCH
2120
2121
def dfxp2srt(dfxp_data):
    """
    Convert a DFXP/TTML subtitle document (text) into SRT text.

    Paragraphs without a begin time, or without both an end time and a
    duration, are skipped. Raises ValueError if no paragraph is found.
    """
    _x = functools.partial(xpath_with_ns, ns_map={
        'ttml': 'http://www.w3.org/ns/ttml',
        'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
    })

    class TTMLPElementParser(object):
        # Parser target collecting the text of a paragraph, with <br> as newline
        out = ''

        def start(self, tag, attrib):
            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
                self.out += '\n'

        def end(self, tag):
            pass

        def data(self, data):
            self.out += data

        def close(self):
            return self.out.strip()

    def parse_node(node):
        # Re-parse the serialized node with the text-collecting target
        serialized = xml.etree.ElementTree.tostring(node)
        parser = xml.etree.ElementTree.XMLParser(target=TTMLPElementParser())
        parser.feed(serialized)
        return parser.close()

    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
    paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p')

    if not paras:
        raise ValueError('Invalid dfxp/TTML subtitle')

    cues = []
    # The index counts every paragraph, including the skipped ones
    for index, para in enumerate(paras, 1):
        attrib = para.attrib
        begin_time = parse_dfxp_time_expr(attrib.get('begin'))
        end_time = parse_dfxp_time_expr(attrib.get('end'))
        dur = parse_dfxp_time_expr(attrib.get('dur'))
        if begin_time is None:
            continue
        if not end_time:
            if not dur:
                continue
            end_time = begin_time + dur
        cues.append('%d\n%s --> %s\n%s\n\n' % (
            index,
            srt_subtitles_timecode(begin_time),
            srt_subtitles_timecode(end_time),
            parse_node(para)))

    return ''.join(cues)
2174
2175
66e289ba
S
def cli_option(params, command_option, param):
    """ Return [command_option, value] for params[param], or [] if it is unset or None """
    value = params.get(param)
    if value is None:
        return []
    return [command_option, value]
2179
2180
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
    """ Render the boolean params[param] as a command line option, as two arguments or joined by separator """
    flag = params.get(param)
    assert isinstance(flag, bool)
    value = true_value if flag else false_value
    if separator:
        return [command_option + separator + value]
    return [command_option, value]
2187
2188
def cli_valueless_option(params, command_option, param, expected_value=True):
    """ Return [command_option] if params[param] equals expected_value, else [] """
    if params.get(param) == expected_value:
        return [command_option]
    return []
2192
2193
def cli_configuration_args(params, param, default=[]):
    """
    Return the list of extra command line arguments stored in params[param].

    If the parameter is unset or None, default is returned instead; a list
    default is copied first, so mutating the result never changes the
    default seen by later calls.
    """
    ex_args = params.get(param)
    if ex_args is None:
        # The default list in the signature is shared between all calls
        return list(default) if isinstance(default, list) else default
    assert isinstance(ex_args, list)
    return ex_args
2200
2201
39672624
YCH
class ISO639Utils(object):
    """ Conversion between two-letter (ISO 639-1) and three-letter (ISO 639-2/T) language codes """

    # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
    _lang_map = {
        'aa': 'aar',
        'ab': 'abk',
        'ae': 'ave',
        'af': 'afr',
        'ak': 'aka',
        'am': 'amh',
        'an': 'arg',
        'ar': 'ara',
        'as': 'asm',
        'av': 'ava',
        'ay': 'aym',
        'az': 'aze',
        'ba': 'bak',
        'be': 'bel',
        'bg': 'bul',
        'bh': 'bih',
        'bi': 'bis',
        'bm': 'bam',
        'bn': 'ben',
        'bo': 'bod',
        'br': 'bre',
        'bs': 'bos',
        'ca': 'cat',
        'ce': 'che',
        'ch': 'cha',
        'co': 'cos',
        'cr': 'cre',
        'cs': 'ces',
        'cu': 'chu',
        'cv': 'chv',
        'cy': 'cym',
        'da': 'dan',
        'de': 'deu',
        'dv': 'div',
        'dz': 'dzo',
        'ee': 'ewe',
        'el': 'ell',
        'en': 'eng',
        'eo': 'epo',
        'es': 'spa',
        'et': 'est',
        'eu': 'eus',
        'fa': 'fas',
        'ff': 'ful',
        'fi': 'fin',
        'fj': 'fij',
        'fo': 'fao',
        'fr': 'fra',
        'fy': 'fry',
        'ga': 'gle',
        'gd': 'gla',
        'gl': 'glg',
        'gn': 'grn',
        'gu': 'guj',
        'gv': 'glv',
        'ha': 'hau',
        'he': 'heb',
        'hi': 'hin',
        'ho': 'hmo',
        'hr': 'hrv',
        'ht': 'hat',
        'hu': 'hun',
        'hy': 'hye',
        'hz': 'her',
        'ia': 'ina',
        'id': 'ind',
        'ie': 'ile',
        'ig': 'ibo',
        'ii': 'iii',
        'ik': 'ipk',
        'io': 'ido',
        'is': 'isl',
        'it': 'ita',
        'iu': 'iku',
        'ja': 'jpn',
        'jv': 'jav',
        'ka': 'kat',
        'kg': 'kon',
        'ki': 'kik',
        'kj': 'kua',
        'kk': 'kaz',
        'kl': 'kal',
        'km': 'khm',
        'kn': 'kan',
        'ko': 'kor',
        'kr': 'kau',
        'ks': 'kas',
        'ku': 'kur',
        'kv': 'kom',
        'kw': 'cor',
        'ky': 'kir',
        'la': 'lat',
        'lb': 'ltz',
        'lg': 'lug',
        'li': 'lim',
        'ln': 'lin',
        'lo': 'lao',
        'lt': 'lit',
        'lu': 'lub',
        'lv': 'lav',
        'mg': 'mlg',
        'mh': 'mah',
        'mi': 'mri',
        'mk': 'mkd',
        'ml': 'mal',
        'mn': 'mon',
        'mr': 'mar',
        'ms': 'msa',
        'mt': 'mlt',
        'my': 'mya',
        'na': 'nau',
        'nb': 'nob',
        'nd': 'nde',
        'ne': 'nep',
        'ng': 'ndo',
        'nl': 'nld',
        'nn': 'nno',
        'no': 'nor',
        'nr': 'nbl',
        'nv': 'nav',
        'ny': 'nya',
        'oc': 'oci',
        'oj': 'oji',
        'om': 'orm',
        'or': 'ori',
        'os': 'oss',
        'pa': 'pan',
        'pi': 'pli',
        'pl': 'pol',
        'ps': 'pus',
        'pt': 'por',
        'qu': 'que',
        'rm': 'roh',
        'rn': 'run',
        'ro': 'ron',
        'ru': 'rus',
        'rw': 'kin',
        'sa': 'san',
        'sc': 'srd',
        'sd': 'snd',
        'se': 'sme',
        'sg': 'sag',
        'si': 'sin',
        'sk': 'slk',
        'sl': 'slv',
        'sm': 'smo',
        'sn': 'sna',
        'so': 'som',
        'sq': 'sqi',
        'sr': 'srp',
        'ss': 'ssw',
        'st': 'sot',
        'su': 'sun',
        'sv': 'swe',
        'sw': 'swa',
        'ta': 'tam',
        'te': 'tel',
        'tg': 'tgk',
        'th': 'tha',
        'ti': 'tir',
        'tk': 'tuk',
        'tl': 'tgl',
        'tn': 'tsn',
        'to': 'ton',
        'tr': 'tur',
        'ts': 'tso',
        'tt': 'tat',
        'tw': 'twi',
        'ty': 'tah',
        'ug': 'uig',
        'uk': 'ukr',
        'ur': 'urd',
        'uz': 'uzb',
        've': 'ven',
        'vi': 'vie',
        'vo': 'vol',
        'wa': 'wln',
        'wo': 'wol',
        'xh': 'xho',
        'yi': 'yid',
        'yo': 'yor',
        'za': 'zha',
        'zh': 'zho',
        'zu': 'zul',
    }

    @classmethod
    def short2long(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
        # Only the first two characters are looked up, so "en-US" maps like "en"
        prefix = code[:2]
        return cls._lang_map.get(prefix)

    @classmethod
    def long2short(cls, code):
        """Convert language code from ISO 639-2/T to ISO 639-1"""
        # Reverse lookup by value; None if the code is not in the table
        return next(
            (short_name for short_name, long_name in cls._lang_map.items()
             if long_name == code),
            None)
2402
2403
4eb10f66
YCH
class ISO3166Utils(object):
    """ Lookup of country names by their two-letter ISO 3166 country code """

    # From http://data.okfn.org/data/core/country-list
    _country_map = {
        'AF': 'Afghanistan',
        'AX': 'Åland Islands',
        'AL': 'Albania',
        'DZ': 'Algeria',
        'AS': 'American Samoa',
        'AD': 'Andorra',
        'AO': 'Angola',
        'AI': 'Anguilla',
        'AQ': 'Antarctica',
        'AG': 'Antigua and Barbuda',
        'AR': 'Argentina',
        'AM': 'Armenia',
        'AW': 'Aruba',
        'AU': 'Australia',
        'AT': 'Austria',
        'AZ': 'Azerbaijan',
        'BS': 'Bahamas',
        'BH': 'Bahrain',
        'BD': 'Bangladesh',
        'BB': 'Barbados',
        'BY': 'Belarus',
        'BE': 'Belgium',
        'BZ': 'Belize',
        'BJ': 'Benin',
        'BM': 'Bermuda',
        'BT': 'Bhutan',
        'BO': 'Bolivia, Plurinational State of',
        'BQ': 'Bonaire, Sint Eustatius and Saba',
        'BA': 'Bosnia and Herzegovina',
        'BW': 'Botswana',
        'BV': 'Bouvet Island',
        'BR': 'Brazil',
        'IO': 'British Indian Ocean Territory',
        'BN': 'Brunei Darussalam',
        'BG': 'Bulgaria',
        'BF': 'Burkina Faso',
        'BI': 'Burundi',
        'KH': 'Cambodia',
        'CM': 'Cameroon',
        'CA': 'Canada',
        'CV': 'Cape Verde',
        'KY': 'Cayman Islands',
        'CF': 'Central African Republic',
        'TD': 'Chad',
        'CL': 'Chile',
        'CN': 'China',
        'CX': 'Christmas Island',
        'CC': 'Cocos (Keeling) Islands',
        'CO': 'Colombia',
        'KM': 'Comoros',
        'CG': 'Congo',
        'CD': 'Congo, the Democratic Republic of the',
        'CK': 'Cook Islands',
        'CR': 'Costa Rica',
        'CI': 'Côte d\'Ivoire',
        'HR': 'Croatia',
        'CU': 'Cuba',
        'CW': 'Curaçao',
        'CY': 'Cyprus',
        'CZ': 'Czech Republic',
        'DK': 'Denmark',
        'DJ': 'Djibouti',
        'DM': 'Dominica',
        'DO': 'Dominican Republic',
        'EC': 'Ecuador',
        'EG': 'Egypt',
        'SV': 'El Salvador',
        'GQ': 'Equatorial Guinea',
        'ER': 'Eritrea',
        'EE': 'Estonia',
        'ET': 'Ethiopia',
        'FK': 'Falkland Islands (Malvinas)',
        'FO': 'Faroe Islands',
        'FJ': 'Fiji',
        'FI': 'Finland',
        'FR': 'France',
        'GF': 'French Guiana',
        'PF': 'French Polynesia',
        'TF': 'French Southern Territories',
        'GA': 'Gabon',
        'GM': 'Gambia',
        'GE': 'Georgia',
        'DE': 'Germany',
        'GH': 'Ghana',
        'GI': 'Gibraltar',
        'GR': 'Greece',
        'GL': 'Greenland',
        'GD': 'Grenada',
        'GP': 'Guadeloupe',
        'GU': 'Guam',
        'GT': 'Guatemala',
        'GG': 'Guernsey',
        'GN': 'Guinea',
        'GW': 'Guinea-Bissau',
        'GY': 'Guyana',
        'HT': 'Haiti',
        'HM': 'Heard Island and McDonald Islands',
        'VA': 'Holy See (Vatican City State)',
        'HN': 'Honduras',
        'HK': 'Hong Kong',
        'HU': 'Hungary',
        'IS': 'Iceland',
        'IN': 'India',
        'ID': 'Indonesia',
        'IR': 'Iran, Islamic Republic of',
        'IQ': 'Iraq',
        'IE': 'Ireland',
        'IM': 'Isle of Man',
        'IL': 'Israel',
        'IT': 'Italy',
        'JM': 'Jamaica',
        'JP': 'Japan',
        'JE': 'Jersey',
        'JO': 'Jordan',
        'KZ': 'Kazakhstan',
        'KE': 'Kenya',
        'KI': 'Kiribati',
        'KP': 'Korea, Democratic People\'s Republic of',
        'KR': 'Korea, Republic of',
        'KW': 'Kuwait',
        'KG': 'Kyrgyzstan',
        'LA': 'Lao People\'s Democratic Republic',
        'LV': 'Latvia',
        'LB': 'Lebanon',
        'LS': 'Lesotho',
        'LR': 'Liberia',
        'LY': 'Libya',
        'LI': 'Liechtenstein',
        'LT': 'Lithuania',
        'LU': 'Luxembourg',
        'MO': 'Macao',
        'MK': 'Macedonia, the Former Yugoslav Republic of',
        'MG': 'Madagascar',
        'MW': 'Malawi',
        'MY': 'Malaysia',
        'MV': 'Maldives',
        'ML': 'Mali',
        'MT': 'Malta',
        'MH': 'Marshall Islands',
        'MQ': 'Martinique',
        'MR': 'Mauritania',
        'MU': 'Mauritius',
        'YT': 'Mayotte',
        'MX': 'Mexico',
        'FM': 'Micronesia, Federated States of',
        'MD': 'Moldova, Republic of',
        'MC': 'Monaco',
        'MN': 'Mongolia',
        'ME': 'Montenegro',
        'MS': 'Montserrat',
        'MA': 'Morocco',
        'MZ': 'Mozambique',
        'MM': 'Myanmar',
        'NA': 'Namibia',
        'NR': 'Nauru',
        'NP': 'Nepal',
        'NL': 'Netherlands',
        'NC': 'New Caledonia',
        'NZ': 'New Zealand',
        'NI': 'Nicaragua',
        'NE': 'Niger',
        'NG': 'Nigeria',
        'NU': 'Niue',
        'NF': 'Norfolk Island',
        'MP': 'Northern Mariana Islands',
        'NO': 'Norway',
        'OM': 'Oman',
        'PK': 'Pakistan',
        'PW': 'Palau',
        'PS': 'Palestine, State of',
        'PA': 'Panama',
        'PG': 'Papua New Guinea',
        'PY': 'Paraguay',
        'PE': 'Peru',
        'PH': 'Philippines',
        'PN': 'Pitcairn',
        'PL': 'Poland',
        'PT': 'Portugal',
        'PR': 'Puerto Rico',
        'QA': 'Qatar',
        'RE': 'Réunion',
        'RO': 'Romania',
        'RU': 'Russian Federation',
        'RW': 'Rwanda',
        'BL': 'Saint Barthélemy',
        'SH': 'Saint Helena, Ascension and Tristan da Cunha',
        'KN': 'Saint Kitts and Nevis',
        'LC': 'Saint Lucia',
        'MF': 'Saint Martin (French part)',
        'PM': 'Saint Pierre and Miquelon',
        'VC': 'Saint Vincent and the Grenadines',
        'WS': 'Samoa',
        'SM': 'San Marino',
        'ST': 'Sao Tome and Principe',
        'SA': 'Saudi Arabia',
        'SN': 'Senegal',
        'RS': 'Serbia',
        'SC': 'Seychelles',
        'SL': 'Sierra Leone',
        'SG': 'Singapore',
        'SX': 'Sint Maarten (Dutch part)',
        'SK': 'Slovakia',
        'SI': 'Slovenia',
        'SB': 'Solomon Islands',
        'SO': 'Somalia',
        'ZA': 'South Africa',
        'GS': 'South Georgia and the South Sandwich Islands',
        'SS': 'South Sudan',
        'ES': 'Spain',
        'LK': 'Sri Lanka',
        'SD': 'Sudan',
        'SR': 'Suriname',
        'SJ': 'Svalbard and Jan Mayen',
        'SZ': 'Swaziland',
        'SE': 'Sweden',
        'CH': 'Switzerland',
        'SY': 'Syrian Arab Republic',
        'TW': 'Taiwan, Province of China',
        'TJ': 'Tajikistan',
        'TZ': 'Tanzania, United Republic of',
        'TH': 'Thailand',
        'TL': 'Timor-Leste',
        'TG': 'Togo',
        'TK': 'Tokelau',
        'TO': 'Tonga',
        'TT': 'Trinidad and Tobago',
        'TN': 'Tunisia',
        'TR': 'Turkey',
        'TM': 'Turkmenistan',
        'TC': 'Turks and Caicos Islands',
        'TV': 'Tuvalu',
        'UG': 'Uganda',
        'UA': 'Ukraine',
        'AE': 'United Arab Emirates',
        'GB': 'United Kingdom',
        'US': 'United States',
        'UM': 'United States Minor Outlying Islands',
        'UY': 'Uruguay',
        'UZ': 'Uzbekistan',
        'VU': 'Vanuatu',
        'VE': 'Venezuela, Bolivarian Republic of',
        'VN': 'Viet Nam',
        'VG': 'Virgin Islands, British',
        'VI': 'Virgin Islands, U.S.',
        'WF': 'Wallis and Futuna',
        'EH': 'Western Sahara',
        'YE': 'Yemen',
        'ZM': 'Zambia',
        'ZW': 'Zimbabwe',
    }

    @classmethod
    def short2full(cls, code):
        """Convert a two-letter country code (in any letter case) to the corresponding full name"""
        normalized = code.upper()
        return cls._country_map.get(normalized)
2662
2663
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ Proxy handler whose proxy can be overridden per request via the Ytdl-request-proxy header """

    def __init__(self, proxies=None):
        # Set default handlers
        for scheme in ('http', 'https'):
            default_open = (
                lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                meth(r, proxy, type))
            setattr(self, '%s_open' % scheme, default_open)
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        # A proxy named in the request header wins; the header is then removed
        header_proxy = req.headers.get('Ytdl-request-proxy')
        if header_proxy is not None:
            proxy = header_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        return compat_urllib_request.ProxyHandler.proxy_open(
            self, req, proxy, type)
5bc880b9
YCH
2683
2684
def ohdave_rsa_encrypt(data, exponent, modulus):
    '''
    Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/

    Input:
        data: data to encrypt, bytes-like object
        exponent, modulus: parameter e and N of RSA algorithm, both integer
    Output: hex string of encrypted data

    Limitation: supports one block encryption only
    '''

    # The bytes are reversed before being read as one big hexadecimal number
    reversed_hex = binascii.hexlify(data[::-1])
    payload = int(reversed_hex, 16)
    return '%x' % pow(payload, exponent, modulus)
81bdc8fd
YCH
2700
2701
def encode_base_n(num, n, table=None):
    """
    Encode the non-negative integer num as a string in base n.

    table supplies the digit characters (index == digit value); if it is
    omitted or empty, the first n characters of 0-9a-zA-Z are used.

    Raises ValueError if n exceeds the table length, if num is negative,
    or if a non-zero num is requested in a base smaller than 2.
    """
    FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    if not table:
        table = FULL_TABLE[:n]

    if n > len(table):
        raise ValueError('base %d exceeds table length %d' % (n, len(table)))

    if num == 0:
        return table[0]

    # For these inputs the division loop below would never reach zero
    if num < 0:
        raise ValueError('cannot encode negative number %d' % num)
    if n < 2:
        raise ValueError('base %d is too small' % n)

    ret = ''
    while num:
        num, digit = divmod(num, n)
        ret = table[digit] + ret
    return ret
f52354a8
YCH
2718
2719
def decode_packed_codes(code):
    """ Expand code packed as }('payload',base,count,'sym|sym|...'.split('|') by substituting its symbols """
    mobj = re.search(
        r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
        code)
    obfuscated_code, base, count, symbols = mobj.groups()
    base = int(base)
    symbols = symbols.split('|')

    # Map each index written in base n to its symbol; an empty symbol stands for itself
    symbol_table = {}
    for index in reversed(range(int(count))):
        encoded_index = encode_base_n(index, base)
        symbol_table[encoded_index] = symbols[index] or encoded_index

    return re.sub(
        r'\b(\w+)\b', lambda word: symbol_table[word.group(0)],
        obfuscated_code)