#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals

import base64
import binascii
import calendar
import codecs
import contextlib
import ctypes
import datetime
import email.utils
import errno
import functools
import gzip
import io
import itertools
import json
import locale
import math
import operator
import os
import pipes
import platform
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import traceback
import xml.etree.ElementTree
import zlib

from .compat import (
    compat_HTMLParser,
    compat_basestring,
    compat_chr,
    compat_etree_fromstring,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_kwargs,
    compat_parse_qs,
    compat_shlex_quote,
    compat_socket_create_connection,
    compat_str,
    compat_struct_pack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
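# Illustrative usage sketch (an assumption for clarity, not part of the
# original file): expand a namespace-prefixed path; the prefix/URI pair below
# is made up.
# xpath_with_ns('ns0:media/ns0:title', {'ns0': 'http://example.com/ns'})
#   -> '{http://example.com/ns}media/{http://example.com/ns}title'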


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value)), html)

    if not m:
        return None
    res = m.group('content')

    if res.startswith('"') or res.startswith("'"):
        res = res[1:-1]

    return unescapeHTML(res)
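# Illustrative usage sketch (assumption, not from the original source):
# get_element_by_attribute('class', 'title', '<span class="title">Some video</span>')
#   -> 'Some video'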


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    parser.feed(html_element)
    parser.close()
    return parser.attrs


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
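# Illustrative usage sketch (assumption, not from the original source):
# sanitize_filename('Foo: Bar')                   -> 'Foo - Bar'
# sanitize_filename('Foo: Bar', restricted=True)  -> 'Foo_-_Bar'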


def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)


# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitize_url(url):
    return 'http:%s' % url if url.startswith('//') else url


def sanitized_Request(url, *args, **kwargs):
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
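# Illustrative usage sketch (assumption, not from the original source):
# unescapeHTML('&amp;')   -> '&'   (named entity)
# unescapeHTML('&#x61;')  -> 'a'   (hexadecimal numeric entity)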


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


def formatSeconds(secs):
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs


def make_HTTPS_handler(params, **kwargs):
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message():
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg


class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        self.msg = msg


class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk add the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        m = re.search(
            r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if not m:
            timezone = datetime.timedelta()
        else:
            date_str = date_str[:-len(m.group(0))]
            if not m.group('sign'):
                timezone = datetime.timedelta()
            else:
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))
    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        pass
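# Illustrative usage sketch (assumption, not from the original source):
# parse_iso8601('1970-01-02T00:00:00Z')  -> 86400
# A trailing offset such as '+02:00' is parsed and subtracted before the
# timestamp is computed.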


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M',
        '%b %dnd %Y %I:%M',
        '%b %dth %Y %I:%M',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions.extend([
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d.%m.%y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
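# Illustrative usage sketch (assumption, not from the original source):
# unified_strdate('2016/05/09')  -> '20160509'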


def determine_ext(url, default_ext='unknown_video'):
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
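# Illustrative usage sketch (assumption, not from the original source):
# determine_ext('http://example.com/video.mp4?dl=1')  -> 'mp4'
# determine_ext('http://example.com/page')            -> 'unknown_video'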


def subtitles_filename(filename, sub_lang, sub_format):
    return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format


def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str


class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())


def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res


def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True


def write_string(s, out=None, encoding=None):
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()


def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)


# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, is missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)


class locked_file(object):
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)


def get_filesystem_encoding():
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'


def shell_quote(args):
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)


def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
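# Illustrative usage sketch (assumption, not from the original source): the
# data is carried in the URL fragment and recovered unchanged on the other side.
# unsmuggle_url(smuggle_url('http://example.com/1', {'a': 1}))
#   -> ('http://example.com/1', {'a': 1})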


def format_bytes(bytes):
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
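# Illustrative usage sketch (assumption, not from the original source):
# format_bytes(1536)  -> '1.50KiB'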


def lookup_unit_table(unit_table, s):
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)


def parse_filesize(s):
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)


def parse_count(s):
    if s is None:
        return None

    s = s.strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
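# Illustrative usage sketch (assumption, not from the original source):
# parse_filesize('1.5 MiB')  -> 1572864
# parse_count('1.2M')        -> 1200000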


def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    try:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    except ValueError:
        return None


def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None


def fix_xml_ampersands(xml_str):
    """Replace all the '&' by '&amp;' in XML"""
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
        '&amp;',
        xml_str)


def setproctitle(title):
    assert isinstance(title, compat_str)

    # ctypes in Jython is not complete
    # http://bugs.jython.org/issue2148
    if sys.platform.startswith('java'):
        return

    try:
        libc = ctypes.cdll.LoadLibrary('libc.so.6')
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    buf = ctypes.create_string_buffer(len(title_bytes))
    buf.value = title_bytes
    try:
        libc.prctl(15, buf, 0, 0, 0)
    except AttributeError:
        return  # Strange libc, just skip this


def remove_start(s, start):
    return s[len(start):] if s is not None and s.startswith(start) else s


def remove_end(s, end):
    return s[:-len(end)] if s is not None and s.endswith(end) else s


def remove_quotes(s):
    if s is None or len(s) < 2:
        return s
    for quote in ('"', "'", ):
        if s[0] == quote and s[-1] == quote:
            return s[1:-1]
    return s


def url_basename(url):
    path = compat_urlparse.urlparse(url).path
    return path.strip('/').split('/')[-1]
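# Illustrative usage sketch (assumption, not from the original source):
# url_basename('http://example.com/a/b/c.mp4?x=1')  -> 'c.mp4'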
aa94a6d3
PH
1587
1588
1589class HEADRequest(compat_urllib_request.Request):
1590 def get_method(self):
611c1dd9 1591 return 'HEAD'
7217e148
PH
1592
1593
9732d77e 1594def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
28746fbd
PH
1595 if get_attr:
1596 if v is not None:
1597 v = getattr(v, get_attr, None)
9572013d
PH
1598 if v == '':
1599 v = None
1812afb7
S
1600 if v is None:
1601 return default
1602 try:
1603 return int(v) * invscale // scale
1604 except ValueError:
af98f8ff 1605 return default
9732d77e 1606
9572013d 1607
40a90862
JMF
1608def str_or_none(v, default=None):
1609 return default if v is None else compat_str(v)
1610
9732d77e
PH
1611
1612def str_to_int(int_str):
48d4681e 1613 """ A more relaxed version of int_or_none """
9732d77e
PH
1614 if int_str is None:
1615 return None
28e614de 1616 int_str = re.sub(r'[,\.\+]', '', int_str)
9732d77e 1617 return int(int_str)
608d11f5
PH
1618
1619
9732d77e 1620def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
1621 if v is None:
1622 return default
1623 try:
1624 return float(v) * invscale / scale
1625 except ValueError:
1626 return default
43f775e4
PH
1627
1628
608d11f5 1629def parse_duration(s):
8f9312c3 1630 if not isinstance(s, compat_basestring):
608d11f5
PH
1631 return None
1632
ca7b3246
S
1633 s = s.strip()
1634
acaff495 1635 days, hours, mins, secs, ms = [None] * 5
1636 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
1637 if m:
1638 days, hours, mins, secs, ms = m.groups()
1639 else:
1640 m = re.match(
1641 r'''(?ix)(?:P?T)?
8f4b58d7 1642 (?:
acaff495 1643 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
8f4b58d7 1644 )?
acaff495 1645 (?:
1646 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1647 )?
1648 (?:
1649 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1650 )?
1651 (?:
1652 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1653 )?$''', s)
1654 if m:
1655 days, hours, mins, secs, ms = m.groups()
1656 else:
1657 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
1658 if m:
1659 hours, mins = m.groups()
1660 else:
1661 return None
1662
1663 duration = 0
1664 if secs:
1665 duration += float(secs)
1666 if mins:
1667 duration += float(mins) * 60
1668 if hours:
1669 duration += float(hours) * 60 * 60
1670 if days:
1671 duration += float(days) * 24 * 60 * 60
1672 if ms:
1673 duration += float(ms)
1674 return duration
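Some of the input styles parse_duration accepts, as a rough sketch (examples invented):

    parse_duration('1:23:45')     # -> 5025.0
    parse_duration('23:45')       # -> 1425.0
    parse_duration('3 min')       # -> 180.0
    parse_duration('PT2M30S')     # -> 150.0
    parse_duration('1.5 hours')   # -> 5400.0
    parse_duration('soon')        # -> None
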
91d7d0b3
JMF
1675
1676
e65e4c88 1677def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 1678 name, real_ext = os.path.splitext(filename)
e65e4c88
S
1679 return (
1680 '{0}.{1}{2}'.format(name, ext, real_ext)
1681 if not expected_real_ext or real_ext[1:] == expected_real_ext
1682 else '{0}.{1}'.format(filename, ext))
d70ad093
PH
1683
1684
b3ed15b7
S
1685def replace_extension(filename, ext, expected_real_ext=None):
1686 name, real_ext = os.path.splitext(filename)
1687 return '{0}.{1}'.format(
1688 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1689 ext)
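A small sketch of the two filename helpers above (filenames are illustrative):

    prepend_extension('video.mp4', 'temp')          # -> 'video.temp.mp4'
    prepend_extension('video.mkv', 'temp', 'mp4')   # -> 'video.mkv.temp' (extension mismatch)
    replace_extension('video.mp4', 'webm')          # -> 'video.webm'
    replace_extension('video.mkv', 'webm', 'mp4')   # -> 'video.mkv.webm'
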
1690
1691
d70ad093
PH
1692def check_executable(exe, args=[]):
1693 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1694 args can be a list of arguments for a short output (like -version) """
1695 try:
1696 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1697 except OSError:
1698 return False
1699 return exe
b7ab0590
PH
1700
1701
95807118 1702def get_exe_version(exe, args=['--version'],
cae97f65 1703 version_re=None, unrecognized='present'):
95807118
PH
1704 """ Returns the version of the specified executable,
1705 or False if the executable is not present """
1706 try:
cae97f65 1707 out, _ = subprocess.Popen(
54116803 1708 [encodeArgument(exe)] + args,
95807118
PH
1709 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1710 except OSError:
1711 return False
cae97f65
PH
1712 if isinstance(out, bytes): # Python 2.x
1713 out = out.decode('ascii', 'ignore')
1714 return detect_exe_version(out, version_re, unrecognized)
1715
1716
1717def detect_exe_version(output, version_re=None, unrecognized='present'):
1718 assert isinstance(output, compat_str)
1719 if version_re is None:
1720 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1721 m = re.search(version_re, output)
95807118
PH
1722 if m:
1723 return m.group(1)
1724 else:
1725 return unrecognized
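For instance, detect_exe_version applied to typical `ffmpeg -version` output behaves roughly like this (the output strings are shortened, made-up samples):

    detect_exe_version('ffmpeg version 2.8.4 Copyright (c) 2000-2015 ...')   # -> '2.8.4'
    detect_exe_version('something unexpected')                               # -> 'present'
    detect_exe_version('something unexpected', unrecognized=False)           # -> False
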
1726
1727
b7ab0590 1728class PagedList(object):
dd26ced1
PH
1729 def __len__(self):
1730 # This is only useful for tests
1731 return len(self.getslice())
1732
9c44d242
PH
1733
1734class OnDemandPagedList(PagedList):
b95dc034 1735 def __init__(self, pagefunc, pagesize, use_cache=False):
9c44d242
PH
1736 self._pagefunc = pagefunc
1737 self._pagesize = pagesize
b95dc034
YCH
1738 self._use_cache = use_cache
1739 if use_cache:
1740 self._cache = {}
9c44d242 1741
b7ab0590
PH
1742 def getslice(self, start=0, end=None):
1743 res = []
1744 for pagenum in itertools.count(start // self._pagesize):
1745 firstid = pagenum * self._pagesize
1746 nextfirstid = pagenum * self._pagesize + self._pagesize
1747 if start >= nextfirstid:
1748 continue
1749
b95dc034
YCH
1750 page_results = None
1751 if self._use_cache:
1752 page_results = self._cache.get(pagenum)
1753 if page_results is None:
1754 page_results = list(self._pagefunc(pagenum))
1755 if self._use_cache:
1756 self._cache[pagenum] = page_results
b7ab0590
PH
1757
1758 startv = (
1759 start % self._pagesize
1760 if firstid <= start < nextfirstid
1761 else 0)
1762
1763 endv = (
1764 ((end - 1) % self._pagesize) + 1
1765 if (end is not None and firstid <= end <= nextfirstid)
1766 else None)
1767
1768 if startv != 0 or endv is not None:
1769 page_results = page_results[startv:endv]
1770 res.extend(page_results)
1771
1772 # A little optimization - if the current page is not "full", i.e. does
1773 # not contain page_size videos, then we can assume that this page
1774 # is the last one - there are no more ids on further pages -
1775 # so there is no need to query again.

1776 if len(page_results) + startv < self._pagesize:
1777 break
1778
1779 # If we got the whole page, but the next page is not interesting,
1780 # break out early as well
1781 if end == nextfirstid:
1782 break
1783 return res
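A minimal sketch of how OnDemandPagedList is meant to be used (the page function below is a stand-in, not from this file):

    def fetch_page(pagenum):
        # pretend each "page" of the remote API holds ten consecutive ids
        return range(pagenum * 10, pagenum * 10 + 10)

    pl = OnDemandPagedList(fetch_page, 10, use_cache=True)
    pl.getslice(25, 35)   # -> [25, 26, ..., 34]; only pages 2 and 3 are fetched
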
81c2f20b
PH
1784
1785
9c44d242
PH
1786class InAdvancePagedList(PagedList):
1787 def __init__(self, pagefunc, pagecount, pagesize):
1788 self._pagefunc = pagefunc
1789 self._pagecount = pagecount
1790 self._pagesize = pagesize
1791
1792 def getslice(self, start=0, end=None):
1793 res = []
1794 start_page = start // self._pagesize
1795 end_page = (
1796 self._pagecount if end is None else (end // self._pagesize + 1))
1797 skip_elems = start - start_page * self._pagesize
1798 only_more = None if end is None else end - start
1799 for pagenum in range(start_page, end_page):
1800 page = list(self._pagefunc(pagenum))
1801 if skip_elems:
1802 page = page[skip_elems:]
1803 skip_elems = None
1804 if only_more is not None:
1805 if len(page) < only_more:
1806 only_more -= len(page)
1807 else:
1808 page = page[:only_more]
1809 res.extend(page)
1810 break
1811 res.extend(page)
1812 return res
1813
1814
81c2f20b 1815def uppercase_escape(s):
676eb3f2 1816 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 1817 return re.sub(
a612753d 1818 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
1819 lambda m: unicode_escape(m.group(0))[0],
1820 s)
0fe2ff78
YCH
1821
1822
1823def lowercase_escape(s):
1824 unicode_escape = codecs.getdecoder('unicode_escape')
1825 return re.sub(
1826 r'\\u[0-9a-fA-F]{4}',
1827 lambda m: unicode_escape(m.group(0))[0],
1828 s)
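Both helpers only touch literal backslash escapes left behind in scraped text, e.g. (illustrative):

    uppercase_escape(r'fragile \U0001F600')   # -> 'fragile 😀'
    lowercase_escape(r'Fran\u00e7ais')        # -> 'Français'
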
b53466e1 1829
d05cfe06
S
1830
1831def escape_rfc3986(s):
1832 """Escape non-ASCII characters as suggested by RFC 3986"""
8f9312c3 1833 if sys.version_info < (3, 0) and isinstance(s, compat_str):
d05cfe06 1834 s = s.encode('utf-8')
ecc0c5ee 1835 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1836
1837
1838def escape_url(url):
1839 """Escape URL as suggested by RFC 3986"""
1840 url_parsed = compat_urllib_parse_urlparse(url)
1841 return url_parsed._replace(
efbed08d 1842 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
1843 path=escape_rfc3986(url_parsed.path),
1844 params=escape_rfc3986(url_parsed.params),
1845 query=escape_rfc3986(url_parsed.query),
1846 fragment=escape_rfc3986(url_parsed.fragment)
1847 ).geturl()
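Roughly, escape_url percent-encodes the non-ASCII parts of each URL component while leaving the overall structure alone (example invented):

    escape_url('http://example.com/pâth?q=føø')
    # -> 'http://example.com/p%C3%A2th?q=f%C3%B8%C3%B8'
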
1848
62e609ab
PH
1849
1850def read_batch_urls(batch_fd):
1851 def fixup(url):
1852 if not isinstance(url, compat_str):
1853 url = url.decode('utf-8', 'replace')
28e614de 1854 BOM_UTF8 = '\xef\xbb\xbf'
62e609ab
PH
1855 if url.startswith(BOM_UTF8):
1856 url = url[len(BOM_UTF8):]
1857 url = url.strip()
1858 if url.startswith(('#', ';', ']')):
1859 return False
1860 return url
1861
1862 with contextlib.closing(batch_fd) as fd:
1863 return [url for url in map(fixup, fd) if url]
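A sketch of the batch-file handling (the in-memory file below is just for illustration):

    import io
    batch = io.StringIO('# a comment\nhttps://example.com/a\n\nhttps://example.com/b\n')
    read_batch_urls(batch)   # -> ['https://example.com/a', 'https://example.com/b']
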
b74fa8cd
JMF
1864
1865
1866def urlencode_postdata(*args, **kargs):
15707c7e 1867 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
1868
1869
38f9ef31 1870def update_url_query(url, query):
cacd9966
YCH
1871 if not query:
1872 return url
38f9ef31 1873 parsed_url = compat_urlparse.urlparse(url)
1874 qs = compat_parse_qs(parsed_url.query)
1875 qs.update(query)
1876 return compat_urlparse.urlunparse(parsed_url._replace(
15707c7e 1877 query=compat_urllib_parse_urlencode(qs, True)))
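For example (parameter ordering in the result may vary between Python versions):

    update_url_query('http://example.com/path?a=1', {'b': '2'})
    # -> 'http://example.com/path?a=1&b=2'
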
16392824 1878
8e60dc75 1879
ed0291d1
S
1880def update_Request(req, url=None, data=None, headers={}, query={}):
1881 req_headers = req.headers.copy()
1882 req_headers.update(headers)
1883 req_data = data or req.data
1884 req_url = update_url_query(url or req.get_full_url(), query)
1885 req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request
1886 new_req = req_type(
1887 req_url, data=req_data, headers=req_headers,
1888 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
1889 if hasattr(req, 'timeout'):
1890 new_req.timeout = req.timeout
1891 return new_req
1892
1893
86296ad2 1894def dict_get(d, key_or_keys, default=None, skip_false_values=True):
cbecc9b9
S
1895 if isinstance(key_or_keys, (list, tuple)):
1896 for key in key_or_keys:
86296ad2
S
1897 if key not in d or d[key] is None or skip_false_values and not d[key]:
1898 continue
1899 return d[key]
cbecc9b9
S
1900 return default
1901 return d.get(key_or_keys, default)
1902
1903
329ca3be
S
1904def try_get(src, getter, expected_type=None):
1905 try:
1906 v = getter(src)
1907 except (AttributeError, KeyError, TypeError, IndexError):
1908 pass
1909 else:
1910 if expected_type is None or isinstance(v, expected_type):
1911 return v
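A short sketch contrasting dict_get and try_get (the metadata dict is made up):

    meta = {'view_count': 0, 'like_count': None, 'uploader': {'name': 'someone'}}
    dict_get(meta, ('likes', 'like_count'), default=-1)          # -> -1 (None values are skipped)
    dict_get(meta, ('view_count',))                              # -> None (falsy 0 skipped by default)
    dict_get(meta, ('view_count',), skip_false_values=False)     # -> 0
    try_get(meta, lambda x: x['uploader']['name'], compat_str)   # -> 'someone'
    try_get(meta, lambda x: x['tags'][0])                        # -> None (KeyError is swallowed)
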
1912
1913
8e60dc75
S
1914def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
1915 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
1916
16392824 1917
a1a530b0
PH
1918US_RATINGS = {
1919 'G': 0,
1920 'PG': 10,
1921 'PG-13': 13,
1922 'R': 16,
1923 'NC': 18,
1924}
fac55558
PH
1925
1926
146c80e2
S
1927def parse_age_limit(s):
1928 if s is None:
d838b1bd 1929 return None
146c80e2 1930 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
d800609c 1931 return int(m.group('age')) if m else US_RATINGS.get(s)
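For instance (inputs illustrative):

    parse_age_limit('18+')     # -> 18
    parse_age_limit('PG-13')   # -> 13
    parse_age_limit('X')       # -> None (not a known US rating)
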
146c80e2
S
1932
1933
fac55558 1934def strip_jsonp(code):
609a61e3 1935 return re.sub(
5950cb1d 1936 r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
478c2c61
PH
1937
1938
e05f6939
PH
1939def js_to_json(code):
1940 def fix_kv(m):
e7b6d122
PH
1941 v = m.group(0)
1942 if v in ('true', 'false', 'null'):
1943 return v
bd1e4844 1944 elif v.startswith('/*') or v == ',':
1945 return ""
1946
1947 if v[0] in ("'", '"'):
1948 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 1949 '"': '\\"',
bd1e4844 1950 "\\'": "'",
1951 '\\\n': '',
1952 '\\x': '\\u00',
1953 }.get(m.group(0), m.group(0)), v[1:-1])
1954
89ac4a19 1955 INTEGER_TABLE = (
cda6d47a
S
1956 (r'^0[xX][0-9a-fA-F]+', 16),
1957 (r'^0+[0-7]+', 8),
89ac4a19
S
1958 )
1959
1960 for regex, base in INTEGER_TABLE:
1961 im = re.match(regex, v)
1962 if im:
cda6d47a 1963 i = int(im.group(0), base)
89ac4a19
S
1964 return '"%d":' % i if v.endswith(':') else '%d' % i
1965
e7b6d122 1966 return '"%s"' % v
e05f6939 1967
bd1e4844 1968 return re.sub(r'''(?sx)
1969 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
1970 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
1971 /\*.*?\*/|,(?=\s*[\]}])|
1972 [a-zA-Z_][.a-zA-Z_0-9]*|
47212f7b 1973 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
bd1e4844 1974 [0-9]+(?=\s*:)
e05f6939 1975 ''', fix_kv, code)
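A rough illustration of what fix_kv normalises - unquoted keys, single-quoted strings and hex integers (the input snippet is invented):

    import json
    json.loads(js_to_json("{abc: true, key: 'value', count: 0x10}"))
    # -> {'abc': True, 'key': 'value', 'count': 16}
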
e05f6939
PH
1976
1977
478c2c61
PH
1978def qualities(quality_ids):
1979 """ Get a numeric quality value out of a list of possible values """
1980 def q(qid):
1981 try:
1982 return quality_ids.index(qid)
1983 except ValueError:
1984 return -1
1985 return q
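Typical use, sketched with made-up format ids:

    q = qualities(['240p', '360p', '720p', '1080p'])
    q('720p')    # -> 2
    q('4320p')   # -> -1 (unknown values sort below every listed quality)
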
1986
acd69589
PH
1987
1988DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1989
a020a0dc
PH
1990
1991def limit_length(s, length):
1992 """ Add ellipses to overly long strings """
1993 if s is None:
1994 return None
1995 ELLIPSES = '...'
1996 if len(s) > length:
1997 return s[:length - len(ELLIPSES)] + ELLIPSES
1998 return s
48844745
PH
1999
2000
2001def version_tuple(v):
5f9b8394 2002 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
2003
2004
2005def is_outdated_version(version, limit, assume_new=True):
2006 if not version:
2007 return not assume_new
2008 try:
2009 return version_tuple(version) < version_tuple(limit)
2010 except ValueError:
2011 return not assume_new
732ea2f0
PH
2012
2013
2014def ytdl_is_updateable():
2015 """ Returns if youtube-dl can be updated with -U """
2016 from zipimport import zipimporter
2017
2018 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
7d4111ed
PH
2019
2020
2021def args_to_str(args):
2022 # Get a short string representation for a subprocess command
702ccf2d 2023 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
2024
2025
9b9c5355 2026def error_to_compat_str(err):
fdae2358
S
2027 err_str = str(err)
2028 # On python 2 error byte string must be decoded with proper
2029 # encoding rather than ascii
2030 if sys.version_info[0] < 3:
2031 err_str = err_str.decode(preferredencoding())
2032 return err_str
2033
2034
c460bdd5 2035def mimetype2ext(mt):
eb9ee194
S
2036 if mt is None:
2037 return None
2038
765ac263
JMF
2039 ext = {
2040 'audio/mp4': 'm4a',
6c33d24b
YCH
2041 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2042 # it's the most popular one
2043 'audio/mpeg': 'mp3',
765ac263
JMF
2044 }.get(mt)
2045 if ext is not None:
2046 return ext
2047
c460bdd5
PH
2048 _, _, res = mt.rpartition('/')
2049
2050 return {
f6861ec9 2051 '3gpp': '3gp',
cafcf657 2052 'smptett+xml': 'tt',
2053 'srt': 'srt',
2054 'ttaf+xml': 'dfxp',
a0d8d704 2055 'ttml+xml': 'ttml',
cafcf657 2056 'vtt': 'vtt',
f6861ec9 2057 'x-flv': 'flv',
a0d8d704
YCH
2058 'x-mp4-fragmented': 'mp4',
2059 'x-ms-wmv': 'wmv',
c460bdd5
PH
2060 }.get(res, res)
2061
2062
2ccd1b10 2063def urlhandle_detect_ext(url_handle):
79298173 2064 getheader = url_handle.headers.get
2ccd1b10 2065
b55ee18f
PH
2066 cd = getheader('Content-Disposition')
2067 if cd:
2068 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2069 if m:
2070 e = determine_ext(m.group('filename'), default_ext=None)
2071 if e:
2072 return e
2073
c460bdd5 2074 return mimetype2ext(getheader('Content-Type'))
05900629
PH
2075
2076
1e399778
YCH
2077def encode_data_uri(data, mime_type):
2078 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
2079
2080
05900629 2081def age_restricted(content_limit, age_limit):
6ec6cb4e 2082 """ Returns True iff the content should be blocked """
05900629
PH
2083
2084 if age_limit is None: # No limit set
2085 return False
2086 if content_limit is None:
2087 return False # Content available for everyone
2088 return age_limit < content_limit
61ca9a80
PH
2089
2090
2091def is_html(first_bytes):
2092 """ Detect whether a file contains HTML by examining its first bytes. """
2093
2094 BOMS = [
2095 (b'\xef\xbb\xbf', 'utf-8'),
2096 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2097 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2098 (b'\xff\xfe', 'utf-16-le'),
2099 (b'\xfe\xff', 'utf-16-be'),
2100 ]
2101 for bom, enc in BOMS:
2102 if first_bytes.startswith(bom):
2103 s = first_bytes[len(bom):].decode(enc, 'replace')
2104 break
2105 else:
2106 s = first_bytes.decode('utf-8', 'replace')
2107
2108 return re.match(r'^\s*<', s)
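For example (byte strings invented for illustration):

    is_html(b'\xef\xbb\xbf<!DOCTYPE html><html>')   # -> truthy (the UTF-8 BOM is stripped first)
    is_html(b'\x00\x00\x01\xba binary payload')     # -> None
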
a055469f
PH
2109
2110
2111def determine_protocol(info_dict):
2112 protocol = info_dict.get('protocol')
2113 if protocol is not None:
2114 return protocol
2115
2116 url = info_dict['url']
2117 if url.startswith('rtmp'):
2118 return 'rtmp'
2119 elif url.startswith('mms'):
2120 return 'mms'
2121 elif url.startswith('rtsp'):
2122 return 'rtsp'
2123
2124 ext = determine_ext(url)
2125 if ext == 'm3u8':
2126 return 'm3u8'
2127 elif ext == 'f4m':
2128 return 'f4m'
2129
2130 return compat_urllib_parse_urlparse(url).scheme
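Sketched behaviour on a few made-up info dicts:

    determine_protocol({'url': 'rtmp://example.com/live'})          # -> 'rtmp'
    determine_protocol({'url': 'https://example.com/index.m3u8'})   # -> 'm3u8'
    determine_protocol({'url': 'https://example.com/video.mp4'})    # -> 'https'
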
cfb56d1a
PH
2131
2132
2133def render_table(header_row, data):
2134 """ Render a list of rows, each as a list of values """
2135 table = [header_row] + data
2136 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
2137 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
2138 return '\n'.join(format_str % tuple(row) for row in table)
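A tiny sketch of the output (rows invented):

    print(render_table(['itag', 'note'], [['18', 'mp4 360p'], ['22', 'mp4 720p']]))
    # itag note
    # 18   mp4 360p
    # 22   mp4 720p
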
347de493
PH
2139
2140
2141def _match_one(filter_part, dct):
2142 COMPARISON_OPERATORS = {
2143 '<': operator.lt,
2144 '<=': operator.le,
2145 '>': operator.gt,
2146 '>=': operator.ge,
2147 '=': operator.eq,
2148 '!=': operator.ne,
2149 }
2150 operator_rex = re.compile(r'''(?x)\s*
2151 (?P<key>[a-z_]+)
2152 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2153 (?:
2154 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2155 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2156 )
2157 \s*$
2158 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2159 m = operator_rex.search(filter_part)
2160 if m:
2161 op = COMPARISON_OPERATORS[m.group('op')]
2162 if m.group('strval') is not None:
2163 if m.group('op') not in ('=', '!='):
2164 raise ValueError(
2165 'Operator %s does not support string values!' % m.group('op'))
2166 comparison_value = m.group('strval')
2167 else:
2168 try:
2169 comparison_value = int(m.group('intval'))
2170 except ValueError:
2171 comparison_value = parse_filesize(m.group('intval'))
2172 if comparison_value is None:
2173 comparison_value = parse_filesize(m.group('intval') + 'B')
2174 if comparison_value is None:
2175 raise ValueError(
2176 'Invalid integer value %r in filter part %r' % (
2177 m.group('intval'), filter_part))
2178 actual_value = dct.get(m.group('key'))
2179 if actual_value is None:
2180 return m.group('none_inclusive')
2181 return op(actual_value, comparison_value)
2182
2183 UNARY_OPERATORS = {
2184 '': lambda v: v is not None,
2185 '!': lambda v: v is None,
2186 }
2187 operator_rex = re.compile(r'''(?x)\s*
2188 (?P<op>%s)\s*(?P<key>[a-z_]+)
2189 \s*$
2190 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2191 m = operator_rex.search(filter_part)
2192 if m:
2193 op = UNARY_OPERATORS[m.group('op')]
2194 actual_value = dct.get(m.group('key'))
2195 return op(actual_value)
2196
2197 raise ValueError('Invalid filter part %r' % filter_part)
2198
2199
2200def match_str(filter_str, dct):
2201 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2202
2203 return all(
2204 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
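The filter syntax, sketched on an invented info dict:

    info = {'duration': 350, 'like_count': 1500, 'uploader': 'someone'}
    match_str('duration > 300 & like_count >= 1000', info)   # -> True
    match_str('!is_live & uploader = someone', info)          # -> True ('!' means "field is absent")
    match_str('view_count > 100', info)                       # -> False (missing field fails a plain comparison)
    match_str('view_count >? 100', info)                      # -> True ('?' lets missing fields pass)
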
2205
2206
2207def match_filter_func(filter_str):
2208 def _match_func(info_dict):
2209 if match_str(filter_str, info_dict):
2210 return None
2211 else:
2212 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2213 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2214 return _match_func
91410c9b
PH
2215
2216
bf6427d2
YCH
2217def parse_dfxp_time_expr(time_expr):
2218 if not time_expr:
d631d5f9 2219 return
bf6427d2
YCH
2220
2221 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2222 if mobj:
2223 return float(mobj.group('time_offset'))
2224
db2fe38b 2225 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 2226 if mobj:
db2fe38b 2227 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
2228
2229
c1c924ab
YCH
2230def srt_subtitles_timecode(seconds):
2231 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
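The two helpers above convert between TTML time expressions and SRT timecodes, roughly like this (values illustrative):

    parse_dfxp_time_expr('4.5s')         # -> 4.5
    parse_dfxp_time_expr('00:01:30.5')   # -> 90.5
    srt_subtitles_timecode(90.5)         # -> '00:01:30,500'
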
bf6427d2
YCH
2232
2233
2234def dfxp2srt(dfxp_data):
4e335771
YCH
2235 _x = functools.partial(xpath_with_ns, ns_map={
2236 'ttml': 'http://www.w3.org/ns/ttml',
2237 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
5bf28d78 2238 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
4e335771 2239 })
bf6427d2 2240
87de7069 2241 class TTMLPElementParser(object):
2b14cb56 2242 out = ''
bf6427d2 2243
2b14cb56 2244 def start(self, tag, attrib):
2245 if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2246 self.out += '\n'
bf6427d2 2247
2b14cb56 2248 def end(self, tag):
2249 pass
bf6427d2 2250
2b14cb56 2251 def data(self, data):
2252 self.out += data
2253
2254 def close(self):
2255 return self.out.strip()
2256
2257 def parse_node(node):
2258 target = TTMLPElementParser()
2259 parser = xml.etree.ElementTree.XMLParser(target=target)
2260 parser.feed(xml.etree.ElementTree.tostring(node))
2261 return parser.close()
bf6427d2 2262
36e6f62c 2263 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
bf6427d2 2264 out = []
5bf28d78 2265 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
1b0427e6
YCH
2266
2267 if not paras:
2268 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2
YCH
2269
2270 for para, index in zip(paras, itertools.count(1)):
d631d5f9 2271 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 2272 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
2273 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2274 if begin_time is None:
2275 continue
7dff0363 2276 if not end_time:
d631d5f9
YCH
2277 if not dur:
2278 continue
2279 end_time = begin_time + dur
bf6427d2
YCH
2280 out.append('%d\n%s --> %s\n%s\n\n' % (
2281 index,
c1c924ab
YCH
2282 srt_subtitles_timecode(begin_time),
2283 srt_subtitles_timecode(end_time),
bf6427d2
YCH
2284 parse_node(para)))
2285
2286 return ''.join(out)
2287
2288
66e289ba
S
2289def cli_option(params, command_option, param):
2290 param = params.get(param)
2291 return [command_option, param] if param is not None else []
2292
2293
2294def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2295 param = params.get(param)
2296 assert isinstance(param, bool)
2297 if separator:
2298 return [command_option + separator + (true_value if param else false_value)]
2299 return [command_option, true_value if param else false_value]
2300
2301
2302def cli_valueless_option(params, command_option, param, expected_value=True):
2303 param = params.get(param)
2304 return [command_option] if param == expected_value else []
2305
2306
2307def cli_configuration_args(params, param, default=[]):
2308 ex_args = params.get(param)
2309 if ex_args is None:
2310 return default
2311 assert isinstance(ex_args, list)
2312 return ex_args
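A sketch of how the cli_* helpers turn youtube-dl params into external-downloader arguments (the params dict and option names are invented):

    params = {'username': 'joe', 'verbose': True, 'nocheckcertificate': True}
    cli_option(params, '--user', 'username')                # -> ['--user', 'joe']
    cli_option(params, '--password', 'password')            # -> []
    cli_bool_option(params, '--ssl-verify', 'nocheckcertificate', 'false', 'true')
    # -> ['--ssl-verify', 'false']
    cli_valueless_option(params, '--verbose', 'verbose')    # -> ['--verbose']
    cli_configuration_args(params, 'external_downloader_args', ['-q'])   # -> ['-q'] (key missing, default used)
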
2313
2314
39672624
YCH
2315class ISO639Utils(object):
2316 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2317 _lang_map = {
2318 'aa': 'aar',
2319 'ab': 'abk',
2320 'ae': 'ave',
2321 'af': 'afr',
2322 'ak': 'aka',
2323 'am': 'amh',
2324 'an': 'arg',
2325 'ar': 'ara',
2326 'as': 'asm',
2327 'av': 'ava',
2328 'ay': 'aym',
2329 'az': 'aze',
2330 'ba': 'bak',
2331 'be': 'bel',
2332 'bg': 'bul',
2333 'bh': 'bih',
2334 'bi': 'bis',
2335 'bm': 'bam',
2336 'bn': 'ben',
2337 'bo': 'bod',
2338 'br': 'bre',
2339 'bs': 'bos',
2340 'ca': 'cat',
2341 'ce': 'che',
2342 'ch': 'cha',
2343 'co': 'cos',
2344 'cr': 'cre',
2345 'cs': 'ces',
2346 'cu': 'chu',
2347 'cv': 'chv',
2348 'cy': 'cym',
2349 'da': 'dan',
2350 'de': 'deu',
2351 'dv': 'div',
2352 'dz': 'dzo',
2353 'ee': 'ewe',
2354 'el': 'ell',
2355 'en': 'eng',
2356 'eo': 'epo',
2357 'es': 'spa',
2358 'et': 'est',
2359 'eu': 'eus',
2360 'fa': 'fas',
2361 'ff': 'ful',
2362 'fi': 'fin',
2363 'fj': 'fij',
2364 'fo': 'fao',
2365 'fr': 'fra',
2366 'fy': 'fry',
2367 'ga': 'gle',
2368 'gd': 'gla',
2369 'gl': 'glg',
2370 'gn': 'grn',
2371 'gu': 'guj',
2372 'gv': 'glv',
2373 'ha': 'hau',
2374 'he': 'heb',
2375 'hi': 'hin',
2376 'ho': 'hmo',
2377 'hr': 'hrv',
2378 'ht': 'hat',
2379 'hu': 'hun',
2380 'hy': 'hye',
2381 'hz': 'her',
2382 'ia': 'ina',
2383 'id': 'ind',
2384 'ie': 'ile',
2385 'ig': 'ibo',
2386 'ii': 'iii',
2387 'ik': 'ipk',
2388 'io': 'ido',
2389 'is': 'isl',
2390 'it': 'ita',
2391 'iu': 'iku',
2392 'ja': 'jpn',
2393 'jv': 'jav',
2394 'ka': 'kat',
2395 'kg': 'kon',
2396 'ki': 'kik',
2397 'kj': 'kua',
2398 'kk': 'kaz',
2399 'kl': 'kal',
2400 'km': 'khm',
2401 'kn': 'kan',
2402 'ko': 'kor',
2403 'kr': 'kau',
2404 'ks': 'kas',
2405 'ku': 'kur',
2406 'kv': 'kom',
2407 'kw': 'cor',
2408 'ky': 'kir',
2409 'la': 'lat',
2410 'lb': 'ltz',
2411 'lg': 'lug',
2412 'li': 'lim',
2413 'ln': 'lin',
2414 'lo': 'lao',
2415 'lt': 'lit',
2416 'lu': 'lub',
2417 'lv': 'lav',
2418 'mg': 'mlg',
2419 'mh': 'mah',
2420 'mi': 'mri',
2421 'mk': 'mkd',
2422 'ml': 'mal',
2423 'mn': 'mon',
2424 'mr': 'mar',
2425 'ms': 'msa',
2426 'mt': 'mlt',
2427 'my': 'mya',
2428 'na': 'nau',
2429 'nb': 'nob',
2430 'nd': 'nde',
2431 'ne': 'nep',
2432 'ng': 'ndo',
2433 'nl': 'nld',
2434 'nn': 'nno',
2435 'no': 'nor',
2436 'nr': 'nbl',
2437 'nv': 'nav',
2438 'ny': 'nya',
2439 'oc': 'oci',
2440 'oj': 'oji',
2441 'om': 'orm',
2442 'or': 'ori',
2443 'os': 'oss',
2444 'pa': 'pan',
2445 'pi': 'pli',
2446 'pl': 'pol',
2447 'ps': 'pus',
2448 'pt': 'por',
2449 'qu': 'que',
2450 'rm': 'roh',
2451 'rn': 'run',
2452 'ro': 'ron',
2453 'ru': 'rus',
2454 'rw': 'kin',
2455 'sa': 'san',
2456 'sc': 'srd',
2457 'sd': 'snd',
2458 'se': 'sme',
2459 'sg': 'sag',
2460 'si': 'sin',
2461 'sk': 'slk',
2462 'sl': 'slv',
2463 'sm': 'smo',
2464 'sn': 'sna',
2465 'so': 'som',
2466 'sq': 'sqi',
2467 'sr': 'srp',
2468 'ss': 'ssw',
2469 'st': 'sot',
2470 'su': 'sun',
2471 'sv': 'swe',
2472 'sw': 'swa',
2473 'ta': 'tam',
2474 'te': 'tel',
2475 'tg': 'tgk',
2476 'th': 'tha',
2477 'ti': 'tir',
2478 'tk': 'tuk',
2479 'tl': 'tgl',
2480 'tn': 'tsn',
2481 'to': 'ton',
2482 'tr': 'tur',
2483 'ts': 'tso',
2484 'tt': 'tat',
2485 'tw': 'twi',
2486 'ty': 'tah',
2487 'ug': 'uig',
2488 'uk': 'ukr',
2489 'ur': 'urd',
2490 'uz': 'uzb',
2491 've': 'ven',
2492 'vi': 'vie',
2493 'vo': 'vol',
2494 'wa': 'wln',
2495 'wo': 'wol',
2496 'xh': 'xho',
2497 'yi': 'yid',
2498 'yo': 'yor',
2499 'za': 'zha',
2500 'zh': 'zho',
2501 'zu': 'zul',
2502 }
2503
2504 @classmethod
2505 def short2long(cls, code):
2506 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2507 return cls._lang_map.get(code[:2])
2508
2509 @classmethod
2510 def long2short(cls, code):
2511 """Convert language code from ISO 639-2/T to ISO 639-1"""
2512 for short_name, long_name in cls._lang_map.items():
2513 if long_name == code:
2514 return short_name
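For example:

    ISO639Utils.short2long('en')    # -> 'eng'
    ISO639Utils.long2short('deu')   # -> 'de'
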
2515
2516
4eb10f66
YCH
2517class ISO3166Utils(object):
2518 # From http://data.okfn.org/data/core/country-list
2519 _country_map = {
2520 'AF': 'Afghanistan',
2521 'AX': 'Åland Islands',
2522 'AL': 'Albania',
2523 'DZ': 'Algeria',
2524 'AS': 'American Samoa',
2525 'AD': 'Andorra',
2526 'AO': 'Angola',
2527 'AI': 'Anguilla',
2528 'AQ': 'Antarctica',
2529 'AG': 'Antigua and Barbuda',
2530 'AR': 'Argentina',
2531 'AM': 'Armenia',
2532 'AW': 'Aruba',
2533 'AU': 'Australia',
2534 'AT': 'Austria',
2535 'AZ': 'Azerbaijan',
2536 'BS': 'Bahamas',
2537 'BH': 'Bahrain',
2538 'BD': 'Bangladesh',
2539 'BB': 'Barbados',
2540 'BY': 'Belarus',
2541 'BE': 'Belgium',
2542 'BZ': 'Belize',
2543 'BJ': 'Benin',
2544 'BM': 'Bermuda',
2545 'BT': 'Bhutan',
2546 'BO': 'Bolivia, Plurinational State of',
2547 'BQ': 'Bonaire, Sint Eustatius and Saba',
2548 'BA': 'Bosnia and Herzegovina',
2549 'BW': 'Botswana',
2550 'BV': 'Bouvet Island',
2551 'BR': 'Brazil',
2552 'IO': 'British Indian Ocean Territory',
2553 'BN': 'Brunei Darussalam',
2554 'BG': 'Bulgaria',
2555 'BF': 'Burkina Faso',
2556 'BI': 'Burundi',
2557 'KH': 'Cambodia',
2558 'CM': 'Cameroon',
2559 'CA': 'Canada',
2560 'CV': 'Cape Verde',
2561 'KY': 'Cayman Islands',
2562 'CF': 'Central African Republic',
2563 'TD': 'Chad',
2564 'CL': 'Chile',
2565 'CN': 'China',
2566 'CX': 'Christmas Island',
2567 'CC': 'Cocos (Keeling) Islands',
2568 'CO': 'Colombia',
2569 'KM': 'Comoros',
2570 'CG': 'Congo',
2571 'CD': 'Congo, the Democratic Republic of the',
2572 'CK': 'Cook Islands',
2573 'CR': 'Costa Rica',
2574 'CI': 'Côte d\'Ivoire',
2575 'HR': 'Croatia',
2576 'CU': 'Cuba',
2577 'CW': 'Curaçao',
2578 'CY': 'Cyprus',
2579 'CZ': 'Czech Republic',
2580 'DK': 'Denmark',
2581 'DJ': 'Djibouti',
2582 'DM': 'Dominica',
2583 'DO': 'Dominican Republic',
2584 'EC': 'Ecuador',
2585 'EG': 'Egypt',
2586 'SV': 'El Salvador',
2587 'GQ': 'Equatorial Guinea',
2588 'ER': 'Eritrea',
2589 'EE': 'Estonia',
2590 'ET': 'Ethiopia',
2591 'FK': 'Falkland Islands (Malvinas)',
2592 'FO': 'Faroe Islands',
2593 'FJ': 'Fiji',
2594 'FI': 'Finland',
2595 'FR': 'France',
2596 'GF': 'French Guiana',
2597 'PF': 'French Polynesia',
2598 'TF': 'French Southern Territories',
2599 'GA': 'Gabon',
2600 'GM': 'Gambia',
2601 'GE': 'Georgia',
2602 'DE': 'Germany',
2603 'GH': 'Ghana',
2604 'GI': 'Gibraltar',
2605 'GR': 'Greece',
2606 'GL': 'Greenland',
2607 'GD': 'Grenada',
2608 'GP': 'Guadeloupe',
2609 'GU': 'Guam',
2610 'GT': 'Guatemala',
2611 'GG': 'Guernsey',
2612 'GN': 'Guinea',
2613 'GW': 'Guinea-Bissau',
2614 'GY': 'Guyana',
2615 'HT': 'Haiti',
2616 'HM': 'Heard Island and McDonald Islands',
2617 'VA': 'Holy See (Vatican City State)',
2618 'HN': 'Honduras',
2619 'HK': 'Hong Kong',
2620 'HU': 'Hungary',
2621 'IS': 'Iceland',
2622 'IN': 'India',
2623 'ID': 'Indonesia',
2624 'IR': 'Iran, Islamic Republic of',
2625 'IQ': 'Iraq',
2626 'IE': 'Ireland',
2627 'IM': 'Isle of Man',
2628 'IL': 'Israel',
2629 'IT': 'Italy',
2630 'JM': 'Jamaica',
2631 'JP': 'Japan',
2632 'JE': 'Jersey',
2633 'JO': 'Jordan',
2634 'KZ': 'Kazakhstan',
2635 'KE': 'Kenya',
2636 'KI': 'Kiribati',
2637 'KP': 'Korea, Democratic People\'s Republic of',
2638 'KR': 'Korea, Republic of',
2639 'KW': 'Kuwait',
2640 'KG': 'Kyrgyzstan',
2641 'LA': 'Lao People\'s Democratic Republic',
2642 'LV': 'Latvia',
2643 'LB': 'Lebanon',
2644 'LS': 'Lesotho',
2645 'LR': 'Liberia',
2646 'LY': 'Libya',
2647 'LI': 'Liechtenstein',
2648 'LT': 'Lithuania',
2649 'LU': 'Luxembourg',
2650 'MO': 'Macao',
2651 'MK': 'Macedonia, the Former Yugoslav Republic of',
2652 'MG': 'Madagascar',
2653 'MW': 'Malawi',
2654 'MY': 'Malaysia',
2655 'MV': 'Maldives',
2656 'ML': 'Mali',
2657 'MT': 'Malta',
2658 'MH': 'Marshall Islands',
2659 'MQ': 'Martinique',
2660 'MR': 'Mauritania',
2661 'MU': 'Mauritius',
2662 'YT': 'Mayotte',
2663 'MX': 'Mexico',
2664 'FM': 'Micronesia, Federated States of',
2665 'MD': 'Moldova, Republic of',
2666 'MC': 'Monaco',
2667 'MN': 'Mongolia',
2668 'ME': 'Montenegro',
2669 'MS': 'Montserrat',
2670 'MA': 'Morocco',
2671 'MZ': 'Mozambique',
2672 'MM': 'Myanmar',
2673 'NA': 'Namibia',
2674 'NR': 'Nauru',
2675 'NP': 'Nepal',
2676 'NL': 'Netherlands',
2677 'NC': 'New Caledonia',
2678 'NZ': 'New Zealand',
2679 'NI': 'Nicaragua',
2680 'NE': 'Niger',
2681 'NG': 'Nigeria',
2682 'NU': 'Niue',
2683 'NF': 'Norfolk Island',
2684 'MP': 'Northern Mariana Islands',
2685 'NO': 'Norway',
2686 'OM': 'Oman',
2687 'PK': 'Pakistan',
2688 'PW': 'Palau',
2689 'PS': 'Palestine, State of',
2690 'PA': 'Panama',
2691 'PG': 'Papua New Guinea',
2692 'PY': 'Paraguay',
2693 'PE': 'Peru',
2694 'PH': 'Philippines',
2695 'PN': 'Pitcairn',
2696 'PL': 'Poland',
2697 'PT': 'Portugal',
2698 'PR': 'Puerto Rico',
2699 'QA': 'Qatar',
2700 'RE': 'Réunion',
2701 'RO': 'Romania',
2702 'RU': 'Russian Federation',
2703 'RW': 'Rwanda',
2704 'BL': 'Saint Barthélemy',
2705 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2706 'KN': 'Saint Kitts and Nevis',
2707 'LC': 'Saint Lucia',
2708 'MF': 'Saint Martin (French part)',
2709 'PM': 'Saint Pierre and Miquelon',
2710 'VC': 'Saint Vincent and the Grenadines',
2711 'WS': 'Samoa',
2712 'SM': 'San Marino',
2713 'ST': 'Sao Tome and Principe',
2714 'SA': 'Saudi Arabia',
2715 'SN': 'Senegal',
2716 'RS': 'Serbia',
2717 'SC': 'Seychelles',
2718 'SL': 'Sierra Leone',
2719 'SG': 'Singapore',
2720 'SX': 'Sint Maarten (Dutch part)',
2721 'SK': 'Slovakia',
2722 'SI': 'Slovenia',
2723 'SB': 'Solomon Islands',
2724 'SO': 'Somalia',
2725 'ZA': 'South Africa',
2726 'GS': 'South Georgia and the South Sandwich Islands',
2727 'SS': 'South Sudan',
2728 'ES': 'Spain',
2729 'LK': 'Sri Lanka',
2730 'SD': 'Sudan',
2731 'SR': 'Suriname',
2732 'SJ': 'Svalbard and Jan Mayen',
2733 'SZ': 'Swaziland',
2734 'SE': 'Sweden',
2735 'CH': 'Switzerland',
2736 'SY': 'Syrian Arab Republic',
2737 'TW': 'Taiwan, Province of China',
2738 'TJ': 'Tajikistan',
2739 'TZ': 'Tanzania, United Republic of',
2740 'TH': 'Thailand',
2741 'TL': 'Timor-Leste',
2742 'TG': 'Togo',
2743 'TK': 'Tokelau',
2744 'TO': 'Tonga',
2745 'TT': 'Trinidad and Tobago',
2746 'TN': 'Tunisia',
2747 'TR': 'Turkey',
2748 'TM': 'Turkmenistan',
2749 'TC': 'Turks and Caicos Islands',
2750 'TV': 'Tuvalu',
2751 'UG': 'Uganda',
2752 'UA': 'Ukraine',
2753 'AE': 'United Arab Emirates',
2754 'GB': 'United Kingdom',
2755 'US': 'United States',
2756 'UM': 'United States Minor Outlying Islands',
2757 'UY': 'Uruguay',
2758 'UZ': 'Uzbekistan',
2759 'VU': 'Vanuatu',
2760 'VE': 'Venezuela, Bolivarian Republic of',
2761 'VN': 'Viet Nam',
2762 'VG': 'Virgin Islands, British',
2763 'VI': 'Virgin Islands, U.S.',
2764 'WF': 'Wallis and Futuna',
2765 'EH': 'Western Sahara',
2766 'YE': 'Yemen',
2767 'ZM': 'Zambia',
2768 'ZW': 'Zimbabwe',
2769 }
2770
2771 @classmethod
2772 def short2full(cls, code):
2773 """Convert an ISO 3166-2 country code to the corresponding full name"""
2774 return cls._country_map.get(code.upper())
2775
2776
91410c9b 2777class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2461f79d
PH
2778 def __init__(self, proxies=None):
2779 # Set default handlers
2780 for type in ('http', 'https'):
2781 setattr(self, '%s_open' % type,
2782 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
2783 meth(r, proxy, type))
2784 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2785
91410c9b 2786 def proxy_open(self, req, proxy, type):
2461f79d 2787 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
2788 if req_proxy is not None:
2789 proxy = req_proxy
2461f79d
PH
2790 del req.headers['Ytdl-request-proxy']
2791
2792 if proxy == '__noproxy__':
2793 return None # No Proxy
51fb4995 2794 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188
YCH
2795 req.add_header('Ytdl-socks-proxy', proxy)
2796 # youtube-dl's http/https handlers take care of wrapping the socket with SOCKS
2797 return None
91410c9b
PH
2798 return compat_urllib_request.ProxyHandler.proxy_open(
2799 self, req, proxy, type)
5bc880b9
YCH
2800
2801
2802def ohdave_rsa_encrypt(data, exponent, modulus):
2803 '''
2804 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
2805
2806 Input:
2807 data: data to encrypt, bytes-like object
2808 exponent, modulus: parameter e and N of RSA algorithm, both integer
2809 Output: hex string of encrypted data
2810
2811 Limitation: supports one block encryption only
2812 '''
2813
2814 payload = int(binascii.hexlify(data[::-1]), 16)
2815 encrypted = pow(payload, exponent, modulus)
2816 return '%x' % encrypted
81bdc8fd
YCH
2817
2818
5eb6bdce 2819def encode_base_n(num, n, table=None):
59f898b7 2820 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
59f898b7
YCH
2821 if not table:
2822 table = FULL_TABLE[:n]
2823
5eb6bdce
YCH
2824 if n > len(table):
2825 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
2826
2827 if num == 0:
2828 return table[0]
2829
81bdc8fd
YCH
2830 ret = ''
2831 while num:
2832 ret = table[num % n] + ret
2833 num = num // n
2834 return ret
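A couple of illustrative conversions:

    encode_base_n(255, 16)   # -> 'ff'
    encode_base_n(62, 62)    # -> '10'
    encode_base_n(0, 36)     # -> '0'
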
f52354a8
YCH
2835
2836
2837def decode_packed_codes(code):
2838 mobj = re.search(
680079be 2839 r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
f52354a8
YCH
2840 code)
2841 obfuscated_code, base, count, symbols = mobj.groups()
2842 base = int(base)
2843 count = int(count)
2844 symbols = symbols.split('|')
2845 symbol_table = {}
2846
2847 while count:
2848 count -= 1
5eb6bdce 2849 base_n_count = encode_base_n(count, base)
f52354a8
YCH
2850 symbol_table[base_n_count] = symbols[count] or base_n_count
2851
2852 return re.sub(
2853 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
2854 obfuscated_code)
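A toy example of the P.A.C.K.E.R.-style input this unpacks (the snippet is invented and heavily shortened):

    packed = "eval(function(p,a,c,k,e,d){...}('0 1',2,2,'hello|world'.split('|')))"
    decode_packed_codes(packed)   # -> 'hello world'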