#!/usr/bin/env python
# coding: utf-8

from __future__ import unicode_literals

import base64
import binascii
import calendar
import codecs
import contextlib
import ctypes
import datetime
import email.utils
import errno
import functools
import gzip
import io
import itertools
import json
import locale
import math
import operator
import os
import pipes
import platform
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import traceback
import xml.etree.ElementTree
import zlib

from .compat import (
    compat_HTMLParser,
    compat_basestring,
    compat_chr,
    compat_etree_fromstring,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_kwargs,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_quote,
    compat_socket_create_connection,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%b %d %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"


def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


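# Illustrative usage sketch (not part of the original module; the file name is
# made up): the temporary-file-plus-rename dance above means a crash mid-write
# cannot leave a half-written JSON file behind.
def _example_write_json_file():
    settings = {'format': 'bestaudio', 'retries': 3}
    write_json_file(settings, 'settings.json')  # hypothetical file name
    with io.open('settings.json', encoding='utf-8') as f:
        assert json.load(f) == settings

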
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)
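
# Brief sketch of how xpath_with_ns is meant to be used (the namespace map and
# path below are made-up examples): 'prefix:tag' steps are expanded into the
# '{uri}tag' form that ElementTree expects.
def _example_xpath_with_ns():
    ns_map = {'media': 'http://search.yahoo.com/mrss/'}
    path = xpath_with_ns('media:group/media:title', ns_map)
    assert path == '{http://search.yahoo.com/mrss/}group/{http://search.yahoo.com/mrss/}title'
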

def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    return get_element_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    value = re.escape(value) if escape_value else value

    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html)

    if not m:
        return None
    res = m.group('content')

    if res.startswith('"') or res.startswith("'"):
        res = res[1:-1]

    return unescapeHTML(res)


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    parser.feed(html_element)
    parser.close()
    return parser.attrs
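
# Illustrative example (the tag below is invented): extract_attributes parses
# a single start tag and returns its attributes; valueless attributes map to
# None, as described in the docstring above.
def _example_extract_attributes():
    attrs = extract_attributes('<video src="/clip.mp4" data-id="42" autoplay>')
    assert attrs == {'src': '/clip.mp4', 'data-id': '42', 'autoplay': None}
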

def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result


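# Quick illustration with made-up titles: in restricted mode characters that
# are unsafe on common filesystems are mapped to '_' / '_-', while the default
# mode keeps the name readable.
def _example_sanitize_filename():
    assert sanitize_filename('Foo: Bar?') == 'Foo - Bar'
    assert sanitize_filename('Foo: Bar?', restricted=True) == 'Foo_-_Bar'

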
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)


# Prepend protocol-less URLs with `http:` scheme in order to reduce the number
# of unwanted failures due to a missing protocol
def sanitize_url(url):
    return 'http:%s' % url if url.startswith('//') else url


def sanitized_Request(url, *args, **kwargs):
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)


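# Tiny usage sketch (strings invented): named, decimal and hexadecimal
# entities are all resolved through _htmlentity_transform above.
def _example_unescapeHTML():
    assert unescapeHTML('Tom &amp; Jerry') == 'Tom & Jerry'
    assert unescapeHTML('&#39;quoted&#39;') == "'quoted'"
    assert unescapeHTML('&#x2F;') == '/'

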
def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


def formatSeconds(secs):
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs


def make_HTTPS_handler(params, **kwargs):
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message():
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg


class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        self.msg = msg


class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(Exception):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(Exception):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


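# Minimal illustration (the header dict is a made-up example): the internal
# 'Youtubedl-no-compression' marker is translated into "drop Accept-Encoding"
# and removed before the request goes out on the wire.
def _example_handle_youtubedl_headers():
    headers = {'Accept-Encoding': 'gzip, deflate', 'Youtubedl-no-compression': 'True'}
    assert handle_youtubedl_headers(headers) == {}

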
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters; however, this is not
        # always respected by websites: some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around the aforementioned issue we will replace the request's original URL with
        # a percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on the next HTTP request in a row if there are non-ASCII
        # characters in the Set-Cookie HTTP header of the last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode the Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


def extract_timezone(date_str):
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        pass


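# Worked example (timestamps chosen by hand): both forms below denote the same
# instant, 2014-05-23 10:11:12 UTC, i.e. 1400839872 seconds since the epoch.
def _example_parse_iso8601():
    assert parse_iso8601('2014-05-23T10:11:12Z') == 1400839872
    assert parse_iso8601('2014-05-23T12:11:12+02:00') == 1400839872

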
def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)


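# Small sketch (dates invented): whatever the input format, the return value
# is the compact 'YYYYMMDD' string used throughout the extractors.
def _example_unified_strdate():
    assert unified_strdate('December 21, 2010') == '20101221'
    assert unified_strdate('8/7/2009', day_first=False) == '20090807'

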
def unified_timestamp(date_str, day_first=True):
    if date_str is None:
        return None

    date_str = date_str.replace(',', ' ')

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600


def determine_ext(url, default_ext='unknown_video'):
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext


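# Illustrative calls (URLs invented): the query string is ignored, and when no
# recognizable extension is found the default is returned.
def _example_determine_ext():
    assert determine_ext('http://example.com/video.mp4?dl=1') == 'mp4'
    assert determine_ext('http://example.com/video') == 'unknown_video'

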
def subtitles_filename(filename, sub_lang, sub_format):
    return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format


def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str


class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())


def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res


def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True


def write_string(s, out=None, encoding=None):
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()


def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)


# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, lack fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)


class locked_file(object):
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)


def get_filesystem_encoding():
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'


def shell_quote(args):
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)


def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data


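# Round-trip sketch (URL and payload invented): extractor-internal data rides
# along in the URL fragment as JSON and is recovered by unsmuggle_url.
def _example_smuggle_url():
    url = smuggle_url('http://example.com/watch', {'referrer': 'embed'})
    plain, data = unsmuggle_url(url)
    assert plain == 'http://example.com/watch'
    assert data == {'referrer': 'embed'}

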
def format_bytes(bytes):
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)


def lookup_unit_table(unit_table, s):
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)


def parse_filesize(s):
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)


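# A couple of worked conversions (inputs invented): decimal and binary
# prefixes are both understood, per the unit table above.
def _example_parse_filesize():
    assert parse_filesize('5 MB') == 5 * 1000 ** 2
    assert parse_filesize('1.5 GiB') == int(1.5 * 1024 ** 3)

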
1601def parse_count(s):
1602 if s is None:
be64b5b0
PH
1603 return None
1604
fb47597b
S
1605 s = s.strip()
1606
1607 if re.match(r'^[\d,.]+$', s):
1608 return str_to_int(s)
1609
1610 _UNIT_TABLE = {
1611 'k': 1000,
1612 'K': 1000,
1613 'm': 1000 ** 2,
1614 'M': 1000 ** 2,
1615 'kk': 1000 ** 2,
1616 'KK': 1000 ** 2,
1617 }
be64b5b0 1618
fb47597b 1619 return lookup_unit_table(_UNIT_TABLE, s)
be64b5b0 1620
2f7ae819 1621
a942d6cb 1622def month_by_name(name, lang='en'):
caefb1de
PH
1623 """ Return the number of a month by (locale-independently) English name """
1624
f6717dec 1625 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 1626
caefb1de 1627 try:
f6717dec 1628 return month_names.index(name) + 1
7105440c
YCH
1629 except ValueError:
1630 return None
1631
1632
1633def month_by_abbreviation(abbrev):
1634 """ Return the number of a month by (locale-independently) English
1635 abbreviations """
1636
1637 try:
1638 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
1639 except ValueError:
1640 return None
18258362
JMF
1641
1642
5aafe895 1643def fix_xml_ampersands(xml_str):
18258362 1644 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
1645 return re.sub(
1646 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 1647 '&amp;',
5aafe895 1648 xml_str)
e3946f98
PH
1649
1650
1651def setproctitle(title):
8bf48f23 1652 assert isinstance(title, compat_str)
c1c05c67
YCH
1653
1654 # ctypes in Jython is not complete
1655 # http://bugs.jython.org/issue2148
1656 if sys.platform.startswith('java'):
1657 return
1658
e3946f98 1659 try:
611c1dd9 1660 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
1661 except OSError:
1662 return
6eefe533
PH
1663 title_bytes = title.encode('utf-8')
1664 buf = ctypes.create_string_buffer(len(title_bytes))
1665 buf.value = title_bytes
e3946f98 1666 try:
6eefe533 1667 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
1668 except AttributeError:
1669 return # Strange libc, just skip this
d7dda168
PH
1670
1671
1672def remove_start(s, start):
46bc9b7d 1673 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
1674
1675
2b9faf55 1676def remove_end(s, end):
46bc9b7d 1677 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
1678
1679
31b2051e
S
1680def remove_quotes(s):
1681 if s is None or len(s) < 2:
1682 return s
1683 for quote in ('"', "'", ):
1684 if s[0] == quote and s[-1] == quote:
1685 return s[1:-1]
1686 return s
1687
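# [Editorial note] Illustrative usage sketch, not part of the original file:
assert remove_start('www.example.com', 'www.') == 'example.com'
assert remove_start('example.com', 'www.') == 'example.com'   # no-op if the prefix is absent
assert remove_end('video.mp4', '.mp4') == 'video'
assert remove_quotes('"quoted"') == 'quoted'
assert remove_quotes('unquoted') == 'unquoted'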
1688
29eb5174 1689def url_basename(url):
9b8aaeed 1690 path = compat_urlparse.urlparse(url).path
28e614de 1691 return path.strip('/').split('/')[-1]
aa94a6d3
PH
1692
1693
1694class HEADRequest(compat_urllib_request.Request):
1695 def get_method(self):
611c1dd9 1696 return 'HEAD'
7217e148
PH
1697
1698
95cf60e8
S
1699class PUTRequest(compat_urllib_request.Request):
1700 def get_method(self):
1701 return 'PUT'
1702
1703
9732d77e 1704def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
28746fbd
PH
1705 if get_attr:
1706 if v is not None:
1707 v = getattr(v, get_attr, None)
9572013d
PH
1708 if v == '':
1709 v = None
1812afb7
S
1710 if v is None:
1711 return default
1712 try:
1713 return int(v) * invscale // scale
1714 except ValueError:
af98f8ff 1715 return default
9732d77e 1716
9572013d 1717
40a90862
JMF
1718def str_or_none(v, default=None):
1719 return default if v is None else compat_str(v)
1720
9732d77e
PH
1721
1722def str_to_int(int_str):
48d4681e 1723 """ A more relaxed version of int_or_none """
9732d77e
PH
1724 if int_str is None:
1725 return None
28e614de 1726 int_str = re.sub(r'[,\.\+]', '', int_str)
9732d77e 1727 return int(int_str)
608d11f5
PH
1728
1729
9732d77e 1730def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
1731 if v is None:
1732 return default
1733 try:
1734 return float(v) * invscale / scale
1735 except ValueError:
1736 return default
43f775e4
PH
1737
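# [Editorial note] Illustrative usage sketch, not part of the original file:
assert int_or_none('42') == 42
assert int_or_none('') is None                 # empty strings are treated as missing
assert int_or_none(2000, scale=1000) == 2
assert str_to_int('123,456.789') == 123456789  # separators are simply dropped
assert float_or_none('1.5') == 1.5
assert float_or_none('abc', default=0.0) == 0.0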
1738
b72b4431
S
1739def strip_or_none(v):
1740 return None if v is None else v.strip()
1741
1742
608d11f5 1743def parse_duration(s):
8f9312c3 1744 if not isinstance(s, compat_basestring):
608d11f5
PH
1745 return None
1746
ca7b3246
S
1747 s = s.strip()
1748
acaff495 1749 days, hours, mins, secs, ms = [None] * 5
1750 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
1751 if m:
1752 days, hours, mins, secs, ms = m.groups()
1753 else:
1754 m = re.match(
1755 r'''(?ix)(?:P?T)?
8f4b58d7 1756 (?:
acaff495 1757 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
8f4b58d7 1758 )?
acaff495 1759 (?:
1760 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1761 )?
1762 (?:
1763 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1764 )?
1765 (?:
1766 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1767 )?$''', s)
1768 if m:
1769 days, hours, mins, secs, ms = m.groups()
1770 else:
1771 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
1772 if m:
1773 hours, mins = m.groups()
1774 else:
1775 return None
1776
1777 duration = 0
1778 if secs:
1779 duration += float(secs)
1780 if mins:
1781 duration += float(mins) * 60
1782 if hours:
1783 duration += float(hours) * 60 * 60
1784 if days:
1785 duration += float(days) * 24 * 60 * 60
1786 if ms:
1787 duration += float(ms)
1788 return duration
91d7d0b3
JMF
1789
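# [Editorial note] Illustrative usage sketch, not part of the original file:
assert parse_duration('1:23:45') == 5025   # H:MM:SS
assert parse_duration('9:05') == 545       # MM:SS
assert parse_duration('3 min') == 180      # free-form unit suffixes
assert parse_duration(None) is None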
1790
e65e4c88 1791def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 1792 name, real_ext = os.path.splitext(filename)
e65e4c88
S
1793 return (
1794 '{0}.{1}{2}'.format(name, ext, real_ext)
1795 if not expected_real_ext or real_ext[1:] == expected_real_ext
1796 else '{0}.{1}'.format(filename, ext))
d70ad093
PH
1797
1798
b3ed15b7
S
1799def replace_extension(filename, ext, expected_real_ext=None):
1800 name, real_ext = os.path.splitext(filename)
1801 return '{0}.{1}'.format(
1802 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1803 ext)
1804
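# [Editorial note] Illustrative usage sketch, not part of the original file:
assert prepend_extension('video.mp4', 'temp') == 'video.temp.mp4'
assert prepend_extension('video.unexpected', 'temp', expected_real_ext='mp4') == 'video.unexpected.temp'
assert replace_extension('video.mp4', 'mkv') == 'video.mkv'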
1805
d70ad093
PH
1806def check_executable(exe, args=[]):
1807 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1808 args can be a list of arguments for a short output (like -version) """
1809 try:
1810 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1811 except OSError:
1812 return False
1813 return exe
b7ab0590
PH
1814
1815
95807118 1816def get_exe_version(exe, args=['--version'],
cae97f65 1817 version_re=None, unrecognized='present'):
95807118
PH
1818 """ Returns the version of the specified executable,
1819 or False if the executable is not present """
1820 try:
b64d04c1
YCH
1821 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1822 # SIGTTOU if youtube-dl is run in the background.
1823 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
cae97f65 1824 out, _ = subprocess.Popen(
54116803 1825 [encodeArgument(exe)] + args,
00ca7552 1826 stdin=subprocess.PIPE,
95807118
PH
1827 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1828 except OSError:
1829 return False
cae97f65
PH
1830 if isinstance(out, bytes): # Python 2.x
1831 out = out.decode('ascii', 'ignore')
1832 return detect_exe_version(out, version_re, unrecognized)
1833
1834
1835def detect_exe_version(output, version_re=None, unrecognized='present'):
1836 assert isinstance(output, compat_str)
1837 if version_re is None:
1838 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1839 m = re.search(version_re, output)
95807118
PH
1840 if m:
1841 return m.group(1)
1842 else:
1843 return unrecognized
1844
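# [Editorial note] Illustrative sketch for detect_exe_version(), not part of
# the original file; the default regex picks up the token after 'version':
assert detect_exe_version('ffmpeg version 2.8.4 Copyright (c) 2000-2015') == '2.8.4'
assert detect_exe_version('garbled output') == 'present'   # fallback when nothing is recognized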
1845
b7ab0590 1846class PagedList(object):
dd26ced1
PH
1847 def __len__(self):
1848 # This is only useful for tests
1849 return len(self.getslice())
1850
9c44d242
PH
1851
1852class OnDemandPagedList(PagedList):
b95dc034 1853 def __init__(self, pagefunc, pagesize, use_cache=False):
9c44d242
PH
1854 self._pagefunc = pagefunc
1855 self._pagesize = pagesize
b95dc034
YCH
1856 self._use_cache = use_cache
1857 if use_cache:
1858 self._cache = {}
9c44d242 1859
b7ab0590
PH
1860 def getslice(self, start=0, end=None):
1861 res = []
1862 for pagenum in itertools.count(start // self._pagesize):
1863 firstid = pagenum * self._pagesize
1864 nextfirstid = pagenum * self._pagesize + self._pagesize
1865 if start >= nextfirstid:
1866 continue
1867
b95dc034
YCH
1868 page_results = None
1869 if self._use_cache:
1870 page_results = self._cache.get(pagenum)
1871 if page_results is None:
1872 page_results = list(self._pagefunc(pagenum))
1873 if self._use_cache:
1874 self._cache[pagenum] = page_results
b7ab0590
PH
1875
1876 startv = (
1877 start % self._pagesize
1878 if firstid <= start < nextfirstid
1879 else 0)
1880
1881 endv = (
1882 ((end - 1) % self._pagesize) + 1
1883 if (end is not None and firstid <= end <= nextfirstid)
1884 else None)
1885
1886 if startv != 0 or endv is not None:
1887 page_results = page_results[startv:endv]
1888 res.extend(page_results)
1889
1890 # A little optimization - if the current page is not "full", i.e. does
1891 # not contain page_size videos, then we can assume that this page
1892 # is the last one - there are no more ids on further pages -
1893 # i.e. no need to query again.
1894 if len(page_results) + startv < self._pagesize:
1895 break
1896
1897 # If we got the whole page, but the next page is not interesting,
1898 # break out early as well
1899 if end == nextfirstid:
1900 break
1901 return res
81c2f20b
PH
1902
1903
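# [Editorial note] Illustrative sketch, not part of the original file: a fake
# page function serving 10 items per page, sliced across a page boundary.
def _example_page(pagenum):
    return list(range(pagenum * 10, (pagenum + 1) * 10))

assert OnDemandPagedList(_example_page, 10).getslice(5, 15) == list(range(5, 15))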
9c44d242
PH
1904class InAdvancePagedList(PagedList):
1905 def __init__(self, pagefunc, pagecount, pagesize):
1906 self._pagefunc = pagefunc
1907 self._pagecount = pagecount
1908 self._pagesize = pagesize
1909
1910 def getslice(self, start=0, end=None):
1911 res = []
1912 start_page = start // self._pagesize
1913 end_page = (
1914 self._pagecount if end is None else (end // self._pagesize + 1))
1915 skip_elems = start - start_page * self._pagesize
1916 only_more = None if end is None else end - start
1917 for pagenum in range(start_page, end_page):
1918 page = list(self._pagefunc(pagenum))
1919 if skip_elems:
1920 page = page[skip_elems:]
1921 skip_elems = None
1922 if only_more is not None:
1923 if len(page) < only_more:
1924 only_more -= len(page)
1925 else:
1926 page = page[:only_more]
1927 res.extend(page)
1928 break
1929 res.extend(page)
1930 return res
1931
1932
81c2f20b 1933def uppercase_escape(s):
676eb3f2 1934 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 1935 return re.sub(
a612753d 1936 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
1937 lambda m: unicode_escape(m.group(0))[0],
1938 s)
0fe2ff78
YCH
1939
1940
1941def lowercase_escape(s):
1942 unicode_escape = codecs.getdecoder('unicode_escape')
1943 return re.sub(
1944 r'\\u[0-9a-fA-F]{4}',
1945 lambda m: unicode_escape(m.group(0))[0],
1946 s)
b53466e1 1947
d05cfe06
S
1948
1949def escape_rfc3986(s):
1950 """Escape non-ASCII characters as suggested by RFC 3986"""
8f9312c3 1951 if sys.version_info < (3, 0) and isinstance(s, compat_str):
d05cfe06 1952 s = s.encode('utf-8')
ecc0c5ee 1953 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1954
1955
1956def escape_url(url):
1957 """Escape URL as suggested by RFC 3986"""
1958 url_parsed = compat_urllib_parse_urlparse(url)
1959 return url_parsed._replace(
efbed08d 1960 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
1961 path=escape_rfc3986(url_parsed.path),
1962 params=escape_rfc3986(url_parsed.params),
1963 query=escape_rfc3986(url_parsed.query),
1964 fragment=escape_rfc3986(url_parsed.fragment)
1965 ).geturl()
1966
62e609ab
PH
1967
1968def read_batch_urls(batch_fd):
1969 def fixup(url):
1970 if not isinstance(url, compat_str):
1971 url = url.decode('utf-8', 'replace')
28e614de 1972 BOM_UTF8 = '\xef\xbb\xbf'
62e609ab
PH
1973 if url.startswith(BOM_UTF8):
1974 url = url[len(BOM_UTF8):]
1975 url = url.strip()
1976 if url.startswith(('#', ';', ']')):
1977 return False
1978 return url
1979
1980 with contextlib.closing(batch_fd) as fd:
1981 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1982
1983
1984def urlencode_postdata(*args, **kargs):
15707c7e 1985 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
1986
1987
38f9ef31 1988def update_url_query(url, query):
cacd9966
YCH
1989 if not query:
1990 return url
38f9ef31 1991 parsed_url = compat_urlparse.urlparse(url)
1992 qs = compat_parse_qs(parsed_url.query)
1993 qs.update(query)
1994 return compat_urlparse.urlunparse(parsed_url._replace(
15707c7e 1995 query=compat_urllib_parse_urlencode(qs, True)))
16392824 1996
8e60dc75 1997
ed0291d1
S
1998def update_Request(req, url=None, data=None, headers={}, query={}):
1999 req_headers = req.headers.copy()
2000 req_headers.update(headers)
2001 req_data = data or req.data
2002 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
2003 req_get_method = req.get_method()
2004 if req_get_method == 'HEAD':
2005 req_type = HEADRequest
2006 elif req_get_method == 'PUT':
2007 req_type = PUTRequest
2008 else:
2009 req_type = compat_urllib_request.Request
ed0291d1
S
2010 new_req = req_type(
2011 req_url, data=req_data, headers=req_headers,
2012 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2013 if hasattr(req, 'timeout'):
2014 new_req.timeout = req.timeout
2015 return new_req
2016
2017
86296ad2 2018def dict_get(d, key_or_keys, default=None, skip_false_values=True):
cbecc9b9
S
2019 if isinstance(key_or_keys, (list, tuple)):
2020 for key in key_or_keys:
86296ad2
S
2021 if key not in d or d[key] is None or skip_false_values and not d[key]:
2022 continue
2023 return d[key]
cbecc9b9
S
2024 return default
2025 return d.get(key_or_keys, default)
2026
2027
329ca3be
S
2028def try_get(src, getter, expected_type=None):
2029 try:
2030 v = getter(src)
2031 except (AttributeError, KeyError, TypeError, IndexError):
2032 pass
2033 else:
2034 if expected_type is None or isinstance(v, expected_type):
2035 return v
2036
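# [Editorial note] Illustrative usage sketch, not part of the original file:
_example_meta = {'title': '', 'alt_title': 'Fallback', 'count': None}
assert dict_get(_example_meta, ('title', 'alt_title')) == 'Fallback'  # '' is skipped by default
assert dict_get(_example_meta, ('count', 'missing'), default=0) == 0
assert try_get(_example_meta, lambda x: x['title'][0]) is None        # IndexError is swallowed
assert try_get({'a': [{'b': 5}]}, lambda x: x['a'][0]['b'], int) == 5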
2037
8e60dc75
S
2038def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2039 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2040
16392824 2041
a1a530b0
PH
2042US_RATINGS = {
2043 'G': 0,
2044 'PG': 10,
2045 'PG-13': 13,
2046 'R': 16,
2047 'NC': 18,
2048}
fac55558
PH
2049
2050
a8795327
S
2051TV_PARENTAL_GUIDELINES = {
2052 'TV-Y': 0,
2053 'TV-Y7': 7,
2054 'TV-G': 0,
2055 'TV-PG': 0,
2056 'TV-14': 14,
2057 'TV-MA': 17,
2058}
2059
2060
146c80e2 2061def parse_age_limit(s):
a8795327
S
2062 if type(s) == int:
2063 return s if 0 <= s <= 21 else None
2064 if not isinstance(s, compat_basestring):
d838b1bd 2065 return None
146c80e2 2066 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
2067 if m:
2068 return int(m.group('age'))
2069 if s in US_RATINGS:
2070 return US_RATINGS[s]
2071 return TV_PARENTAL_GUIDELINES.get(s)
146c80e2
S
2072
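# [Editorial note] Illustrative usage sketch, not part of the original file:
assert parse_age_limit(0) == 0
assert parse_age_limit('18+') == 18
assert parse_age_limit('PG-13') == 13
assert parse_age_limit('TV-MA') == 17
assert parse_age_limit('not a rating') is None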
2073
fac55558 2074def strip_jsonp(code):
609a61e3 2075 return re.sub(
5950cb1d 2076 r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
478c2c61
PH
2077
2078
e05f6939
PH
2079def js_to_json(code):
2080 def fix_kv(m):
e7b6d122
PH
2081 v = m.group(0)
2082 if v in ('true', 'false', 'null'):
2083 return v
bd1e4844 2084 elif v.startswith('/*') or v == ',':
2085 return ""
2086
2087 if v[0] in ("'", '"'):
2088 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 2089 '"': '\\"',
bd1e4844 2090 "\\'": "'",
2091 '\\\n': '',
2092 '\\x': '\\u00',
2093 }.get(m.group(0), m.group(0)), v[1:-1])
2094
89ac4a19 2095 INTEGER_TABLE = (
e4659b45
YCH
2096 (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
2097 (r'^(0+[0-7]+)\s*:?$', 8),
89ac4a19
S
2098 )
2099
2100 for regex, base in INTEGER_TABLE:
2101 im = re.match(regex, v)
2102 if im:
e4659b45 2103 i = int(im.group(1), base)
89ac4a19
S
2104 return '"%d":' % i if v.endswith(':') else '%d' % i
2105
e7b6d122 2106 return '"%s"' % v
e05f6939 2107
bd1e4844 2108 return re.sub(r'''(?sx)
2109 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2110 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
2111 /\*.*?\*/|,(?=\s*[\]}])|
2112 [a-zA-Z_][.a-zA-Z_0-9]*|
47212f7b 2113 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
bd1e4844 2114 [0-9]+(?=\s*:)
e05f6939 2115 ''', fix_kv, code)
e05f6939
PH
2116
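# [Editorial note] Illustrative usage sketch, not part of the original file:
# unquoted keys, single quotes, trailing commas and hex literals are
# normalized into strict JSON.
assert js_to_json("{'abc': true}") == '{"abc": true}'
assert js_to_json('{abc: "def",}') == '{"abc": "def"}'
assert json.loads(js_to_json("{foo: 'bar', 0x10: 2}")) == {'foo': 'bar', '16': 2}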
2117
478c2c61
PH
2118def qualities(quality_ids):
2119 """ Get a numeric quality value out of a list of possible values """
2120 def q(qid):
2121 try:
2122 return quality_ids.index(qid)
2123 except ValueError:
2124 return -1
2125 return q
2126
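# [Editorial note] Illustrative usage sketch, not part of the original file:
_example_q = qualities(['144p', '360p', '720p'])
assert _example_q('720p') == 2
assert _example_q('480p') == -1   # unknown IDs rank below every known one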
acd69589
PH
2127
2128DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 2129
a020a0dc
PH
2130
2131def limit_length(s, length):
2132 """ Add ellipses to overly long strings """
2133 if s is None:
2134 return None
2135 ELLIPSES = '...'
2136 if len(s) > length:
2137 return s[:length - len(ELLIPSES)] + ELLIPSES
2138 return s
48844745
PH
2139
2140
2141def version_tuple(v):
5f9b8394 2142 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
2143
2144
2145def is_outdated_version(version, limit, assume_new=True):
2146 if not version:
2147 return not assume_new
2148 try:
2149 return version_tuple(version) < version_tuple(limit)
2150 except ValueError:
2151 return not assume_new
732ea2f0
PH
2152
2153
2154def ytdl_is_updateable():
2155 """ Returns if youtube-dl can be updated with -U """
2156 from zipimport import zipimporter
2157
2158 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
7d4111ed
PH
2159
2160
2161def args_to_str(args):
2162 # Get a short string representation for a subprocess command
702ccf2d 2163 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
2164
2165
9b9c5355 2166def error_to_compat_str(err):
fdae2358
S
2167 err_str = str(err)
2168 # On Python 2, the error's byte string must be decoded with the proper
2169 # encoding rather than ascii
2170 if sys.version_info[0] < 3:
2171 err_str = err_str.decode(preferredencoding())
2172 return err_str
2173
2174
c460bdd5 2175def mimetype2ext(mt):
eb9ee194
S
2176 if mt is None:
2177 return None
2178
765ac263
JMF
2179 ext = {
2180 'audio/mp4': 'm4a',
6c33d24b
YCH
2181 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2182 # it's the most popular one
2183 'audio/mpeg': 'mp3',
765ac263
JMF
2184 }.get(mt)
2185 if ext is not None:
2186 return ext
2187
c460bdd5 2188 _, _, res = mt.rpartition('/')
6562d34a 2189 res = res.split(';')[0].strip().lower()
c460bdd5
PH
2190
2191 return {
f6861ec9 2192 '3gpp': '3gp',
cafcf657 2193 'smptett+xml': 'tt',
2194 'srt': 'srt',
2195 'ttaf+xml': 'dfxp',
a0d8d704 2196 'ttml+xml': 'ttml',
cafcf657 2197 'vtt': 'vtt',
f6861ec9 2198 'x-flv': 'flv',
a0d8d704
YCH
2199 'x-mp4-fragmented': 'mp4',
2200 'x-ms-wmv': 'wmv',
b4173f15
RA
2201 'mpegurl': 'm3u8',
2202 'x-mpegurl': 'm3u8',
2203 'vnd.apple.mpegurl': 'm3u8',
2204 'dash+xml': 'mpd',
2205 'f4m': 'f4m',
2206 'f4m+xml': 'f4m',
f164b971 2207 'hds+xml': 'f4m',
e910fe2f 2208 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 2209 'quicktime': 'mov',
c460bdd5
PH
2210 }.get(res, res)
2211
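# [Editorial note] Illustrative usage sketch, not part of the original file:
assert mimetype2ext('audio/mp4') == 'm4a'
assert mimetype2ext('application/x-mpegURL') == 'm3u8'
assert mimetype2ext('text/vtt; charset=utf-8') == 'vtt'
assert mimetype2ext('video/unknown-subtype') == 'unknown-subtype'  # falls back to the subtype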
2212
4f3c5e06 2213def parse_codecs(codecs_str):
2214 # http://tools.ietf.org/html/rfc6381
2215 if not codecs_str:
2216 return {}
2217 splited_codecs = list(filter(None, map(
2218 lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
2219 vcodec, acodec = None, None
2220 for full_codec in splited_codecs:
2221 codec = full_codec.split('.')[0]
2222 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
2223 if not vcodec:
2224 vcodec = full_codec
073ac122 2225 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
4f3c5e06 2226 if not acodec:
2227 acodec = full_codec
2228 else:
2229 write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
2230 if not vcodec and not acodec:
2231 if len(splited_codecs) == 2:
2232 return {
2233 'vcodec': vcodec,
2234 'acodec': acodec,
2235 }
2236 elif len(splited_codecs) == 1:
2237 return {
2238 'vcodec': 'none',
2239 'acodec': vcodec,
2240 }
2241 else:
2242 return {
2243 'vcodec': vcodec or 'none',
2244 'acodec': acodec or 'none',
2245 }
2246 return {}
2247
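# [Editorial note] Illustrative usage sketch, not part of the original file:
assert parse_codecs('') == {}
assert parse_codecs('avc1.64001f, mp4a.40.2') == {
    'vcodec': 'avc1.64001f',
    'acodec': 'mp4a.40.2',
}
assert parse_codecs('mp4a.40.2') == {'vcodec': 'none', 'acodec': 'mp4a.40.2'}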
2248
2ccd1b10 2249def urlhandle_detect_ext(url_handle):
79298173 2250 getheader = url_handle.headers.get
2ccd1b10 2251
b55ee18f
PH
2252 cd = getheader('Content-Disposition')
2253 if cd:
2254 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2255 if m:
2256 e = determine_ext(m.group('filename'), default_ext=None)
2257 if e:
2258 return e
2259
c460bdd5 2260 return mimetype2ext(getheader('Content-Type'))
05900629
PH
2261
2262
1e399778
YCH
2263def encode_data_uri(data, mime_type):
2264 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
2265
2266
05900629 2267def age_restricted(content_limit, age_limit):
6ec6cb4e 2268 """ Returns True iff the content should be blocked """
05900629
PH
2269
2270 if age_limit is None: # No limit set
2271 return False
2272 if content_limit is None:
2273 return False # Content available for everyone
2274 return age_limit < content_limit
61ca9a80
PH
2275
2276
2277def is_html(first_bytes):
2278 """ Detect whether a file contains HTML by examining its first bytes. """
2279
2280 BOMS = [
2281 (b'\xef\xbb\xbf', 'utf-8'),
2282 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2283 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2284 (b'\xff\xfe', 'utf-16-le'),
2285 (b'\xfe\xff', 'utf-16-be'),
2286 ]
2287 for bom, enc in BOMS:
2288 if first_bytes.startswith(bom):
2289 s = first_bytes[len(bom):].decode(enc, 'replace')
2290 break
2291 else:
2292 s = first_bytes.decode('utf-8', 'replace')
2293
2294 return re.match(r'^\s*<', s)
a055469f
PH
2295
2296
2297def determine_protocol(info_dict):
2298 protocol = info_dict.get('protocol')
2299 if protocol is not None:
2300 return protocol
2301
2302 url = info_dict['url']
2303 if url.startswith('rtmp'):
2304 return 'rtmp'
2305 elif url.startswith('mms'):
2306 return 'mms'
2307 elif url.startswith('rtsp'):
2308 return 'rtsp'
2309
2310 ext = determine_ext(url)
2311 if ext == 'm3u8':
2312 return 'm3u8'
2313 elif ext == 'f4m':
2314 return 'f4m'
2315
2316 return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2317
2318
2319def render_table(header_row, data):
2320 """ Render a list of rows, each as a list of values """
2321 table = [header_row] + data
2322 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
2323 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
2324 return '\n'.join(format_str % tuple(row) for row in table)
347de493
PH
2325
2326
2327def _match_one(filter_part, dct):
2328 COMPARISON_OPERATORS = {
2329 '<': operator.lt,
2330 '<=': operator.le,
2331 '>': operator.gt,
2332 '>=': operator.ge,
2333 '=': operator.eq,
2334 '!=': operator.ne,
2335 }
2336 operator_rex = re.compile(r'''(?x)\s*
2337 (?P<key>[a-z_]+)
2338 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2339 (?:
2340 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2341 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2342 )
2343 \s*$
2344 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2345 m = operator_rex.search(filter_part)
2346 if m:
2347 op = COMPARISON_OPERATORS[m.group('op')]
2348 if m.group('strval') is not None:
2349 if m.group('op') not in ('=', '!='):
2350 raise ValueError(
2351 'Operator %s does not support string values!' % m.group('op'))
2352 comparison_value = m.group('strval')
2353 else:
2354 try:
2355 comparison_value = int(m.group('intval'))
2356 except ValueError:
2357 comparison_value = parse_filesize(m.group('intval'))
2358 if comparison_value is None:
2359 comparison_value = parse_filesize(m.group('intval') + 'B')
2360 if comparison_value is None:
2361 raise ValueError(
2362 'Invalid integer value %r in filter part %r' % (
2363 m.group('intval'), filter_part))
2364 actual_value = dct.get(m.group('key'))
2365 if actual_value is None:
2366 return m.group('none_inclusive')
2367 return op(actual_value, comparison_value)
2368
2369 UNARY_OPERATORS = {
2370 '': lambda v: v is not None,
2371 '!': lambda v: v is None,
2372 }
2373 operator_rex = re.compile(r'''(?x)\s*
2374 (?P<op>%s)\s*(?P<key>[a-z_]+)
2375 \s*$
2376 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2377 m = operator_rex.search(filter_part)
2378 if m:
2379 op = UNARY_OPERATORS[m.group('op')]
2380 actual_value = dct.get(m.group('key'))
2381 return op(actual_value)
2382
2383 raise ValueError('Invalid filter part %r' % filter_part)
2384
2385
2386def match_str(filter_str, dct):
2387 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2388
2389 return all(
2390 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2391
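# [Editorial note] Illustrative usage sketch, not part of the original file;
# '&' joins conditions, and '?' makes a comparison pass when the field is missing:
assert match_str('duration < 30', {'duration': 10})
assert not match_str('duration < 30', {'duration': 90})
assert match_str('like_count > 100 & dislike_count <? 50', {'like_count': 500})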
2392
2393def match_filter_func(filter_str):
2394 def _match_func(info_dict):
2395 if match_str(filter_str, info_dict):
2396 return None
2397 else:
2398 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2399 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2400 return _match_func
91410c9b
PH
2401
2402
bf6427d2
YCH
2403def parse_dfxp_time_expr(time_expr):
2404 if not time_expr:
d631d5f9 2405 return
bf6427d2
YCH
2406
2407 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2408 if mobj:
2409 return float(mobj.group('time_offset'))
2410
db2fe38b 2411 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 2412 if mobj:
db2fe38b 2413 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
2414
2415
c1c924ab
YCH
2416def srt_subtitles_timecode(seconds):
2417 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
bf6427d2
YCH
2418
2419
2420def dfxp2srt(dfxp_data):
4e335771
YCH
2421 _x = functools.partial(xpath_with_ns, ns_map={
2422 'ttml': 'http://www.w3.org/ns/ttml',
2423 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
5bf28d78 2424 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
4e335771 2425 })
bf6427d2 2426
87de7069 2427 class TTMLPElementParser(object):
2b14cb56 2428 out = ''
bf6427d2 2429
2b14cb56 2430 def start(self, tag, attrib):
2431 if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2432 self.out += '\n'
bf6427d2 2433
2b14cb56 2434 def end(self, tag):
2435 pass
bf6427d2 2436
2b14cb56 2437 def data(self, data):
2438 self.out += data
2439
2440 def close(self):
2441 return self.out.strip()
2442
2443 def parse_node(node):
2444 target = TTMLPElementParser()
2445 parser = xml.etree.ElementTree.XMLParser(target=target)
2446 parser.feed(xml.etree.ElementTree.tostring(node))
2447 return parser.close()
bf6427d2 2448
36e6f62c 2449 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
bf6427d2 2450 out = []
5bf28d78 2451 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
1b0427e6
YCH
2452
2453 if not paras:
2454 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2
YCH
2455
2456 for para, index in zip(paras, itertools.count(1)):
d631d5f9 2457 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 2458 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
2459 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2460 if begin_time is None:
2461 continue
7dff0363 2462 if not end_time:
d631d5f9
YCH
2463 if not dur:
2464 continue
2465 end_time = begin_time + dur
bf6427d2
YCH
2466 out.append('%d\n%s --> %s\n%s\n\n' % (
2467 index,
c1c924ab
YCH
2468 srt_subtitles_timecode(begin_time),
2469 srt_subtitles_timecode(end_time),
bf6427d2
YCH
2470 parse_node(para)))
2471
2472 return ''.join(out)
2473
2474
66e289ba
S
2475def cli_option(params, command_option, param):
2476 param = params.get(param)
98e698f1
RA
2477 if param:
2478 param = compat_str(param)
66e289ba
S
2479 return [command_option, param] if param is not None else []
2480
2481
2482def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2483 param = params.get(param)
2484 assert isinstance(param, bool)
2485 if separator:
2486 return [command_option + separator + (true_value if param else false_value)]
2487 return [command_option, true_value if param else false_value]
2488
2489
2490def cli_valueless_option(params, command_option, param, expected_value=True):
2491 param = params.get(param)
2492 return [command_option] if param == expected_value else []
2493
2494
2495def cli_configuration_args(params, param, default=[]):
2496 ex_args = params.get(param)
2497 if ex_args is None:
2498 return default
2499 assert isinstance(ex_args, list)
2500 return ex_args
2501
2502
39672624
YCH
2503class ISO639Utils(object):
2504 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2505 _lang_map = {
2506 'aa': 'aar',
2507 'ab': 'abk',
2508 'ae': 'ave',
2509 'af': 'afr',
2510 'ak': 'aka',
2511 'am': 'amh',
2512 'an': 'arg',
2513 'ar': 'ara',
2514 'as': 'asm',
2515 'av': 'ava',
2516 'ay': 'aym',
2517 'az': 'aze',
2518 'ba': 'bak',
2519 'be': 'bel',
2520 'bg': 'bul',
2521 'bh': 'bih',
2522 'bi': 'bis',
2523 'bm': 'bam',
2524 'bn': 'ben',
2525 'bo': 'bod',
2526 'br': 'bre',
2527 'bs': 'bos',
2528 'ca': 'cat',
2529 'ce': 'che',
2530 'ch': 'cha',
2531 'co': 'cos',
2532 'cr': 'cre',
2533 'cs': 'ces',
2534 'cu': 'chu',
2535 'cv': 'chv',
2536 'cy': 'cym',
2537 'da': 'dan',
2538 'de': 'deu',
2539 'dv': 'div',
2540 'dz': 'dzo',
2541 'ee': 'ewe',
2542 'el': 'ell',
2543 'en': 'eng',
2544 'eo': 'epo',
2545 'es': 'spa',
2546 'et': 'est',
2547 'eu': 'eus',
2548 'fa': 'fas',
2549 'ff': 'ful',
2550 'fi': 'fin',
2551 'fj': 'fij',
2552 'fo': 'fao',
2553 'fr': 'fra',
2554 'fy': 'fry',
2555 'ga': 'gle',
2556 'gd': 'gla',
2557 'gl': 'glg',
2558 'gn': 'grn',
2559 'gu': 'guj',
2560 'gv': 'glv',
2561 'ha': 'hau',
2562 'he': 'heb',
2563 'hi': 'hin',
2564 'ho': 'hmo',
2565 'hr': 'hrv',
2566 'ht': 'hat',
2567 'hu': 'hun',
2568 'hy': 'hye',
2569 'hz': 'her',
2570 'ia': 'ina',
2571 'id': 'ind',
2572 'ie': 'ile',
2573 'ig': 'ibo',
2574 'ii': 'iii',
2575 'ik': 'ipk',
2576 'io': 'ido',
2577 'is': 'isl',
2578 'it': 'ita',
2579 'iu': 'iku',
2580 'ja': 'jpn',
2581 'jv': 'jav',
2582 'ka': 'kat',
2583 'kg': 'kon',
2584 'ki': 'kik',
2585 'kj': 'kua',
2586 'kk': 'kaz',
2587 'kl': 'kal',
2588 'km': 'khm',
2589 'kn': 'kan',
2590 'ko': 'kor',
2591 'kr': 'kau',
2592 'ks': 'kas',
2593 'ku': 'kur',
2594 'kv': 'kom',
2595 'kw': 'cor',
2596 'ky': 'kir',
2597 'la': 'lat',
2598 'lb': 'ltz',
2599 'lg': 'lug',
2600 'li': 'lim',
2601 'ln': 'lin',
2602 'lo': 'lao',
2603 'lt': 'lit',
2604 'lu': 'lub',
2605 'lv': 'lav',
2606 'mg': 'mlg',
2607 'mh': 'mah',
2608 'mi': 'mri',
2609 'mk': 'mkd',
2610 'ml': 'mal',
2611 'mn': 'mon',
2612 'mr': 'mar',
2613 'ms': 'msa',
2614 'mt': 'mlt',
2615 'my': 'mya',
2616 'na': 'nau',
2617 'nb': 'nob',
2618 'nd': 'nde',
2619 'ne': 'nep',
2620 'ng': 'ndo',
2621 'nl': 'nld',
2622 'nn': 'nno',
2623 'no': 'nor',
2624 'nr': 'nbl',
2625 'nv': 'nav',
2626 'ny': 'nya',
2627 'oc': 'oci',
2628 'oj': 'oji',
2629 'om': 'orm',
2630 'or': 'ori',
2631 'os': 'oss',
2632 'pa': 'pan',
2633 'pi': 'pli',
2634 'pl': 'pol',
2635 'ps': 'pus',
2636 'pt': 'por',
2637 'qu': 'que',
2638 'rm': 'roh',
2639 'rn': 'run',
2640 'ro': 'ron',
2641 'ru': 'rus',
2642 'rw': 'kin',
2643 'sa': 'san',
2644 'sc': 'srd',
2645 'sd': 'snd',
2646 'se': 'sme',
2647 'sg': 'sag',
2648 'si': 'sin',
2649 'sk': 'slk',
2650 'sl': 'slv',
2651 'sm': 'smo',
2652 'sn': 'sna',
2653 'so': 'som',
2654 'sq': 'sqi',
2655 'sr': 'srp',
2656 'ss': 'ssw',
2657 'st': 'sot',
2658 'su': 'sun',
2659 'sv': 'swe',
2660 'sw': 'swa',
2661 'ta': 'tam',
2662 'te': 'tel',
2663 'tg': 'tgk',
2664 'th': 'tha',
2665 'ti': 'tir',
2666 'tk': 'tuk',
2667 'tl': 'tgl',
2668 'tn': 'tsn',
2669 'to': 'ton',
2670 'tr': 'tur',
2671 'ts': 'tso',
2672 'tt': 'tat',
2673 'tw': 'twi',
2674 'ty': 'tah',
2675 'ug': 'uig',
2676 'uk': 'ukr',
2677 'ur': 'urd',
2678 'uz': 'uzb',
2679 've': 'ven',
2680 'vi': 'vie',
2681 'vo': 'vol',
2682 'wa': 'wln',
2683 'wo': 'wol',
2684 'xh': 'xho',
2685 'yi': 'yid',
2686 'yo': 'yor',
2687 'za': 'zha',
2688 'zh': 'zho',
2689 'zu': 'zul',
2690 }
2691
2692 @classmethod
2693 def short2long(cls, code):
2694 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2695 return cls._lang_map.get(code[:2])
2696
2697 @classmethod
2698 def long2short(cls, code):
2699 """Convert language code from ISO 639-2/T to ISO 639-1"""
2700 for short_name, long_name in cls._lang_map.items():
2701 if long_name == code:
2702 return short_name
2703
2704
4eb10f66
YCH
2705class ISO3166Utils(object):
2706 # From http://data.okfn.org/data/core/country-list
2707 _country_map = {
2708 'AF': 'Afghanistan',
2709 'AX': 'Åland Islands',
2710 'AL': 'Albania',
2711 'DZ': 'Algeria',
2712 'AS': 'American Samoa',
2713 'AD': 'Andorra',
2714 'AO': 'Angola',
2715 'AI': 'Anguilla',
2716 'AQ': 'Antarctica',
2717 'AG': 'Antigua and Barbuda',
2718 'AR': 'Argentina',
2719 'AM': 'Armenia',
2720 'AW': 'Aruba',
2721 'AU': 'Australia',
2722 'AT': 'Austria',
2723 'AZ': 'Azerbaijan',
2724 'BS': 'Bahamas',
2725 'BH': 'Bahrain',
2726 'BD': 'Bangladesh',
2727 'BB': 'Barbados',
2728 'BY': 'Belarus',
2729 'BE': 'Belgium',
2730 'BZ': 'Belize',
2731 'BJ': 'Benin',
2732 'BM': 'Bermuda',
2733 'BT': 'Bhutan',
2734 'BO': 'Bolivia, Plurinational State of',
2735 'BQ': 'Bonaire, Sint Eustatius and Saba',
2736 'BA': 'Bosnia and Herzegovina',
2737 'BW': 'Botswana',
2738 'BV': 'Bouvet Island',
2739 'BR': 'Brazil',
2740 'IO': 'British Indian Ocean Territory',
2741 'BN': 'Brunei Darussalam',
2742 'BG': 'Bulgaria',
2743 'BF': 'Burkina Faso',
2744 'BI': 'Burundi',
2745 'KH': 'Cambodia',
2746 'CM': 'Cameroon',
2747 'CA': 'Canada',
2748 'CV': 'Cape Verde',
2749 'KY': 'Cayman Islands',
2750 'CF': 'Central African Republic',
2751 'TD': 'Chad',
2752 'CL': 'Chile',
2753 'CN': 'China',
2754 'CX': 'Christmas Island',
2755 'CC': 'Cocos (Keeling) Islands',
2756 'CO': 'Colombia',
2757 'KM': 'Comoros',
2758 'CG': 'Congo',
2759 'CD': 'Congo, the Democratic Republic of the',
2760 'CK': 'Cook Islands',
2761 'CR': 'Costa Rica',
2762 'CI': 'Côte d\'Ivoire',
2763 'HR': 'Croatia',
2764 'CU': 'Cuba',
2765 'CW': 'Curaçao',
2766 'CY': 'Cyprus',
2767 'CZ': 'Czech Republic',
2768 'DK': 'Denmark',
2769 'DJ': 'Djibouti',
2770 'DM': 'Dominica',
2771 'DO': 'Dominican Republic',
2772 'EC': 'Ecuador',
2773 'EG': 'Egypt',
2774 'SV': 'El Salvador',
2775 'GQ': 'Equatorial Guinea',
2776 'ER': 'Eritrea',
2777 'EE': 'Estonia',
2778 'ET': 'Ethiopia',
2779 'FK': 'Falkland Islands (Malvinas)',
2780 'FO': 'Faroe Islands',
2781 'FJ': 'Fiji',
2782 'FI': 'Finland',
2783 'FR': 'France',
2784 'GF': 'French Guiana',
2785 'PF': 'French Polynesia',
2786 'TF': 'French Southern Territories',
2787 'GA': 'Gabon',
2788 'GM': 'Gambia',
2789 'GE': 'Georgia',
2790 'DE': 'Germany',
2791 'GH': 'Ghana',
2792 'GI': 'Gibraltar',
2793 'GR': 'Greece',
2794 'GL': 'Greenland',
2795 'GD': 'Grenada',
2796 'GP': 'Guadeloupe',
2797 'GU': 'Guam',
2798 'GT': 'Guatemala',
2799 'GG': 'Guernsey',
2800 'GN': 'Guinea',
2801 'GW': 'Guinea-Bissau',
2802 'GY': 'Guyana',
2803 'HT': 'Haiti',
2804 'HM': 'Heard Island and McDonald Islands',
2805 'VA': 'Holy See (Vatican City State)',
2806 'HN': 'Honduras',
2807 'HK': 'Hong Kong',
2808 'HU': 'Hungary',
2809 'IS': 'Iceland',
2810 'IN': 'India',
2811 'ID': 'Indonesia',
2812 'IR': 'Iran, Islamic Republic of',
2813 'IQ': 'Iraq',
2814 'IE': 'Ireland',
2815 'IM': 'Isle of Man',
2816 'IL': 'Israel',
2817 'IT': 'Italy',
2818 'JM': 'Jamaica',
2819 'JP': 'Japan',
2820 'JE': 'Jersey',
2821 'JO': 'Jordan',
2822 'KZ': 'Kazakhstan',
2823 'KE': 'Kenya',
2824 'KI': 'Kiribati',
2825 'KP': 'Korea, Democratic People\'s Republic of',
2826 'KR': 'Korea, Republic of',
2827 'KW': 'Kuwait',
2828 'KG': 'Kyrgyzstan',
2829 'LA': 'Lao People\'s Democratic Republic',
2830 'LV': 'Latvia',
2831 'LB': 'Lebanon',
2832 'LS': 'Lesotho',
2833 'LR': 'Liberia',
2834 'LY': 'Libya',
2835 'LI': 'Liechtenstein',
2836 'LT': 'Lithuania',
2837 'LU': 'Luxembourg',
2838 'MO': 'Macao',
2839 'MK': 'Macedonia, the Former Yugoslav Republic of',
2840 'MG': 'Madagascar',
2841 'MW': 'Malawi',
2842 'MY': 'Malaysia',
2843 'MV': 'Maldives',
2844 'ML': 'Mali',
2845 'MT': 'Malta',
2846 'MH': 'Marshall Islands',
2847 'MQ': 'Martinique',
2848 'MR': 'Mauritania',
2849 'MU': 'Mauritius',
2850 'YT': 'Mayotte',
2851 'MX': 'Mexico',
2852 'FM': 'Micronesia, Federated States of',
2853 'MD': 'Moldova, Republic of',
2854 'MC': 'Monaco',
2855 'MN': 'Mongolia',
2856 'ME': 'Montenegro',
2857 'MS': 'Montserrat',
2858 'MA': 'Morocco',
2859 'MZ': 'Mozambique',
2860 'MM': 'Myanmar',
2861 'NA': 'Namibia',
2862 'NR': 'Nauru',
2863 'NP': 'Nepal',
2864 'NL': 'Netherlands',
2865 'NC': 'New Caledonia',
2866 'NZ': 'New Zealand',
2867 'NI': 'Nicaragua',
2868 'NE': 'Niger',
2869 'NG': 'Nigeria',
2870 'NU': 'Niue',
2871 'NF': 'Norfolk Island',
2872 'MP': 'Northern Mariana Islands',
2873 'NO': 'Norway',
2874 'OM': 'Oman',
2875 'PK': 'Pakistan',
2876 'PW': 'Palau',
2877 'PS': 'Palestine, State of',
2878 'PA': 'Panama',
2879 'PG': 'Papua New Guinea',
2880 'PY': 'Paraguay',
2881 'PE': 'Peru',
2882 'PH': 'Philippines',
2883 'PN': 'Pitcairn',
2884 'PL': 'Poland',
2885 'PT': 'Portugal',
2886 'PR': 'Puerto Rico',
2887 'QA': 'Qatar',
2888 'RE': 'Réunion',
2889 'RO': 'Romania',
2890 'RU': 'Russian Federation',
2891 'RW': 'Rwanda',
2892 'BL': 'Saint Barthélemy',
2893 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2894 'KN': 'Saint Kitts and Nevis',
2895 'LC': 'Saint Lucia',
2896 'MF': 'Saint Martin (French part)',
2897 'PM': 'Saint Pierre and Miquelon',
2898 'VC': 'Saint Vincent and the Grenadines',
2899 'WS': 'Samoa',
2900 'SM': 'San Marino',
2901 'ST': 'Sao Tome and Principe',
2902 'SA': 'Saudi Arabia',
2903 'SN': 'Senegal',
2904 'RS': 'Serbia',
2905 'SC': 'Seychelles',
2906 'SL': 'Sierra Leone',
2907 'SG': 'Singapore',
2908 'SX': 'Sint Maarten (Dutch part)',
2909 'SK': 'Slovakia',
2910 'SI': 'Slovenia',
2911 'SB': 'Solomon Islands',
2912 'SO': 'Somalia',
2913 'ZA': 'South Africa',
2914 'GS': 'South Georgia and the South Sandwich Islands',
2915 'SS': 'South Sudan',
2916 'ES': 'Spain',
2917 'LK': 'Sri Lanka',
2918 'SD': 'Sudan',
2919 'SR': 'Suriname',
2920 'SJ': 'Svalbard and Jan Mayen',
2921 'SZ': 'Swaziland',
2922 'SE': 'Sweden',
2923 'CH': 'Switzerland',
2924 'SY': 'Syrian Arab Republic',
2925 'TW': 'Taiwan, Province of China',
2926 'TJ': 'Tajikistan',
2927 'TZ': 'Tanzania, United Republic of',
2928 'TH': 'Thailand',
2929 'TL': 'Timor-Leste',
2930 'TG': 'Togo',
2931 'TK': 'Tokelau',
2932 'TO': 'Tonga',
2933 'TT': 'Trinidad and Tobago',
2934 'TN': 'Tunisia',
2935 'TR': 'Turkey',
2936 'TM': 'Turkmenistan',
2937 'TC': 'Turks and Caicos Islands',
2938 'TV': 'Tuvalu',
2939 'UG': 'Uganda',
2940 'UA': 'Ukraine',
2941 'AE': 'United Arab Emirates',
2942 'GB': 'United Kingdom',
2943 'US': 'United States',
2944 'UM': 'United States Minor Outlying Islands',
2945 'UY': 'Uruguay',
2946 'UZ': 'Uzbekistan',
2947 'VU': 'Vanuatu',
2948 'VE': 'Venezuela, Bolivarian Republic of',
2949 'VN': 'Viet Nam',
2950 'VG': 'Virgin Islands, British',
2951 'VI': 'Virgin Islands, U.S.',
2952 'WF': 'Wallis and Futuna',
2953 'EH': 'Western Sahara',
2954 'YE': 'Yemen',
2955 'ZM': 'Zambia',
2956 'ZW': 'Zimbabwe',
2957 }
2958
2959 @classmethod
2960 def short2full(cls, code):
2961 """Convert an ISO 3166-2 country code to the corresponding full name"""
2962 return cls._country_map.get(code.upper())
2963
2964
91410c9b 2965class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2461f79d
PH
2966 def __init__(self, proxies=None):
2967 # Set default handlers
2968 for type in ('http', 'https'):
2969 setattr(self, '%s_open' % type,
2970 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
2971 meth(r, proxy, type))
2972 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2973
91410c9b 2974 def proxy_open(self, req, proxy, type):
2461f79d 2975 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
2976 if req_proxy is not None:
2977 proxy = req_proxy
2461f79d
PH
2978 del req.headers['Ytdl-request-proxy']
2979
2980 if proxy == '__noproxy__':
2981 return None # No Proxy
51fb4995 2982 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188
YCH
2983 req.add_header('Ytdl-socks-proxy', proxy)
2984 # youtube-dl's http/https handlers do the wrapping of the socket with SOCKS
2985 return None
91410c9b
PH
2986 return compat_urllib_request.ProxyHandler.proxy_open(
2987 self, req, proxy, type)
5bc880b9
YCH
2988
2989
2990def ohdave_rsa_encrypt(data, exponent, modulus):
2991 '''
2992 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
2993
2994 Input:
2995 data: data to encrypt, bytes-like object
2996 exponent, modulus: parameter e and N of RSA algorithm, both integer
2997 Output: hex string of encrypted data
2998
2999 Limitation: supports one block encryption only
3000 '''
3001
3002 payload = int(binascii.hexlify(data[::-1]), 16)
3003 encrypted = pow(payload, exponent, modulus)
3004 return '%x' % encrypted
81bdc8fd
YCH
3005
3006
5eb6bdce 3007def encode_base_n(num, n, table=None):
59f898b7 3008 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
59f898b7
YCH
3009 if not table:
3010 table = FULL_TABLE[:n]
3011
5eb6bdce
YCH
3012 if n > len(table):
3013 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
3014
3015 if num == 0:
3016 return table[0]
3017
81bdc8fd
YCH
3018 ret = ''
3019 while num:
3020 ret = table[num % n] + ret
3021 num = num // n
3022 return ret
f52354a8
YCH
3023
3024
3025def decode_packed_codes(code):
06b3fe29 3026 mobj = re.search(PACKED_CODES_RE, code)
f52354a8
YCH
3027 obfucasted_code, base, count, symbols = mobj.groups()
3028 base = int(base)
3029 count = int(count)
3030 symbols = symbols.split('|')
3031 symbol_table = {}
3032
3033 while count:
3034 count -= 1
5eb6bdce 3035 base_n_count = encode_base_n(count, base)
f52354a8
YCH
3036 symbol_table[base_n_count] = symbols[count] or base_n_count
3037
3038 return re.sub(
3039 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
3040 obfucasted_code)
e154c651 3041
3042
3043def parse_m3u8_attributes(attrib):
3044 info = {}
3045 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
3046 if val.startswith('"'):
3047 val = val[1:-1]
3048 info[key] = val
3049 return info
1143535d
YCH
3050
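# [Editorial note] Illustrative usage sketch, not part of the original file:
_example_attrs = parse_m3u8_attributes(
    'BANDWIDTH=1280000,RESOLUTION=1920x1080,CODECS="mp4a.40.2,avc1.64001f"')
assert _example_attrs['BANDWIDTH'] == '1280000'
assert _example_attrs['CODECS'] == 'mp4a.40.2,avc1.64001f'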
3051
3052def urshift(val, n):
3053 return val >> n if val >= 0 else (val + 0x100000000) >> n
d3f8e038
YCH
3054
3055
3056# Based on png2str() written by @gdkchan and improved by @yokrysty
3057# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
3058def decode_png(png_data):
3059 # Reference: https://www.w3.org/TR/PNG/
3060 header = png_data[8:]
3061
3062 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
3063 raise IOError('Not a valid PNG file.')
3064
3065 int_map = {1: '>B', 2: '>H', 4: '>I'}
3066 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
3067
3068 chunks = []
3069
3070 while header:
3071 length = unpack_integer(header[:4])
3072 header = header[4:]
3073
3074 chunk_type = header[:4]
3075 header = header[4:]
3076
3077 chunk_data = header[:length]
3078 header = header[length:]
3079
3080 header = header[4:] # Skip CRC
3081
3082 chunks.append({
3083 'type': chunk_type,
3084 'length': length,
3085 'data': chunk_data
3086 })
3087
3088 ihdr = chunks[0]['data']
3089
3090 width = unpack_integer(ihdr[:4])
3091 height = unpack_integer(ihdr[4:8])
3092
3093 idat = b''
3094
3095 for chunk in chunks:
3096 if chunk['type'] == b'IDAT':
3097 idat += chunk['data']
3098
3099 if not idat:
3100 raise IOError('Unable to read PNG data.')
3101
3102 decompressed_data = bytearray(zlib.decompress(idat))
3103
3104 stride = width * 3
3105 pixels = []
3106
3107 def _get_pixel(idx):
3108 x = idx % stride
3109 y = idx // stride
3110 return pixels[y][x]
3111
3112 for y in range(height):
3113 basePos = y * (1 + stride)
3114 filter_type = decompressed_data[basePos]
3115
3116 current_row = []
3117
3118 pixels.append(current_row)
3119
3120 for x in range(stride):
3121 color = decompressed_data[1 + basePos + x]
3122 basex = y * stride + x
3123 left = 0
3124 up = 0
3125
3126 if x > 2:
3127 left = _get_pixel(basex - 3)
3128 if y > 0:
3129 up = _get_pixel(basex - stride)
3130
3131 if filter_type == 1: # Sub
3132 color = (color + left) & 0xff
3133 elif filter_type == 2: # Up
3134 color = (color + up) & 0xff
3135 elif filter_type == 3: # Average
3136 color = (color + ((left + up) >> 1)) & 0xff
3137 elif filter_type == 4: # Paeth
3138 a = left
3139 b = up
3140 c = 0
3141
3142 if x > 2 and y > 0:
3143 c = _get_pixel(basex - stride - 3)
3144
3145 p = a + b - c
3146
3147 pa = abs(p - a)
3148 pb = abs(p - b)
3149 pc = abs(p - c)
3150
3151 if pa <= pb and pa <= pc:
3152 color = (color + a) & 0xff
3153 elif pb <= pc:
3154 color = (color + b) & 0xff
3155 else:
3156 color = (color + c) & 0xff
3157
3158 current_row.append(color)
3159
3160 return width, height, pixels
efa97bdc
YCH
3161
3162
3163def write_xattr(path, key, value):
3164 # This mess below finds the best xattr tool for the job
3165 try:
3166 # try the pyxattr module...
3167 import xattr
3168
53a7e3d2
YCH
3169 if hasattr(xattr, 'set'): # pyxattr
3170 # Unicode arguments are not supported in python-pyxattr until
3171 # version 0.5.0
3172 # See https://github.com/rg3/youtube-dl/issues/5498
3173 pyxattr_required_version = '0.5.0'
3174 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
3175 # TODO: fallback to CLI tools
3176 raise XAttrUnavailableError(
3177 'python-pyxattr is detected but is too old. '
3178 'youtube-dl requires %s or above while your version is %s. '
3179 'Falling back to other xattr implementations' % (
3180 pyxattr_required_version, xattr.__version__))
3181
3182 setxattr = xattr.set
3183 else: # xattr
3184 setxattr = xattr.setxattr
efa97bdc
YCH
3185
3186 try:
53a7e3d2 3187 setxattr(path, key, value)
efa97bdc
YCH
3188 except EnvironmentError as e:
3189 raise XAttrMetadataError(e.errno, e.strerror)
3190
3191 except ImportError:
3192 if compat_os_name == 'nt':
3193 # Write xattrs to NTFS Alternate Data Streams:
3194 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
3195 assert ':' not in key
3196 assert os.path.exists(path)
3197
3198 ads_fn = path + ':' + key
3199 try:
3200 with open(ads_fn, 'wb') as f:
3201 f.write(value)
3202 except EnvironmentError as e:
3203 raise XAttrMetadataError(e.errno, e.strerror)
3204 else:
3205 user_has_setfattr = check_executable('setfattr', ['--version'])
3206 user_has_xattr = check_executable('xattr', ['-h'])
3207
3208 if user_has_setfattr or user_has_xattr:
3209
3210 value = value.decode('utf-8')
3211 if user_has_setfattr:
3212 executable = 'setfattr'
3213 opts = ['-n', key, '-v', value]
3214 elif user_has_xattr:
3215 executable = 'xattr'
3216 opts = ['-w', key, value]
3217
3218 cmd = ([encodeFilename(executable, True)] +
3219 [encodeArgument(o) for o in opts] +
3220 [encodeFilename(path, True)])
3221
3222 try:
3223 p = subprocess.Popen(
3224 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
3225 except EnvironmentError as e:
3226 raise XAttrMetadataError(e.errno, e.strerror)
3227 stdout, stderr = p.communicate()
3228 stderr = stderr.decode('utf-8', 'replace')
3229 if p.returncode != 0:
3230 raise XAttrMetadataError(p.returncode, stderr)
3231
3232 else:
3233 # On Unix, but we can't find pyxattr, setfattr, or xattr.
3234 if sys.platform.startswith('linux'):
3235 raise XAttrUnavailableError(
3236 "Couldn't find a tool to set the xattrs. "
3237 "Install either the python 'pyxattr' or 'xattr' "
3238 "modules, or the GNU 'attr' package "
3239 "(which contains the 'setfattr' tool).")
3240 else:
3241 raise XAttrUnavailableError(
3242 "Couldn't find a tool to set the xattrs. "
3243 "Install either the python 'xattr' module, "
3244 "or the 'xattr' binary.")