d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
1e399778 6import base64
5bc880b9 7import binascii
912b38b4 8import calendar
676eb3f2 9import codecs
62e609ab 10import contextlib
e3946f98 11import ctypes
c496ca96
PH
12import datetime
13import email.utils
f45c185f 14import errno
be4a824d 15import functools
d77c3dfd 16import gzip
03f9daab 17import io
79a2e94e 18import itertools
f4bfd65f 19import json
d77c3dfd 20import locale
02dbf93f 21import math
347de493 22import operator
d77c3dfd 23import os
4eb7f1d1 24import pipes
c496ca96 25import platform
d77c3dfd 26import re
c496ca96 27import socket
79a2e94e 28import ssl
1c088fa8 29import subprocess
d77c3dfd 30import sys
181c8655 31import tempfile
01951dda 32import traceback
bcf89ce6 33import xml.etree.ElementTree
d77c3dfd 34import zlib
d77c3dfd 35
8c25f81b 36from .compat import (
8bb56eee 37 compat_HTMLParser,
8f9312c3 38 compat_basestring,
8c25f81b 39 compat_chr,
36e6f62c 40 compat_etree_fromstring,
8c25f81b 41 compat_html_entities,
55b2f099 42 compat_html_entities_html5,
be4a824d 43 compat_http_client,
c86b6142 44 compat_kwargs,
efa97bdc 45 compat_os_name,
8c25f81b 46 compat_parse_qs,
702ccf2d 47 compat_shlex_quote,
be4a824d 48 compat_socket_create_connection,
8c25f81b 49 compat_str,
edaa23f8 50 compat_struct_pack,
d3f8e038 51 compat_struct_unpack,
8c25f81b
PH
52 compat_urllib_error,
53 compat_urllib_parse,
15707c7e 54 compat_urllib_parse_urlencode,
8c25f81b 55 compat_urllib_parse_urlparse,
7581bfc9 56 compat_urllib_parse_unquote_plus,
8c25f81b
PH
57 compat_urllib_request,
58 compat_urlparse,
810c10ba 59 compat_xpath,
8c25f81b 60)
4644ac55 61
71aff188
YCH
62from .socks import (
63 ProxyType,
64 sockssocket,
65)
66
4644ac55 67
51fb4995
YCH
68def register_socks_protocols():
69 # "Register" SOCKS protocols
d5ae6bb5
YCH
70 # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
71 # URLs with protocols not in urlparse.uses_netloc are not handled correctly
51fb4995
YCH
72 for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
73 if scheme not in compat_urlparse.uses_netloc:
74 compat_urlparse.uses_netloc.append(scheme)
75
76
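# Illustrative sketch (not in the original file; the proxy address below is made up):
# after register_socks_protocols() has run, SOCKS URLs parse with a proper netloc
# even on Pythons affected by the urlsplit() bug mentioned above.
# >>> register_socks_protocols()
# >>> compat_urlparse.urlparse('socks5://127.0.0.1:1080/').port
# 1080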
468e2e92
FV
77# This is not clearly defined otherwise
78compiled_regex_type = type(re.compile(''))
79
3e669f36 80std_headers = {
15d10678 81 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
59ae15a5
PH
82 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
83 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
84 'Accept-Encoding': 'gzip, deflate',
85 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 86}
f427df17 87
5f6a1245 88
bf42a990
S
89NO_DEFAULT = object()
90
7105440c
YCH
91ENGLISH_MONTH_NAMES = [
92 'January', 'February', 'March', 'April', 'May', 'June',
93 'July', 'August', 'September', 'October', 'November', 'December']
94
f6717dec
S
95MONTH_NAMES = {
96 'en': ENGLISH_MONTH_NAMES,
97 'fr': [
3e4185c3
S
98 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
99 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
f6717dec 100}
a942d6cb 101
a7aaa398
S
102KNOWN_EXTENSIONS = (
103 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
104 'flv', 'f4v', 'f4a', 'f4b',
105 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
106 'mkv', 'mka', 'mk3d',
107 'avi', 'divx',
108 'mov',
109 'asf', 'wmv', 'wma',
110 '3gp', '3g2',
111 'mp3',
112 'flac',
113 'ape',
114 'wav',
115 'f4f', 'f4m', 'm3u8', 'smil')
116
c587cbb7 117# needed for sanitizing filenames in restricted mode
c8827027 118ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
119 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
120 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
c587cbb7 121
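# Illustrative sketch (hand-checked against the mapping above, not executed here):
# >>> ACCENT_CHARS['é']
# 'e'
# >>> ACCENT_CHARS['ß']
# 'ss'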
46f59e89
S
122DATE_FORMATS = (
123 '%d %B %Y',
124 '%d %b %Y',
125 '%B %d %Y',
126 '%b %d %Y',
127 '%b %dst %Y %I:%M',
128 '%b %dnd %Y %I:%M',
129 '%b %dth %Y %I:%M',
130 '%Y %m %d',
131 '%Y-%m-%d',
132 '%Y/%m/%d',
81c13222 133 '%Y/%m/%d %H:%M',
46f59e89
S
134 '%Y/%m/%d %H:%M:%S',
135 '%Y-%m-%d %H:%M:%S',
136 '%Y-%m-%d %H:%M:%S.%f',
137 '%d.%m.%Y %H:%M',
138 '%d.%m.%Y %H.%M',
139 '%Y-%m-%dT%H:%M:%SZ',
140 '%Y-%m-%dT%H:%M:%S.%fZ',
141 '%Y-%m-%dT%H:%M:%S.%f0Z',
142 '%Y-%m-%dT%H:%M:%S',
143 '%Y-%m-%dT%H:%M:%S.%f',
144 '%Y-%m-%dT%H:%M',
c6eed6b8
S
145 '%b %d %Y at %H:%M',
146 '%b %d %Y at %H:%M:%S',
46f59e89
S
147)
148
149DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
150DATE_FORMATS_DAY_FIRST.extend([
151 '%d-%m-%Y',
152 '%d.%m.%Y',
153 '%d.%m.%y',
154 '%d/%m/%Y',
155 '%d/%m/%y',
156 '%d/%m/%Y %H:%M:%S',
157])
158
159DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
160DATE_FORMATS_MONTH_FIRST.extend([
161 '%m-%d-%Y',
162 '%m.%d.%Y',
163 '%m/%d/%Y',
164 '%m/%d/%y',
165 '%m/%d/%Y %H:%M:%S',
166])
167
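# Illustrative sketch (not part of the original file): the two extended lists only
# differ in how ambiguous numeric dates are resolved, e.g. by unified_strdate()
# defined further below.
# >>> unified_strdate('8/7/2009', day_first=True)
# '20090708'
# >>> unified_strdate('8/7/2009', day_first=False)
# '20090807'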
7105440c 168
d77c3dfd 169def preferredencoding():
59ae15a5 170 """Get preferred encoding.
d77c3dfd 171
59ae15a5
PH
172 Returns the best encoding scheme for the system, based on
173 locale.getpreferredencoding() and some further tweaks.
174 """
175 try:
176 pref = locale.getpreferredencoding()
28e614de 177 'TEST'.encode(pref)
70a1165b 178 except Exception:
59ae15a5 179 pref = 'UTF-8'
bae611f2 180
59ae15a5 181 return pref
d77c3dfd 182
f4bfd65f 183
181c8655 184def write_json_file(obj, fn):
1394646a 185 """ Encode obj as JSON and write it to fn, atomically if possible """
181c8655 186
92120217 187 fn = encodeFilename(fn)
61ee5aeb 188 if sys.version_info < (3, 0) and sys.platform != 'win32':
ec5f6016
JMF
189 encoding = get_filesystem_encoding()
190 # os.path.basename returns a bytes object, but NamedTemporaryFile
191 # will fail if the filename contains non ascii characters unless we
192 # use a unicode object
193 path_basename = lambda f: os.path.basename(fn).decode(encoding)
194 # the same for os.path.dirname
195 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
196 else:
197 path_basename = os.path.basename
198 path_dirname = os.path.dirname
199
73159f99
S
200 args = {
201 'suffix': '.tmp',
ec5f6016
JMF
202 'prefix': path_basename(fn) + '.',
203 'dir': path_dirname(fn),
73159f99
S
204 'delete': False,
205 }
206
181c8655
PH
207 # In Python 2.x, json.dump expects a bytestream.
208 # In Python 3.x, it writes to a character stream
209 if sys.version_info < (3, 0):
73159f99 210 args['mode'] = 'wb'
181c8655 211 else:
73159f99
S
212 args.update({
213 'mode': 'w',
214 'encoding': 'utf-8',
215 })
216
c86b6142 217 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
181c8655
PH
218
219 try:
220 with tf:
221 json.dump(obj, tf)
1394646a
IK
222 if sys.platform == 'win32':
223 # Need to remove existing file on Windows, else os.rename raises
224 # WindowsError or FileExistsError.
225 try:
226 os.unlink(fn)
227 except OSError:
228 pass
181c8655 229 os.rename(tf.name, fn)
70a1165b 230 except Exception:
181c8655
PH
231 try:
232 os.remove(tf.name)
233 except OSError:
234 pass
235 raise
236
237
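# Illustrative usage sketch (not in the original file; the file name is made up):
# write_json_file({'id': 'abc123', 'title': 'Example'}, 'video.info.json')
# The data is written to a temporary file in the same directory first and then
# renamed over the target, so readers never observe a partially written file.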
238if sys.version_info >= (2, 7):
ee114368 239 def find_xpath_attr(node, xpath, key, val=None):
59ae56fa 240 """ Find the xpath xpath[@key=val] """
5d2354f1 241 assert re.match(r'^[a-zA-Z_-]+$', key)
ee114368 242 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
59ae56fa
PH
243 return node.find(expr)
244else:
ee114368 245 def find_xpath_attr(node, xpath, key, val=None):
810c10ba 246 for f in node.findall(compat_xpath(xpath)):
ee114368
S
247 if key not in f.attrib:
248 continue
249 if val is None or f.attrib.get(key) == val:
59ae56fa
PH
250 return f
251 return None
252
d7e66d39
JMF
 253# On Python 2.6 the xml.etree.ElementTree.Element methods don't support
254# the namespace parameter
5f6a1245
JW
255
256
d7e66d39
JMF
257def xpath_with_ns(path, ns_map):
258 components = [c.split(':') for c in path.split('/')]
259 replaced = []
260 for c in components:
261 if len(c) == 1:
262 replaced.append(c[0])
263 else:
264 ns, tag = c
265 replaced.append('{%s}%s' % (ns_map[ns], tag))
266 return '/'.join(replaced)
267
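# Illustrative sketch (hand-checked against the code above; the namespace URI is
# just an example):
# >>> xpath_with_ns('media:content/media:thumbnail', {'media': 'http://search.yahoo.com/mrss/'})
# '{http://search.yahoo.com/mrss/}content/{http://search.yahoo.com/mrss/}thumbnail'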
d77c3dfd 268
a41fb80c 269def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
578c0745 270 def _find_xpath(xpath):
810c10ba 271 return node.find(compat_xpath(xpath))
578c0745
S
272
273 if isinstance(xpath, (str, compat_str)):
274 n = _find_xpath(xpath)
275 else:
276 for xp in xpath:
277 n = _find_xpath(xp)
278 if n is not None:
279 break
d74bebd5 280
8e636da4 281 if n is None:
bf42a990
S
282 if default is not NO_DEFAULT:
283 return default
284 elif fatal:
bf0ff932
PH
285 name = xpath if name is None else name
286 raise ExtractorError('Could not find XML element %s' % name)
287 else:
288 return None
a41fb80c
S
289 return n
290
291
292def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
8e636da4
S
293 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
294 if n is None or n == default:
295 return n
296 if n.text is None:
297 if default is not NO_DEFAULT:
298 return default
299 elif fatal:
300 name = xpath if name is None else name
301 raise ExtractorError('Could not find XML element\'s text %s' % name)
302 else:
303 return None
304 return n.text
a41fb80c
S
305
306
307def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
308 n = find_xpath_attr(node, xpath, key)
309 if n is None:
310 if default is not NO_DEFAULT:
311 return default
312 elif fatal:
313 name = '%s[@%s]' % (xpath, key) if name is None else name
314 raise ExtractorError('Could not find XML attribute %s' % name)
315 else:
316 return None
317 return n.attrib[key]
bf0ff932
PH
318
319
9e6dd238 320def get_element_by_id(id, html):
43e8fafd 321 """Return the content of the tag with the specified ID in the passed HTML document"""
611c1dd9 322 return get_element_by_attribute('id', id, html)
43e8fafd 323
12ea2f30 324
84c237fb
YCH
325def get_element_by_class(class_name, html):
326 return get_element_by_attribute(
327 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
328 html, escape_value=False)
329
330
331def get_element_by_attribute(attribute, value, html, escape_value=True):
43e8fafd 332 """Return the content of the tag with the specified attribute in the passed HTML document"""
9e6dd238 333
84c237fb
YCH
334 value = re.escape(value) if escape_value else value
335
38285056
PH
336 m = re.search(r'''(?xs)
337 <([a-zA-Z0-9:._-]+)
abc97b5e 338 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
38285056 339 \s+%s=['"]?%s['"]?
abc97b5e 340 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
38285056
PH
341 \s*>
342 (?P<content>.*?)
343 </\1>
84c237fb 344 ''' % (re.escape(attribute), value), html)
38285056
PH
345
346 if not m:
347 return None
348 res = m.group('content')
349
350 if res.startswith('"') or res.startswith("'"):
351 res = res[1:-1]
a921f407 352
38285056 353 return unescapeHTML(res)
a921f407 354
c5229f39 355
8bb56eee
BF
356class HTMLAttributeParser(compat_HTMLParser):
357 """Trivial HTML parser to gather the attributes for a single element"""
358 def __init__(self):
c5229f39 359 self.attrs = {}
8bb56eee
BF
360 compat_HTMLParser.__init__(self)
361
362 def handle_starttag(self, tag, attrs):
363 self.attrs = dict(attrs)
364
c5229f39 365
8bb56eee
BF
366def extract_attributes(html_element):
367 """Given a string for an HTML element such as
368 <el
369 a="foo" B="bar" c="&98;az" d=boz
370 empty= noval entity="&amp;"
371 sq='"' dq="'"
372 >
373 Decode and return a dictionary of attributes.
374 {
375 'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
376 'empty': '', 'noval': None, 'entity': '&',
377 'sq': '"', 'dq': '\''
378 }.
379 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
380 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
381 """
382 parser = HTMLAttributeParser()
383 parser.feed(html_element)
384 parser.close()
385 return parser.attrs
9e6dd238 386
c5229f39 387
9e6dd238 388def clean_html(html):
59ae15a5 389 """Clean an HTML snippet into a readable string"""
dd622d7c
PH
390
391 if html is None: # Convenience for sanitizing descriptions etc.
392 return html
393
59ae15a5
PH
394 # Newline vs <br />
395 html = html.replace('\n', ' ')
6b3aef80
FV
396 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
397 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
59ae15a5
PH
398 # Strip html tags
399 html = re.sub('<.*?>', '', html)
400 # Replace html entities
401 html = unescapeHTML(html)
7decf895 402 return html.strip()
9e6dd238
FV
403
404
d77c3dfd 405def sanitize_open(filename, open_mode):
59ae15a5
PH
406 """Try to open the given filename, and slightly tweak it if this fails.
407
408 Attempts to open the given filename. If this fails, it tries to change
409 the filename slightly, step by step, until it's either able to open it
410 or it fails and raises a final exception, like the standard open()
411 function.
412
413 It returns the tuple (stream, definitive_file_name).
414 """
415 try:
28e614de 416 if filename == '-':
59ae15a5
PH
417 if sys.platform == 'win32':
418 import msvcrt
419 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
898280a0 420 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
59ae15a5
PH
421 stream = open(encodeFilename(filename), open_mode)
422 return (stream, filename)
423 except (IOError, OSError) as err:
f45c185f
PH
424 if err.errno in (errno.EACCES,):
425 raise
59ae15a5 426
f45c185f 427 # In case of error, try to remove win32 forbidden chars
d55de57b 428 alt_filename = sanitize_path(filename)
f45c185f
PH
429 if alt_filename == filename:
430 raise
431 else:
432 # An exception here should be caught in the caller
d55de57b 433 stream = open(encodeFilename(alt_filename), open_mode)
f45c185f 434 return (stream, alt_filename)
d77c3dfd
FV
435
436
437def timeconvert(timestr):
59ae15a5
PH
438 """Convert RFC 2822 defined time string into system timestamp"""
439 timestamp = None
440 timetuple = email.utils.parsedate_tz(timestr)
441 if timetuple is not None:
442 timestamp = email.utils.mktime_tz(timetuple)
443 return timestamp
1c469a94 444
5f6a1245 445
796173d0 446def sanitize_filename(s, restricted=False, is_id=False):
59ae15a5
PH
447 """Sanitizes a string so it could be used as part of a filename.
448 If restricted is set, use a stricter subset of allowed characters.
796173d0 449 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
59ae15a5
PH
450 """
451 def replace_insane(char):
c587cbb7
AT
452 if restricted and char in ACCENT_CHARS:
453 return ACCENT_CHARS[char]
59ae15a5
PH
454 if char == '?' or ord(char) < 32 or ord(char) == 127:
455 return ''
456 elif char == '"':
457 return '' if restricted else '\''
458 elif char == ':':
459 return '_-' if restricted else ' -'
460 elif char in '\\/|*<>':
461 return '_'
627dcfff 462 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
59ae15a5
PH
463 return '_'
464 if restricted and ord(char) > 127:
465 return '_'
466 return char
467
2aeb06d6
PH
468 # Handle timestamps
469 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
28e614de 470 result = ''.join(map(replace_insane, s))
796173d0
PH
471 if not is_id:
472 while '__' in result:
473 result = result.replace('__', '_')
474 result = result.strip('_')
475 # Common case of "Foreign band name - English song title"
476 if restricted and result.startswith('-_'):
477 result = result[2:]
5a42414b
PH
478 if result.startswith('-'):
479 result = '_' + result[len('-'):]
a7440261 480 result = result.lstrip('.')
796173d0
PH
481 if not result:
482 result = '_'
59ae15a5 483 return result
d77c3dfd 484
5f6a1245 485
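# Illustrative sketch (values hand-checked against replace_insane() above, not
# executed here):
# >>> sanitize_filename('New World record at 0:12:34')
# 'New World record at 0_12_34'
# >>> sanitize_filename('aäb', restricted=True)
# 'aab'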
a2aaf4db
S
486def sanitize_path(s):
487 """Sanitizes and normalizes path on Windows"""
488 if sys.platform != 'win32':
489 return s
be531ef1
S
490 drive_or_unc, _ = os.path.splitdrive(s)
491 if sys.version_info < (2, 7) and not drive_or_unc:
492 drive_or_unc, _ = os.path.splitunc(s)
493 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
494 if drive_or_unc:
a2aaf4db
S
495 norm_path.pop(0)
496 sanitized_path = [
c90d16cf 497 path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
a2aaf4db 498 for path_part in norm_path]
be531ef1
S
499 if drive_or_unc:
500 sanitized_path.insert(0, drive_or_unc + os.path.sep)
a2aaf4db
S
501 return os.path.join(*sanitized_path)
502
503
67dda517
S
 504# Prepend protocol-less URLs with the `http:` scheme in order to reduce the number of
 505# failures caused by a missing protocol
17bcc626
S
506def sanitize_url(url):
507 return 'http:%s' % url if url.startswith('//') else url
508
509
67dda517 510def sanitized_Request(url, *args, **kwargs):
17bcc626 511 return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
67dda517
S
512
513
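# Illustrative sketch (the domain is made up):
# >>> sanitize_url('//example.com/watch?v=abc')
# 'http://example.com/watch?v=abc'
# sanitized_Request() below applies the same fix before building the urllib request.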
d77c3dfd 514def orderedSet(iterable):
59ae15a5
PH
515 """ Remove all duplicates from the input iterable """
516 res = []
517 for el in iterable:
518 if el not in res:
519 res.append(el)
520 return res
d77c3dfd 521
912b38b4 522
55b2f099 523def _htmlentity_transform(entity_with_semicolon):
4e408e47 524 """Transforms an HTML entity to a character."""
55b2f099
YCH
525 entity = entity_with_semicolon[:-1]
526
4e408e47
PH
527 # Known non-numeric HTML entity
528 if entity in compat_html_entities.name2codepoint:
529 return compat_chr(compat_html_entities.name2codepoint[entity])
530
55b2f099
YCH
531 # TODO: HTML5 allows entities without a semicolon. For example,
532 # '&Eacuteric' should be decoded as 'Éric'.
533 if entity_with_semicolon in compat_html_entities_html5:
534 return compat_html_entities_html5[entity_with_semicolon]
535
91757b0f 536 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
4e408e47
PH
537 if mobj is not None:
538 numstr = mobj.group(1)
28e614de 539 if numstr.startswith('x'):
4e408e47 540 base = 16
28e614de 541 numstr = '0%s' % numstr
4e408e47
PH
542 else:
543 base = 10
7aefc49c
S
544 # See https://github.com/rg3/youtube-dl/issues/7518
545 try:
546 return compat_chr(int(numstr, base))
547 except ValueError:
548 pass
4e408e47
PH
549
550 # Unknown entity in name, return its literal representation
7a3f0c00 551 return '&%s;' % entity
4e408e47
PH
552
553
d77c3dfd 554def unescapeHTML(s):
912b38b4
PH
555 if s is None:
556 return None
557 assert type(s) == compat_str
d77c3dfd 558
4e408e47 559 return re.sub(
55b2f099 560 r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 561
8bf48f23 562
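# Illustrative sketch (hand-checked against _htmlentity_transform above):
# >>> unescapeHTML('&amp;')
# '&'
# >>> unescapeHTML('&#x2F;')
# '/'
# >>> unescapeHTML('&eacute;')
# 'é'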
aa49acd1
S
563def get_subprocess_encoding():
564 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
565 # For subprocess calls, encode with locale encoding
566 # Refer to http://stackoverflow.com/a/9951851/35070
567 encoding = preferredencoding()
568 else:
569 encoding = sys.getfilesystemencoding()
570 if encoding is None:
571 encoding = 'utf-8'
572 return encoding
573
574
8bf48f23 575def encodeFilename(s, for_subprocess=False):
59ae15a5
PH
576 """
577 @param s The name of the file
578 """
d77c3dfd 579
8bf48f23 580 assert type(s) == compat_str
d77c3dfd 581
59ae15a5
PH
582 # Python 3 has a Unicode API
583 if sys.version_info >= (3, 0):
584 return s
0f00efed 585
aa49acd1
S
586 # Pass '' directly to use Unicode APIs on Windows 2000 and up
587 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
588 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
589 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
590 return s
591
8ee239e9
YCH
592 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
593 if sys.platform.startswith('java'):
594 return s
595
aa49acd1
S
596 return s.encode(get_subprocess_encoding(), 'ignore')
597
598
599def decodeFilename(b, for_subprocess=False):
600
601 if sys.version_info >= (3, 0):
602 return b
603
604 if not isinstance(b, bytes):
605 return b
606
607 return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 608
f07b74fc
PH
609
610def encodeArgument(s):
611 if not isinstance(s, compat_str):
612 # Legacy code that uses byte strings
613 # Uncomment the following line after fixing all post processors
7af808a5 614 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
f07b74fc
PH
615 s = s.decode('ascii')
616 return encodeFilename(s, True)
617
618
aa49acd1
S
619def decodeArgument(b):
620 return decodeFilename(b, True)
621
622
8271226a
PH
623def decodeOption(optval):
624 if optval is None:
625 return optval
626 if isinstance(optval, bytes):
627 optval = optval.decode(preferredencoding())
628
629 assert isinstance(optval, compat_str)
630 return optval
1c256f70 631
5f6a1245 632
4539dd30
PH
633def formatSeconds(secs):
634 if secs > 3600:
635 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
636 elif secs > 60:
637 return '%d:%02d' % (secs // 60, secs % 60)
638 else:
639 return '%d' % secs
640
a0ddb8a2 641
be4a824d
PH
642def make_HTTPS_handler(params, **kwargs):
643 opts_no_check_certificate = params.get('nocheckcertificate', False)
0db261ba 644 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
be5f2c19 645 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
0db261ba 646 if opts_no_check_certificate:
be5f2c19 647 context.check_hostname = False
0db261ba 648 context.verify_mode = ssl.CERT_NONE
a2366922 649 try:
be4a824d 650 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
a2366922
PH
651 except TypeError:
652 # Python 2.7.8
653 # (create_default_context present but HTTPSHandler has no context=)
654 pass
655
656 if sys.version_info < (3, 2):
d7932313 657 return YoutubeDLHTTPSHandler(params, **kwargs)
aa37e3d4 658 else: # Python < 3.4
d7932313 659 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
ea6d901e 660 context.verify_mode = (ssl.CERT_NONE
dca08720 661 if opts_no_check_certificate
ea6d901e 662 else ssl.CERT_REQUIRED)
303b479e 663 context.set_default_verify_paths()
be4a824d 664 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 665
732ea2f0 666
08f2a92c
JMF
667def bug_reports_message():
668 if ytdl_is_updateable():
669 update_cmd = 'type youtube-dl -U to update'
670 else:
671 update_cmd = 'see https://yt-dl.org/update on how to update'
672 msg = '; please report this issue on https://yt-dl.org/bug .'
673 msg += ' Make sure you are using the latest version; %s.' % update_cmd
674 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
675 return msg
676
677
1c256f70
PH
678class ExtractorError(Exception):
679 """Error during info extraction."""
5f6a1245 680
d11271dd 681 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
9a82b238
PH
682 """ tb, if given, is the original traceback (so that it can be printed out).
683 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
684 """
685
686 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
687 expected = True
d11271dd
PH
688 if video_id is not None:
689 msg = video_id + ': ' + msg
410f3e73 690 if cause:
28e614de 691 msg += ' (caused by %r)' % cause
9a82b238 692 if not expected:
08f2a92c 693 msg += bug_reports_message()
1c256f70 694 super(ExtractorError, self).__init__(msg)
d5979c5d 695
1c256f70 696 self.traceback = tb
8cc83b8d 697 self.exc_info = sys.exc_info() # preserve original exception
2eabb802 698 self.cause = cause
d11271dd 699 self.video_id = video_id
1c256f70 700
01951dda
PH
701 def format_traceback(self):
702 if self.traceback is None:
703 return None
28e614de 704 return ''.join(traceback.format_tb(self.traceback))
01951dda 705
1c256f70 706
416c7fcb
PH
707class UnsupportedError(ExtractorError):
708 def __init__(self, url):
709 super(UnsupportedError, self).__init__(
710 'Unsupported URL: %s' % url, expected=True)
711 self.url = url
712
713
55b3e45b
JMF
714class RegexNotFoundError(ExtractorError):
715 """Error when a regex didn't match"""
716 pass
717
718
d77c3dfd 719class DownloadError(Exception):
59ae15a5 720 """Download Error exception.
d77c3dfd 721
59ae15a5
PH
722 This exception may be thrown by FileDownloader objects if they are not
723 configured to continue on errors. They will contain the appropriate
724 error message.
725 """
5f6a1245 726
8cc83b8d
FV
727 def __init__(self, msg, exc_info=None):
728 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
729 super(DownloadError, self).__init__(msg)
730 self.exc_info = exc_info
d77c3dfd
FV
731
732
733class SameFileError(Exception):
59ae15a5 734 """Same File exception.
d77c3dfd 735
59ae15a5
PH
736 This exception will be thrown by FileDownloader objects if they detect
737 multiple files would have to be downloaded to the same file on disk.
738 """
739 pass
d77c3dfd
FV
740
741
742class PostProcessingError(Exception):
59ae15a5 743 """Post Processing exception.
d77c3dfd 744
59ae15a5
PH
745 This exception may be raised by PostProcessor's .run() method to
746 indicate an error in the postprocessing task.
747 """
5f6a1245 748
7851b379
PH
749 def __init__(self, msg):
750 self.msg = msg
d77c3dfd 751
5f6a1245 752
d77c3dfd 753class MaxDownloadsReached(Exception):
59ae15a5
PH
754 """ --max-downloads limit has been reached. """
755 pass
d77c3dfd
FV
756
757
758class UnavailableVideoError(Exception):
59ae15a5 759 """Unavailable Format exception.
d77c3dfd 760
59ae15a5
PH
761 This exception will be thrown when a video is requested
762 in a format that is not available for that video.
763 """
764 pass
d77c3dfd
FV
765
766
767class ContentTooShortError(Exception):
59ae15a5 768 """Content Too Short exception.
d77c3dfd 769
59ae15a5
PH
770 This exception may be raised by FileDownloader objects when a file they
771 download is too small for what the server announced first, indicating
772 the connection was probably interrupted.
773 """
d77c3dfd 774
59ae15a5 775 def __init__(self, downloaded, expected):
2c7ed247 776 # Both in bytes
59ae15a5
PH
777 self.downloaded = downloaded
778 self.expected = expected
d77c3dfd 779
5f6a1245 780
efa97bdc
YCH
781class XAttrMetadataError(Exception):
782 def __init__(self, code=None, msg='Unknown error'):
783 super(XAttrMetadataError, self).__init__(msg)
784 self.code = code
bd264412 785 self.msg = msg
efa97bdc
YCH
786
787 # Parsing code and msg
788 if (self.code in (errno.ENOSPC, errno.EDQUOT) or
 789 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
790 self.reason = 'NO_SPACE'
791 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
792 self.reason = 'VALUE_TOO_LONG'
793 else:
794 self.reason = 'NOT_SUPPORTED'
795
796
797class XAttrUnavailableError(Exception):
798 pass
799
800
c5a59d93 801def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
e5e78797
S
802 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
803 # expected HTTP responses to meet HTTP/1.0 or later (see also
804 # https://github.com/rg3/youtube-dl/issues/6727)
805 if sys.version_info < (3, 0):
5a1a2e94 806 kwargs[b'strict'] = True
be4a824d
PH
807 hc = http_class(*args, **kwargs)
808 source_address = ydl_handler._params.get('source_address')
809 if source_address is not None:
810 sa = (source_address, 0)
811 if hasattr(hc, 'source_address'): # Python 2.7+
812 hc.source_address = sa
813 else: # Python 2.6
814 def _hc_connect(self, *args, **kwargs):
815 sock = compat_socket_create_connection(
816 (self.host, self.port), self.timeout, sa)
817 if is_https:
d7932313
PH
818 self.sock = ssl.wrap_socket(
819 sock, self.key_file, self.cert_file,
820 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
821 else:
822 self.sock = sock
823 hc.connect = functools.partial(_hc_connect, hc)
824
825 return hc
826
827
87f0e62d 828def handle_youtubedl_headers(headers):
992fc9d6
YCH
829 filtered_headers = headers
830
831 if 'Youtubedl-no-compression' in filtered_headers:
832 filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
87f0e62d 833 del filtered_headers['Youtubedl-no-compression']
87f0e62d 834
992fc9d6 835 return filtered_headers
87f0e62d
YCH
836
837
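# Illustrative sketch (header values are made up): the marker header is dropped and
# any Accept-Encoding entry is removed along with it.
# >>> handle_youtubedl_headers({'User-Agent': 'UA', 'Accept-Encoding': 'gzip', 'Youtubedl-no-compression': 'True'})
# {'User-Agent': 'UA'}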
acebc9cd 838class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
59ae15a5
PH
839 """Handler for HTTP requests and responses.
840
841 This class, when installed with an OpenerDirector, automatically adds
842 the standard headers to every HTTP request and handles gzipped and
843 deflated responses from web servers. If compression is to be avoided in
844 a particular request, the original request in the program code only has
0424ec30 845 to include the HTTP header "Youtubedl-no-compression", which will be
59ae15a5
PH
846 removed before making the real request.
847
848 Part of this code was copied from:
849
850 http://techknack.net/python-urllib2-handlers/
851
852 Andrew Rowls, the author of that code, agreed to release it to the
853 public domain.
854 """
855
be4a824d
PH
856 def __init__(self, params, *args, **kwargs):
857 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
858 self._params = params
859
860 def http_open(self, req):
71aff188
YCH
861 conn_class = compat_http_client.HTTPConnection
862
863 socks_proxy = req.headers.get('Ytdl-socks-proxy')
864 if socks_proxy:
865 conn_class = make_socks_conn_class(conn_class, socks_proxy)
866 del req.headers['Ytdl-socks-proxy']
867
be4a824d 868 return self.do_open(functools.partial(
71aff188 869 _create_http_connection, self, conn_class, False),
be4a824d
PH
870 req)
871
59ae15a5
PH
872 @staticmethod
873 def deflate(data):
874 try:
875 return zlib.decompress(data, -zlib.MAX_WBITS)
876 except zlib.error:
877 return zlib.decompress(data)
878
879 @staticmethod
880 def addinfourl_wrapper(stream, headers, url, code):
881 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
882 return compat_urllib_request.addinfourl(stream, headers, url, code)
883 ret = compat_urllib_request.addinfourl(stream, headers, url)
884 ret.code = code
885 return ret
886
acebc9cd 887 def http_request(self, req):
51f267d9
S
 888 # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
 889 # always respected by websites: some hand out URLs with non-percent-encoded
890 # non-ASCII characters (see telemb.py, ard.py [#3412])
891 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
892 # To work around aforementioned issue we will replace request's original URL with
893 # percent-encoded one
894 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
895 # the code of this workaround has been moved here from YoutubeDL.urlopen()
896 url = req.get_full_url()
897 url_escaped = escape_url(url)
898
899 # Substitute URL if any change after escaping
900 if url != url_escaped:
15d260eb 901 req = update_Request(req, url=url_escaped)
51f267d9 902
33ac271b 903 for h, v in std_headers.items():
3d5f7a39
JK
904 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
905 # The dict keys are capitalized because of this bug by urllib
906 if h.capitalize() not in req.headers:
33ac271b 907 req.add_header(h, v)
87f0e62d
YCH
908
909 req.headers = handle_youtubedl_headers(req.headers)
989b4b2b
PH
910
911 if sys.version_info < (2, 7) and '#' in req.get_full_url():
912 # Python 2.6 is brain-dead when it comes to fragments
913 req._Request__original = req._Request__original.partition('#')[0]
914 req._Request__r_type = req._Request__r_type.partition('#')[0]
915
59ae15a5
PH
916 return req
917
acebc9cd 918 def http_response(self, req, resp):
59ae15a5
PH
919 old_resp = resp
920 # gzip
921 if resp.headers.get('Content-encoding', '') == 'gzip':
aa3e9507
PH
922 content = resp.read()
923 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
924 try:
925 uncompressed = io.BytesIO(gz.read())
926 except IOError as original_ioerror:
 927 # There may be junk at the end of the file
928 # See http://stackoverflow.com/q/4928560/35070 for details
929 for i in range(1, 1024):
930 try:
931 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
932 uncompressed = io.BytesIO(gz.read())
933 except IOError:
934 continue
935 break
936 else:
937 raise original_ioerror
938 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 939 resp.msg = old_resp.msg
c047270c 940 del resp.headers['Content-encoding']
59ae15a5
PH
941 # deflate
942 if resp.headers.get('Content-encoding', '') == 'deflate':
943 gz = io.BytesIO(self.deflate(resp.read()))
944 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
945 resp.msg = old_resp.msg
c047270c 946 del resp.headers['Content-encoding']
ad729172
S
947 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
948 # https://github.com/rg3/youtube-dl/issues/6457).
5a4d9ddb
S
949 if 300 <= resp.code < 400:
950 location = resp.headers.get('Location')
951 if location:
952 # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
953 if sys.version_info >= (3, 0):
954 location = location.encode('iso-8859-1').decode('utf-8')
0ea59007
YCH
955 else:
956 location = location.decode('utf-8')
5a4d9ddb
S
957 location_escaped = escape_url(location)
958 if location != location_escaped:
959 del resp.headers['Location']
9a4aec8b
YCH
960 if sys.version_info < (3, 0):
961 location_escaped = location_escaped.encode('utf-8')
5a4d9ddb 962 resp.headers['Location'] = location_escaped
59ae15a5 963 return resp
0f8d03f8 964
acebc9cd
PH
965 https_request = http_request
966 https_response = http_response
bf50b038 967
5de90176 968
71aff188
YCH
969def make_socks_conn_class(base_class, socks_proxy):
970 assert issubclass(base_class, (
971 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
972
973 url_components = compat_urlparse.urlparse(socks_proxy)
974 if url_components.scheme.lower() == 'socks5':
975 socks_type = ProxyType.SOCKS5
976 elif url_components.scheme.lower() in ('socks', 'socks4'):
977 socks_type = ProxyType.SOCKS4
51fb4995
YCH
978 elif url_components.scheme.lower() == 'socks4a':
979 socks_type = ProxyType.SOCKS4A
71aff188 980
cdd94c2e
YCH
981 def unquote_if_non_empty(s):
982 if not s:
983 return s
984 return compat_urllib_parse_unquote_plus(s)
985
71aff188
YCH
986 proxy_args = (
987 socks_type,
988 url_components.hostname, url_components.port or 1080,
989 True, # Remote DNS
cdd94c2e
YCH
990 unquote_if_non_empty(url_components.username),
991 unquote_if_non_empty(url_components.password),
71aff188
YCH
992 )
993
994 class SocksConnection(base_class):
995 def connect(self):
996 self.sock = sockssocket()
997 self.sock.setproxy(*proxy_args)
998 if type(self.timeout) in (int, float):
999 self.sock.settimeout(self.timeout)
1000 self.sock.connect((self.host, self.port))
1001
1002 if isinstance(self, compat_http_client.HTTPSConnection):
1003 if hasattr(self, '_context'): # Python > 2.6
1004 self.sock = self._context.wrap_socket(
1005 self.sock, server_hostname=self.host)
1006 else:
1007 self.sock = ssl.wrap_socket(self.sock)
1008
1009 return SocksConnection
1010
1011
be4a824d
PH
1012class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1013 def __init__(self, params, https_conn_class=None, *args, **kwargs):
1014 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1015 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1016 self._params = params
1017
1018 def https_open(self, req):
4f264c02 1019 kwargs = {}
71aff188
YCH
1020 conn_class = self._https_conn_class
1021
4f264c02
JMF
1022 if hasattr(self, '_context'): # python > 2.6
1023 kwargs['context'] = self._context
1024 if hasattr(self, '_check_hostname'): # python 3.x
1025 kwargs['check_hostname'] = self._check_hostname
71aff188
YCH
1026
1027 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1028 if socks_proxy:
1029 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1030 del req.headers['Ytdl-socks-proxy']
1031
be4a824d 1032 return self.do_open(functools.partial(
71aff188 1033 _create_http_connection, self, conn_class, True),
4f264c02 1034 req, **kwargs)
be4a824d
PH
1035
1036
a6420bf5
S
1037class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1038 def __init__(self, cookiejar=None):
1039 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1040
1041 def http_response(self, request, response):
 1042 # Python 2 will choke on the next HTTP request if there are non-ASCII
 1043 # characters in the Set-Cookie HTTP header of the last response (see
1044 # https://github.com/rg3/youtube-dl/issues/6769).
1045 # In order to at least prevent crashing we will percent encode Set-Cookie
1046 # header before HTTPCookieProcessor starts processing it.
e28034c5
S
1047 # if sys.version_info < (3, 0) and response.headers:
1048 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1049 # set_cookie = response.headers.get(set_cookie_header)
1050 # if set_cookie:
1051 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1052 # if set_cookie != set_cookie_escaped:
1053 # del response.headers[set_cookie_header]
1054 # response.headers[set_cookie_header] = set_cookie_escaped
a6420bf5
S
1055 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1056
1057 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1058 https_response = http_response
1059
1060
46f59e89
S
1061def extract_timezone(date_str):
1062 m = re.search(
1063 r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
1064 date_str)
1065 if not m:
1066 timezone = datetime.timedelta()
1067 else:
1068 date_str = date_str[:-len(m.group('tz'))]
1069 if not m.group('sign'):
1070 timezone = datetime.timedelta()
1071 else:
1072 sign = 1 if m.group('sign') == '+' else -1
1073 timezone = datetime.timedelta(
1074 hours=sign * int(m.group('hours')),
1075 minutes=sign * int(m.group('minutes')))
1076 return timezone, date_str
1077
1078
08b38d54 1079def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1080 """ Return a UNIX timestamp from the given date """
1081
1082 if date_str is None:
1083 return None
1084
52c3a6e4
S
1085 date_str = re.sub(r'\.[0-9]+', '', date_str)
1086
08b38d54 1087 if timezone is None:
46f59e89
S
1088 timezone, date_str = extract_timezone(date_str)
1089
52c3a6e4
S
1090 try:
1091 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1092 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1093 return calendar.timegm(dt.timetuple())
1094 except ValueError:
1095 pass
912b38b4
PH
1096
1097
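# Illustrative sketch (hand-checked, not executed here):
# >>> parse_iso8601('1970-01-01T01:00:00Z')
# 3600
# >>> parse_iso8601('1970-01-01 01:00:00+01:00', delimiter=' ')
# 0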
46f59e89
S
1098def date_formats(day_first=True):
1099 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1100
1101
42bdd9d0 1102def unified_strdate(date_str, day_first=True):
bf50b038 1103 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1104
1105 if date_str is None:
1106 return None
bf50b038 1107 upload_date = None
5f6a1245 1108 # Replace commas
026fcc04 1109 date_str = date_str.replace(',', ' ')
42bdd9d0 1110 # Remove AM/PM + timezone
9bb8e0a3 1111 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1112 _, date_str = extract_timezone(date_str)
42bdd9d0 1113
46f59e89 1114 for expression in date_formats(day_first):
bf50b038
JMF
1115 try:
1116 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
5de90176 1117 except ValueError:
bf50b038 1118 pass
42393ce2
PH
1119 if upload_date is None:
1120 timetuple = email.utils.parsedate_tz(date_str)
1121 if timetuple:
c6b9cf05
S
1122 try:
1123 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1124 except ValueError:
1125 pass
6a750402
JMF
1126 if upload_date is not None:
1127 return compat_str(upload_date)
bf50b038 1128
5f6a1245 1129
46f59e89
S
1130def unified_timestamp(date_str, day_first=True):
1131 if date_str is None:
1132 return None
1133
1134 date_str = date_str.replace(',', ' ')
1135
7dc2a74e 1136 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1137 timezone, date_str = extract_timezone(date_str)
1138
1139 # Remove AM/PM + timezone
1140 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1141
1142 for expression in date_formats(day_first):
1143 try:
7dc2a74e 1144 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89
S
1145 return calendar.timegm(dt.timetuple())
1146 except ValueError:
1147 pass
1148 timetuple = email.utils.parsedate_tz(date_str)
1149 if timetuple:
7dc2a74e 1150 return calendar.timegm(timetuple) + pm_delta * 3600
46f59e89
S
1151
1152
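# Illustrative sketch (hand-checked against the format tables above):
# >>> unified_strdate('December 21, 2010')
# '20101221'
# >>> unified_timestamp('1970-01-01T01:00:00Z')
# 3600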
28e614de 1153def determine_ext(url, default_ext='unknown_video'):
f4776371
S
1154 if url is None:
1155 return default_ext
9cb9a5df 1156 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1157 if re.match(r'^[A-Za-z0-9]+$', guess):
1158 return guess
a7aaa398
S
 1159 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1160 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1161 return guess.rstrip('/')
73e79f2a 1162 else:
cbdbb766 1163 return default_ext
73e79f2a 1164
5f6a1245 1165
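# Illustrative sketch (URLs are made up):
# >>> determine_ext('http://example.com/video.mp4?dl=1')
# 'mp4'
# >>> determine_ext('http://example.com/foo/bar.mp4/?download')
# 'mp4'
# >>> determine_ext('http://example.com/stream')
# 'unknown_video'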
d4051a8e 1166def subtitles_filename(filename, sub_lang, sub_format):
28e614de 1167 return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
d4051a8e 1168
5f6a1245 1169
bd558525 1170def date_from_str(date_str):
37254abc
JMF
1171 """
1172 Return a datetime object from a string in the format YYYYMMDD or
1173 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1174 today = datetime.date.today()
f8795e10 1175 if date_str in ('now', 'today'):
37254abc 1176 return today
f8795e10
PH
1177 if date_str == 'yesterday':
1178 return today - datetime.timedelta(days=1)
37254abc
JMF
1179 match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
1180 if match is not None:
1181 sign = match.group('sign')
1182 time = int(match.group('time'))
1183 if sign == '-':
1184 time = -time
1185 unit = match.group('unit')
dfb1b146 1186 # A bad approximation?
37254abc
JMF
1187 if unit == 'month':
1188 unit = 'day'
1189 time *= 30
1190 elif unit == 'year':
1191 unit = 'day'
1192 time *= 365
1193 unit += 's'
1194 delta = datetime.timedelta(**{unit: time})
1195 return today + delta
611c1dd9 1196 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
5f6a1245
JW
1197
1198
e63fc1be 1199def hyphenate_date(date_str):
1200 """
1201 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1202 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1203 if match is not None:
1204 return '-'.join(match.groups())
1205 else:
1206 return date_str
1207
5f6a1245 1208
bd558525
JMF
1209class DateRange(object):
1210 """Represents a time interval between two dates"""
5f6a1245 1211
bd558525
JMF
1212 def __init__(self, start=None, end=None):
1213 """start and end must be strings in the format accepted by date"""
1214 if start is not None:
1215 self.start = date_from_str(start)
1216 else:
1217 self.start = datetime.datetime.min.date()
1218 if end is not None:
1219 self.end = date_from_str(end)
1220 else:
1221 self.end = datetime.datetime.max.date()
37254abc 1222 if self.start > self.end:
bd558525 1223 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
5f6a1245 1224
bd558525
JMF
1225 @classmethod
1226 def day(cls, day):
1227 """Returns a range that only contains the given day"""
5f6a1245
JW
1228 return cls(day, day)
1229
bd558525
JMF
1230 def __contains__(self, date):
1231 """Check if the date is in the range"""
37254abc
JMF
1232 if not isinstance(date, datetime.date):
1233 date = date_from_str(date)
1234 return self.start <= date <= self.end
5f6a1245 1235
bd558525 1236 def __str__(self):
5f6a1245 1237 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
1238
1239
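# Illustrative sketch (the dates are arbitrary):
# >>> date_from_str('now-1week') == datetime.date.today() - datetime.timedelta(days=7)
# True
# >>> '20100610' in DateRange('20100101', '20101231')
# True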
1240def platform_name():
1241 """ Returns the platform name as a compat_str """
1242 res = platform.platform()
1243 if isinstance(res, bytes):
1244 res = res.decode(preferredencoding())
1245
1246 assert isinstance(res, compat_str)
1247 return res
c257baff
PH
1248
1249
b58ddb32
PH
1250def _windows_write_string(s, out):
1251 """ Returns True if the string was written using special methods,
1252 False if it has yet to be written out."""
1253 # Adapted from http://stackoverflow.com/a/3259271/35070
1254
1255 import ctypes
1256 import ctypes.wintypes
1257
1258 WIN_OUTPUT_IDS = {
1259 1: -11,
1260 2: -12,
1261 }
1262
a383a98a
PH
1263 try:
1264 fileno = out.fileno()
1265 except AttributeError:
1266 # If the output stream doesn't have a fileno, it's virtual
1267 return False
aa42e873
PH
1268 except io.UnsupportedOperation:
1269 # Some strange Windows pseudo files?
1270 return False
b58ddb32
PH
1271 if fileno not in WIN_OUTPUT_IDS:
1272 return False
1273
e2f89ec7 1274 GetStdHandle = ctypes.WINFUNCTYPE(
b58ddb32 1275 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
611c1dd9 1276 (b'GetStdHandle', ctypes.windll.kernel32))
b58ddb32
PH
1277 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1278
e2f89ec7 1279 WriteConsoleW = ctypes.WINFUNCTYPE(
b58ddb32
PH
1280 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1281 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
611c1dd9 1282 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
b58ddb32
PH
1283 written = ctypes.wintypes.DWORD(0)
1284
611c1dd9 1285 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
b58ddb32
PH
1286 FILE_TYPE_CHAR = 0x0002
1287 FILE_TYPE_REMOTE = 0x8000
e2f89ec7 1288 GetConsoleMode = ctypes.WINFUNCTYPE(
b58ddb32
PH
1289 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1290 ctypes.POINTER(ctypes.wintypes.DWORD))(
611c1dd9 1291 (b'GetConsoleMode', ctypes.windll.kernel32))
b58ddb32
PH
1292 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1293
1294 def not_a_console(handle):
1295 if handle == INVALID_HANDLE_VALUE or handle is None:
1296 return True
8fb3ac36
PH
1297 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1298 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
b58ddb32
PH
1299
1300 if not_a_console(h):
1301 return False
1302
d1b9c912
PH
1303 def next_nonbmp_pos(s):
1304 try:
1305 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1306 except StopIteration:
1307 return len(s)
1308
1309 while s:
1310 count = min(next_nonbmp_pos(s), 1024)
1311
b58ddb32 1312 ret = WriteConsoleW(
d1b9c912 1313 h, s, count if count else 2, ctypes.byref(written), None)
b58ddb32
PH
1314 if ret == 0:
1315 raise OSError('Failed to write string')
d1b9c912
PH
1316 if not count: # We just wrote a non-BMP character
1317 assert written.value == 2
1318 s = s[1:]
1319 else:
1320 assert written.value > 0
1321 s = s[written.value:]
b58ddb32
PH
1322 return True
1323
1324
734f90bb 1325def write_string(s, out=None, encoding=None):
7459e3a2
PH
1326 if out is None:
1327 out = sys.stderr
8bf48f23 1328 assert type(s) == compat_str
7459e3a2 1329
b58ddb32
PH
1330 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1331 if _windows_write_string(s, out):
1332 return
1333
7459e3a2
PH
1334 if ('b' in getattr(out, 'mode', '') or
1335 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
104aa738
PH
1336 byt = s.encode(encoding or preferredencoding(), 'ignore')
1337 out.write(byt)
1338 elif hasattr(out, 'buffer'):
1339 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1340 byt = s.encode(enc, 'ignore')
1341 out.buffer.write(byt)
1342 else:
8bf48f23 1343 out.write(s)
7459e3a2
PH
1344 out.flush()
1345
1346
48ea9cea
PH
1347def bytes_to_intlist(bs):
1348 if not bs:
1349 return []
1350 if isinstance(bs[0], int): # Python 3
1351 return list(bs)
1352 else:
1353 return [ord(c) for c in bs]
1354
c257baff 1355
cba892fa 1356def intlist_to_bytes(xs):
1357 if not xs:
1358 return b''
edaa23f8 1359 return compat_struct_pack('%dB' % len(xs), *xs)
c38b1e77
PH
1360
1361
c1c9a79c
PH
1362# Cross-platform file locking
1363if sys.platform == 'win32':
1364 import ctypes.wintypes
1365 import msvcrt
1366
1367 class OVERLAPPED(ctypes.Structure):
1368 _fields_ = [
1369 ('Internal', ctypes.wintypes.LPVOID),
1370 ('InternalHigh', ctypes.wintypes.LPVOID),
1371 ('Offset', ctypes.wintypes.DWORD),
1372 ('OffsetHigh', ctypes.wintypes.DWORD),
1373 ('hEvent', ctypes.wintypes.HANDLE),
1374 ]
1375
1376 kernel32 = ctypes.windll.kernel32
1377 LockFileEx = kernel32.LockFileEx
1378 LockFileEx.argtypes = [
1379 ctypes.wintypes.HANDLE, # hFile
1380 ctypes.wintypes.DWORD, # dwFlags
1381 ctypes.wintypes.DWORD, # dwReserved
1382 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1383 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1384 ctypes.POINTER(OVERLAPPED) # Overlapped
1385 ]
1386 LockFileEx.restype = ctypes.wintypes.BOOL
1387 UnlockFileEx = kernel32.UnlockFileEx
1388 UnlockFileEx.argtypes = [
1389 ctypes.wintypes.HANDLE, # hFile
1390 ctypes.wintypes.DWORD, # dwReserved
1391 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1392 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1393 ctypes.POINTER(OVERLAPPED) # Overlapped
1394 ]
1395 UnlockFileEx.restype = ctypes.wintypes.BOOL
1396 whole_low = 0xffffffff
1397 whole_high = 0x7fffffff
1398
1399 def _lock_file(f, exclusive):
1400 overlapped = OVERLAPPED()
1401 overlapped.Offset = 0
1402 overlapped.OffsetHigh = 0
1403 overlapped.hEvent = 0
1404 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1405 handle = msvcrt.get_osfhandle(f.fileno())
1406 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1407 whole_low, whole_high, f._lock_file_overlapped_p):
1408 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1409
1410 def _unlock_file(f):
1411 assert f._lock_file_overlapped_p
1412 handle = msvcrt.get_osfhandle(f.fileno())
1413 if not UnlockFileEx(handle, 0,
1414 whole_low, whole_high, f._lock_file_overlapped_p):
1415 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1416
1417else:
399a76e6
YCH
 1418 # Some platforms, such as Jython, are missing fcntl
1419 try:
1420 import fcntl
c1c9a79c 1421
399a76e6
YCH
1422 def _lock_file(f, exclusive):
1423 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
c1c9a79c 1424
399a76e6
YCH
1425 def _unlock_file(f):
1426 fcntl.flock(f, fcntl.LOCK_UN)
1427 except ImportError:
1428 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1429
1430 def _lock_file(f, exclusive):
1431 raise IOError(UNSUPPORTED_MSG)
1432
1433 def _unlock_file(f):
1434 raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
1435
1436
1437class locked_file(object):
1438 def __init__(self, filename, mode, encoding=None):
1439 assert mode in ['r', 'a', 'w']
1440 self.f = io.open(filename, mode, encoding=encoding)
1441 self.mode = mode
1442
1443 def __enter__(self):
1444 exclusive = self.mode != 'r'
1445 try:
1446 _lock_file(self.f, exclusive)
1447 except IOError:
1448 self.f.close()
1449 raise
1450 return self
1451
1452 def __exit__(self, etype, value, traceback):
1453 try:
1454 _unlock_file(self.f)
1455 finally:
1456 self.f.close()
1457
1458 def __iter__(self):
1459 return iter(self.f)
1460
1461 def write(self, *args):
1462 return self.f.write(*args)
1463
1464 def read(self, *args):
1465 return self.f.read(*args)
4eb7f1d1
JMF
1466
1467
4644ac55
S
1468def get_filesystem_encoding():
1469 encoding = sys.getfilesystemencoding()
1470 return encoding if encoding is not None else 'utf-8'
1471
1472
4eb7f1d1 1473def shell_quote(args):
a6a173c2 1474 quoted_args = []
4644ac55 1475 encoding = get_filesystem_encoding()
a6a173c2
JMF
1476 for a in args:
1477 if isinstance(a, bytes):
1478 # We may get a filename encoded with 'encodeFilename'
1479 a = a.decode(encoding)
1480 quoted_args.append(pipes.quote(a))
28e614de 1481 return ' '.join(quoted_args)
9d4660ca
PH
1482
1483
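# Illustrative sketch (the file name is made up):
# >>> shell_quote(['ffmpeg', '-i', 'my file.mp4'])
# "ffmpeg -i 'my file.mp4'"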
1484def smuggle_url(url, data):
1485 """ Pass additional data in a URL for internal use. """
1486
81953d1a
RA
1487 url, idata = unsmuggle_url(url, {})
1488 data.update(idata)
15707c7e 1489 sdata = compat_urllib_parse_urlencode(
28e614de
PH
1490 {'__youtubedl_smuggle': json.dumps(data)})
1491 return url + '#' + sdata
9d4660ca
PH
1492
1493
79f82953 1494def unsmuggle_url(smug_url, default=None):
83e865a3 1495 if '#__youtubedl_smuggle' not in smug_url:
79f82953 1496 return smug_url, default
28e614de
PH
1497 url, _, sdata = smug_url.rpartition('#')
1498 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
1499 data = json.loads(jsond)
1500 return url, data
02dbf93f
PH
1501
1502
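# Illustrative round trip (URL and payload are made up):
# >>> url = smuggle_url('http://example.com/video', {'force_videoid': '42'})
# >>> unsmuggle_url(url)
# ('http://example.com/video', {'force_videoid': '42'})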
02dbf93f
PH
1503def format_bytes(bytes):
1504 if bytes is None:
28e614de 1505 return 'N/A'
02dbf93f
PH
1506 if type(bytes) is str:
1507 bytes = float(bytes)
1508 if bytes == 0.0:
1509 exponent = 0
1510 else:
1511 exponent = int(math.log(bytes, 1024.0))
28e614de 1512 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
02dbf93f 1513 converted = float(bytes) / float(1024 ** exponent)
28e614de 1514 return '%.2f%s' % (converted, suffix)
f53c966a 1515
1c088fa8 1516
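# Illustrative sketch (hand-checked against the code above):
# >>> format_bytes(None)
# 'N/A'
# >>> format_bytes(1536)
# '1.50KiB'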
fb47597b
S
1517def lookup_unit_table(unit_table, s):
1518 units_re = '|'.join(re.escape(u) for u in unit_table)
1519 m = re.match(
782b1b5b 1520 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
fb47597b
S
1521 if not m:
1522 return None
1523 num_str = m.group('num').replace(',', '.')
1524 mult = unit_table[m.group('unit')]
1525 return int(float(num_str) * mult)
1526
1527
be64b5b0
PH
1528def parse_filesize(s):
1529 if s is None:
1530 return None
1531
dfb1b146 1532 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
1533 # but we support those too
1534 _UNIT_TABLE = {
1535 'B': 1,
1536 'b': 1,
70852b47 1537 'bytes': 1,
be64b5b0
PH
1538 'KiB': 1024,
1539 'KB': 1000,
1540 'kB': 1024,
1541 'Kb': 1000,
13585d76 1542 'kb': 1000,
70852b47
YCH
1543 'kilobytes': 1000,
1544 'kibibytes': 1024,
be64b5b0
PH
1545 'MiB': 1024 ** 2,
1546 'MB': 1000 ** 2,
1547 'mB': 1024 ** 2,
1548 'Mb': 1000 ** 2,
13585d76 1549 'mb': 1000 ** 2,
70852b47
YCH
1550 'megabytes': 1000 ** 2,
1551 'mebibytes': 1024 ** 2,
be64b5b0
PH
1552 'GiB': 1024 ** 3,
1553 'GB': 1000 ** 3,
1554 'gB': 1024 ** 3,
1555 'Gb': 1000 ** 3,
13585d76 1556 'gb': 1000 ** 3,
70852b47
YCH
1557 'gigabytes': 1000 ** 3,
1558 'gibibytes': 1024 ** 3,
be64b5b0
PH
1559 'TiB': 1024 ** 4,
1560 'TB': 1000 ** 4,
1561 'tB': 1024 ** 4,
1562 'Tb': 1000 ** 4,
13585d76 1563 'tb': 1000 ** 4,
70852b47
YCH
1564 'terabytes': 1000 ** 4,
1565 'tebibytes': 1024 ** 4,
be64b5b0
PH
1566 'PiB': 1024 ** 5,
1567 'PB': 1000 ** 5,
1568 'pB': 1024 ** 5,
1569 'Pb': 1000 ** 5,
13585d76 1570 'pb': 1000 ** 5,
70852b47
YCH
1571 'petabytes': 1000 ** 5,
1572 'pebibytes': 1024 ** 5,
be64b5b0
PH
1573 'EiB': 1024 ** 6,
1574 'EB': 1000 ** 6,
1575 'eB': 1024 ** 6,
1576 'Eb': 1000 ** 6,
13585d76 1577 'eb': 1000 ** 6,
70852b47
YCH
1578 'exabytes': 1000 ** 6,
1579 'exbibytes': 1024 ** 6,
be64b5b0
PH
1580 'ZiB': 1024 ** 7,
1581 'ZB': 1000 ** 7,
1582 'zB': 1024 ** 7,
1583 'Zb': 1000 ** 7,
13585d76 1584 'zb': 1000 ** 7,
70852b47
YCH
1585 'zettabytes': 1000 ** 7,
1586 'zebibytes': 1024 ** 7,
be64b5b0
PH
1587 'YiB': 1024 ** 8,
1588 'YB': 1000 ** 8,
1589 'yB': 1024 ** 8,
1590 'Yb': 1000 ** 8,
13585d76 1591 'yb': 1000 ** 8,
70852b47
YCH
1592 'yottabytes': 1000 ** 8,
1593 'yobibytes': 1024 ** 8,
be64b5b0
PH
1594 }
1595
fb47597b
S
1596 return lookup_unit_table(_UNIT_TABLE, s)
1597
1598
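# Illustrative sketch (hand-checked against _UNIT_TABLE above):
# >>> parse_filesize('5 GB')
# 5000000000
# >>> parse_filesize('1.2MiB')
# 1258291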
1599def parse_count(s):
1600 if s is None:
be64b5b0
PH
1601 return None
1602
fb47597b
S
1603 s = s.strip()
1604
1605 if re.match(r'^[\d,.]+$', s):
1606 return str_to_int(s)
1607
1608 _UNIT_TABLE = {
1609 'k': 1000,
1610 'K': 1000,
1611 'm': 1000 ** 2,
1612 'M': 1000 ** 2,
1613 'kk': 1000 ** 2,
1614 'KK': 1000 ** 2,
1615 }
be64b5b0 1616
fb47597b 1617 return lookup_unit_table(_UNIT_TABLE, s)
be64b5b0 1618
2f7ae819 1619
a942d6cb 1620def month_by_name(name, lang='en'):
caefb1de
PH
1621 """ Return the number of a month by (locale-independently) English name """
1622
f6717dec 1623 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 1624
caefb1de 1625 try:
f6717dec 1626 return month_names.index(name) + 1
7105440c
YCH
1627 except ValueError:
1628 return None
1629
1630
1631def month_by_abbreviation(abbrev):
1632 """ Return the number of a month by (locale-independently) English
1633 abbreviations """
1634
1635 try:
1636 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
1637 except ValueError:
1638 return None
18258362
JMF
1639
1640
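# Illustrative sketch:
# >>> month_by_name('October')
# 10
# >>> month_by_name('août', lang='fr')
# 8
# >>> month_by_abbreviation('Mar')
# 3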
5aafe895 1641def fix_xml_ampersands(xml_str):
18258362 1642 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
1643 return re.sub(
1644 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 1645 '&amp;',
5aafe895 1646 xml_str)
e3946f98
PH
1647
1648
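# Illustrative sketch (hand-checked against the regex above; numeric and named
# entities are left untouched):
# >>> fix_xml_ampersands('Scott & Ed &amp; co &#38; others')
# 'Scott &amp; Ed &amp; co &#38; others'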
1649def setproctitle(title):
8bf48f23 1650 assert isinstance(title, compat_str)
c1c05c67
YCH
1651
1652 # ctypes in Jython is not complete
1653 # http://bugs.jython.org/issue2148
1654 if sys.platform.startswith('java'):
1655 return
1656
e3946f98 1657 try:
611c1dd9 1658 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
1659 except OSError:
1660 return
6eefe533
PH
1661 title_bytes = title.encode('utf-8')
1662 buf = ctypes.create_string_buffer(len(title_bytes))
1663 buf.value = title_bytes
e3946f98 1664 try:
6eefe533 1665 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
1666 except AttributeError:
1667 return # Strange libc, just skip this
d7dda168
PH
1668
1669
1670def remove_start(s, start):
46bc9b7d 1671 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
1672
1673
2b9faf55 1674def remove_end(s, end):
46bc9b7d 1675 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
1676
1677
31b2051e
S
1678def remove_quotes(s):
1679 if s is None or len(s) < 2:
1680 return s
1681 for quote in ('"', "'", ):
1682 if s[0] == quote and s[-1] == quote:
1683 return s[1:-1]
1684 return s
1685
1686
29eb5174 1687def url_basename(url):
9b8aaeed 1688 path = compat_urlparse.urlparse(url).path
28e614de 1689 return path.strip('/').split('/')[-1]
aa94a6d3
PH
1690
1691
1692class HEADRequest(compat_urllib_request.Request):
1693 def get_method(self):
611c1dd9 1694 return 'HEAD'
7217e148
PH
1695
1696
95cf60e8
S
1697class PUTRequest(compat_urllib_request.Request):
1698 def get_method(self):
1699 return 'PUT'
1700
1701
9732d77e 1702def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
28746fbd
PH
1703 if get_attr:
1704 if v is not None:
1705 v = getattr(v, get_attr, None)
9572013d
PH
1706 if v == '':
1707 v = None
1812afb7
S
1708 if v is None:
1709 return default
1710 try:
1711 return int(v) * invscale // scale
1712 except ValueError:
af98f8ff 1713 return default
9732d77e 1714
9572013d 1715
40a90862
JMF
1716def str_or_none(v, default=None):
1717 return default if v is None else compat_str(v)
1718
9732d77e
PH
1719
1720def str_to_int(int_str):
48d4681e 1721 """ A more relaxed version of int_or_none """
9732d77e
PH
1722 if int_str is None:
1723 return None
28e614de 1724 int_str = re.sub(r'[,\.\+]', '', int_str)
9732d77e 1725 return int(int_str)
608d11f5
PH
1726
1727
9732d77e 1728def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
1729 if v is None:
1730 return default
1731 try:
1732 return float(v) * invscale / scale
1733 except ValueError:
1734 return default
43f775e4
PH
1735
1736
b72b4431
S
1737def strip_or_none(v):
1738 return None if v is None else v.strip()
1739
1740
608d11f5 1741def parse_duration(s):
8f9312c3 1742 if not isinstance(s, compat_basestring):
608d11f5
PH
1743 return None
1744
ca7b3246
S
1745 s = s.strip()
1746
acaff495 1747 days, hours, mins, secs, ms = [None] * 5
1748 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
1749 if m:
1750 days, hours, mins, secs, ms = m.groups()
1751 else:
1752 m = re.match(
1753 r'''(?ix)(?:P?T)?
8f4b58d7 1754 (?:
acaff495 1755 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
8f4b58d7 1756 )?
acaff495 1757 (?:
1758 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1759 )?
1760 (?:
1761 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1762 )?
1763 (?:
1764 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1765 )?$''', s)
1766 if m:
1767 days, hours, mins, secs, ms = m.groups()
1768 else:
1769 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
1770 if m:
1771 hours, mins = m.groups()
1772 else:
1773 return None
1774
1775 duration = 0
1776 if secs:
1777 duration += float(secs)
1778 if mins:
1779 duration += float(mins) * 60
1780 if hours:
1781 duration += float(hours) * 60 * 60
1782 if days:
1783 duration += float(days) * 24 * 60 * 60
1784 if ms:
1785 duration += float(ms)
1786 return duration
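# Editorial usage sketch (not part of the original source):
#     parse_duration('1:23:45')  ->  5025.0
#     parse_duration('2m30s')    ->  150.0
#     parse_duration('PT1H30M')  ->  5400.0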
91d7d0b3
JMF
1787
1788
e65e4c88 1789def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 1790 name, real_ext = os.path.splitext(filename)
e65e4c88
S
1791 return (
1792 '{0}.{1}{2}'.format(name, ext, real_ext)
1793 if not expected_real_ext or real_ext[1:] == expected_real_ext
1794 else '{0}.{1}'.format(filename, ext))
d70ad093
PH
1795
1796
b3ed15b7
S
1797def replace_extension(filename, ext, expected_real_ext=None):
1798 name, real_ext = os.path.splitext(filename)
1799 return '{0}.{1}'.format(
1800 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1801 ext)
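# Editorial usage sketch (not part of the original source):
#     prepend_extension('video.mp4', 'temp')  ->  'video.temp.mp4'
#     replace_extension('video.mp4', 'mkv')   ->  'video.mkv'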
1802
1803
d70ad093
PH
1804def check_executable(exe, args=[]):
1805 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1806 args can be a list of arguments for a short output (like -version) """
1807 try:
1808 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1809 except OSError:
1810 return False
1811 return exe
b7ab0590
PH
1812
1813
95807118 1814def get_exe_version(exe, args=['--version'],
cae97f65 1815 version_re=None, unrecognized='present'):
95807118
PH
1816 """ Returns the version of the specified executable,
1817 or False if the executable is not present """
1818 try:
cae97f65 1819 out, _ = subprocess.Popen(
54116803 1820 [encodeArgument(exe)] + args,
95807118
PH
1821 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1822 except OSError:
1823 return False
cae97f65
PH
1824 if isinstance(out, bytes): # Python 2.x
1825 out = out.decode('ascii', 'ignore')
1826 return detect_exe_version(out, version_re, unrecognized)
1827
1828
1829def detect_exe_version(output, version_re=None, unrecognized='present'):
1830 assert isinstance(output, compat_str)
1831 if version_re is None:
1832 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1833 m = re.search(version_re, output)
95807118
PH
1834 if m:
1835 return m.group(1)
1836 else:
1837 return unrecognized
1838
1839
b7ab0590 1840class PagedList(object):
dd26ced1
PH
1841 def __len__(self):
1842 # This is only useful for tests
1843 return len(self.getslice())
1844
9c44d242
PH
1845
1846class OnDemandPagedList(PagedList):
b95dc034 1847 def __init__(self, pagefunc, pagesize, use_cache=False):
9c44d242
PH
1848 self._pagefunc = pagefunc
1849 self._pagesize = pagesize
b95dc034
YCH
1850 self._use_cache = use_cache
1851 if use_cache:
1852 self._cache = {}
9c44d242 1853
b7ab0590
PH
1854 def getslice(self, start=0, end=None):
1855 res = []
1856 for pagenum in itertools.count(start // self._pagesize):
1857 firstid = pagenum * self._pagesize
1858 nextfirstid = pagenum * self._pagesize + self._pagesize
1859 if start >= nextfirstid:
1860 continue
1861
b95dc034
YCH
1862 page_results = None
1863 if self._use_cache:
1864 page_results = self._cache.get(pagenum)
1865 if page_results is None:
1866 page_results = list(self._pagefunc(pagenum))
1867 if self._use_cache:
1868 self._cache[pagenum] = page_results
b7ab0590
PH
1869
1870 startv = (
1871 start % self._pagesize
1872 if firstid <= start < nextfirstid
1873 else 0)
1874
1875 endv = (
1876 ((end - 1) % self._pagesize) + 1
1877 if (end is not None and firstid <= end <= nextfirstid)
1878 else None)
1879
1880 if startv != 0 or endv is not None:
1881 page_results = page_results[startv:endv]
1882 res.extend(page_results)
1883
1884 # A little optimization - if the current page is not "full", i.e. does
1885 # not contain page_size videos, then we can assume that this page
1886 # is the last one - there are no more ids on further pages -
1887 # so there is no need to query again.
1888 if len(page_results) + startv < self._pagesize:
1889 break
1890
1891 # If we got the whole page, but the next page is not interesting,
1892 # break out early as well
1893 if end == nextfirstid:
1894 break
1895 return res
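# Editorial usage sketch (not part of the original source): pagefunc takes a
# zero-based page number and returns an iterable of that page's items, e.g.
#     pages = OnDemandPagedList(lambda n: range(n * 10, (n + 1) * 10), 10)
#     pages.getslice(5, 15)  ->  [5, 6, 7, 8, 9, 10, 11, 12, 13, 14]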
81c2f20b
PH
1896
1897
9c44d242
PH
1898class InAdvancePagedList(PagedList):
1899 def __init__(self, pagefunc, pagecount, pagesize):
1900 self._pagefunc = pagefunc
1901 self._pagecount = pagecount
1902 self._pagesize = pagesize
1903
1904 def getslice(self, start=0, end=None):
1905 res = []
1906 start_page = start // self._pagesize
1907 end_page = (
1908 self._pagecount if end is None else (end // self._pagesize + 1))
1909 skip_elems = start - start_page * self._pagesize
1910 only_more = None if end is None else end - start
1911 for pagenum in range(start_page, end_page):
1912 page = list(self._pagefunc(pagenum))
1913 if skip_elems:
1914 page = page[skip_elems:]
1915 skip_elems = None
1916 if only_more is not None:
1917 if len(page) < only_more:
1918 only_more -= len(page)
1919 else:
1920 page = page[:only_more]
1921 res.extend(page)
1922 break
1923 res.extend(page)
1924 return res
1925
1926
81c2f20b 1927def uppercase_escape(s):
676eb3f2 1928 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 1929 return re.sub(
a612753d 1930 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
1931 lambda m: unicode_escape(m.group(0))[0],
1932 s)
0fe2ff78
YCH
1933
1934
1935def lowercase_escape(s):
1936 unicode_escape = codecs.getdecoder('unicode_escape')
1937 return re.sub(
1938 r'\\u[0-9a-fA-F]{4}',
1939 lambda m: unicode_escape(m.group(0))[0],
1940 s)
b53466e1 1941
d05cfe06
S
1942
1943def escape_rfc3986(s):
1944 """Escape non-ASCII characters as suggested by RFC 3986"""
8f9312c3 1945 if sys.version_info < (3, 0) and isinstance(s, compat_str):
d05cfe06 1946 s = s.encode('utf-8')
ecc0c5ee 1947 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1948
1949
1950def escape_url(url):
1951 """Escape URL as suggested by RFC 3986"""
1952 url_parsed = compat_urllib_parse_urlparse(url)
1953 return url_parsed._replace(
efbed08d 1954 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
1955 path=escape_rfc3986(url_parsed.path),
1956 params=escape_rfc3986(url_parsed.params),
1957 query=escape_rfc3986(url_parsed.query),
1958 fragment=escape_rfc3986(url_parsed.fragment)
1959 ).geturl()
1960
62e609ab
PH
1961
1962def read_batch_urls(batch_fd):
1963 def fixup(url):
1964 if not isinstance(url, compat_str):
1965 url = url.decode('utf-8', 'replace')
28e614de 1966 BOM_UTF8 = '\xef\xbb\xbf'
62e609ab
PH
1967 if url.startswith(BOM_UTF8):
1968 url = url[len(BOM_UTF8):]
1969 url = url.strip()
1970 if url.startswith(('#', ';', ']')):
1971 return False
1972 return url
1973
1974 with contextlib.closing(batch_fd) as fd:
1975 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1976
1977
1978def urlencode_postdata(*args, **kargs):
15707c7e 1979 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
1980
1981
38f9ef31 1982def update_url_query(url, query):
cacd9966
YCH
1983 if not query:
1984 return url
38f9ef31 1985 parsed_url = compat_urlparse.urlparse(url)
1986 qs = compat_parse_qs(parsed_url.query)
1987 qs.update(query)
1988 return compat_urlparse.urlunparse(parsed_url._replace(
15707c7e 1989 query=compat_urllib_parse_urlencode(qs, True)))
16392824 1990
8e60dc75 1991
ed0291d1
S
1992def update_Request(req, url=None, data=None, headers={}, query={}):
1993 req_headers = req.headers.copy()
1994 req_headers.update(headers)
1995 req_data = data or req.data
1996 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
1997 req_get_method = req.get_method()
1998 if req_get_method == 'HEAD':
1999 req_type = HEADRequest
2000 elif req_get_method == 'PUT':
2001 req_type = PUTRequest
2002 else:
2003 req_type = compat_urllib_request.Request
ed0291d1
S
2004 new_req = req_type(
2005 req_url, data=req_data, headers=req_headers,
2006 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2007 if hasattr(req, 'timeout'):
2008 new_req.timeout = req.timeout
2009 return new_req
2010
2011
86296ad2 2012def dict_get(d, key_or_keys, default=None, skip_false_values=True):
cbecc9b9
S
2013 if isinstance(key_or_keys, (list, tuple)):
2014 for key in key_or_keys:
86296ad2
S
2015 if key not in d or d[key] is None or skip_false_values and not d[key]:
2016 continue
2017 return d[key]
cbecc9b9
S
2018 return default
2019 return d.get(key_or_keys, default)
2020
2021
329ca3be
S
2022def try_get(src, getter, expected_type=None):
2023 try:
2024 v = getter(src)
2025 except (AttributeError, KeyError, TypeError, IndexError):
2026 pass
2027 else:
2028 if expected_type is None or isinstance(v, expected_type):
2029 return v
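# Editorial usage sketch (not part of the original source):
#     dict_get({'vid': None, 'id': '42'}, ('vid', 'id'))  ->  '42'
#     try_get({'a': [1, 2]}, lambda x: x['a'][1], int)    ->  2
#     try_get({}, lambda x: x['a'][1], int)               ->  None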
2030
2031
8e60dc75
S
2032def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2033 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2034
16392824 2035
a1a530b0
PH
2036US_RATINGS = {
2037 'G': 0,
2038 'PG': 10,
2039 'PG-13': 13,
2040 'R': 16,
2041 'NC': 18,
2042}
fac55558
PH
2043
2044
a8795327
S
2045TV_PARENTAL_GUIDELINES = {
2046 'TV-Y': 0,
2047 'TV-Y7': 7,
2048 'TV-G': 0,
2049 'TV-PG': 0,
2050 'TV-14': 14,
2051 'TV-MA': 17,
2052}
2053
2054
146c80e2 2055def parse_age_limit(s):
a8795327
S
2056 if type(s) == int:
2057 return s if 0 <= s <= 21 else None
2058 if not isinstance(s, compat_basestring):
d838b1bd 2059 return None
146c80e2 2060 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
2061 if m:
2062 return int(m.group('age'))
2063 if s in US_RATINGS:
2064 return US_RATINGS[s]
2065 return TV_PARENTAL_GUIDELINES.get(s)
146c80e2
S
2066
2067
fac55558 2068def strip_jsonp(code):
609a61e3 2069 return re.sub(
5950cb1d 2070 r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
478c2c61
PH
2071
2072
e05f6939
PH
2073def js_to_json(code):
2074 def fix_kv(m):
e7b6d122
PH
2075 v = m.group(0)
2076 if v in ('true', 'false', 'null'):
2077 return v
bd1e4844 2078 elif v.startswith('/*') or v == ',':
2079 return ""
2080
2081 if v[0] in ("'", '"'):
2082 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 2083 '"': '\\"',
bd1e4844 2084 "\\'": "'",
2085 '\\\n': '',
2086 '\\x': '\\u00',
2087 }.get(m.group(0), m.group(0)), v[1:-1])
2088
89ac4a19 2089 INTEGER_TABLE = (
e4659b45
YCH
2090 (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
2091 (r'^(0+[0-7]+)\s*:?$', 8),
89ac4a19
S
2092 )
2093
2094 for regex, base in INTEGER_TABLE:
2095 im = re.match(regex, v)
2096 if im:
e4659b45 2097 i = int(im.group(1), base)
89ac4a19
S
2098 return '"%d":' % i if v.endswith(':') else '%d' % i
2099
e7b6d122 2100 return '"%s"' % v
e05f6939 2101
bd1e4844 2102 return re.sub(r'''(?sx)
2103 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2104 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
2105 /\*.*?\*/|,(?=\s*[\]}])|
2106 [a-zA-Z_][.a-zA-Z_0-9]*|
47212f7b 2107 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
bd1e4844 2108 [0-9]+(?=\s*:)
e05f6939 2109 ''', fix_kv, code)
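# Editorial usage sketch (not part of the original source): single quotes,
# bare keys, hex literals and trailing commas are normalized, e.g.
#     js_to_json("{foo: 'bar', baz: 0x10,}")  ->  '{"foo": "bar", "baz": 16}'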
e05f6939
PH
2110
2111
478c2c61
PH
2112def qualities(quality_ids):
2113 """ Get a numeric quality value out of a list of possible values """
2114 def q(qid):
2115 try:
2116 return quality_ids.index(qid)
2117 except ValueError:
2118 return -1
2119 return q
2120
acd69589
PH
2121
2122DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 2123
a020a0dc
PH
2124
2125def limit_length(s, length):
2126 """ Add ellipses to overly long strings """
2127 if s is None:
2128 return None
2129 ELLIPSES = '...'
2130 if len(s) > length:
2131 return s[:length - len(ELLIPSES)] + ELLIPSES
2132 return s
48844745
PH
2133
2134
2135def version_tuple(v):
5f9b8394 2136 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
2137
2138
2139def is_outdated_version(version, limit, assume_new=True):
2140 if not version:
2141 return not assume_new
2142 try:
2143 return version_tuple(version) < version_tuple(limit)
2144 except ValueError:
2145 return not assume_new
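# Editorial usage sketch (not part of the original source):
#     version_tuple('2016.06.18')                      ->  (2016, 6, 18)
#     is_outdated_version('2016.06.18', '2016.07.01')  ->  True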
732ea2f0
PH
2146
2147
2148def ytdl_is_updateable():
2149 """ Returns if youtube-dl can be updated with -U """
2150 from zipimport import zipimporter
2151
2152 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
7d4111ed
PH
2153
2154
2155def args_to_str(args):
2156 # Get a short string representation for a subprocess command
702ccf2d 2157 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
2158
2159
9b9c5355 2160def error_to_compat_str(err):
fdae2358
S
2161 err_str = str(err)
2162 # On Python 2 the error byte string must be decoded with the proper
2163 # encoding rather than ascii
2164 if sys.version_info[0] < 3:
2165 err_str = err_str.decode(preferredencoding())
2166 return err_str
2167
2168
c460bdd5 2169def mimetype2ext(mt):
eb9ee194
S
2170 if mt is None:
2171 return None
2172
765ac263
JMF
2173 ext = {
2174 'audio/mp4': 'm4a',
6c33d24b
YCH
2175 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. We use .mp3 here as
2176 # it's the most common one
2177 'audio/mpeg': 'mp3',
765ac263
JMF
2178 }.get(mt)
2179 if ext is not None:
2180 return ext
2181
c460bdd5 2182 _, _, res = mt.rpartition('/')
6562d34a 2183 res = res.split(';')[0].strip().lower()
c460bdd5
PH
2184
2185 return {
f6861ec9 2186 '3gpp': '3gp',
cafcf657 2187 'smptett+xml': 'tt',
2188 'srt': 'srt',
2189 'ttaf+xml': 'dfxp',
a0d8d704 2190 'ttml+xml': 'ttml',
cafcf657 2191 'vtt': 'vtt',
f6861ec9 2192 'x-flv': 'flv',
a0d8d704
YCH
2193 'x-mp4-fragmented': 'mp4',
2194 'x-ms-wmv': 'wmv',
b4173f15
RA
2195 'mpegurl': 'm3u8',
2196 'x-mpegurl': 'm3u8',
2197 'vnd.apple.mpegurl': 'm3u8',
2198 'dash+xml': 'mpd',
2199 'f4m': 'f4m',
2200 'f4m+xml': 'f4m',
f164b971 2201 'hds+xml': 'f4m',
e910fe2f 2202 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 2203 'quicktime': 'mov',
c460bdd5
PH
2204 }.get(res, res)
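# Editorial usage sketch (not part of the original source): parameters after
# ';' are dropped and the subtype is looked up case-insensitively, e.g.
#     mimetype2ext('application/x-mpegURL;charset=UTF-8')  ->  'm3u8'
#     mimetype2ext('video/x-flv')                          ->  'flv'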
2205
2206
4f3c5e06 2207def parse_codecs(codecs_str):
2208 # http://tools.ietf.org/html/rfc6381
2209 if not codecs_str:
2210 return {}
2211 split_codecs = list(filter(None, map(
2212 lambda s: s.strip(), codecs_str.strip().strip(',').split(','))))
2213 vcodec, acodec = None, None
2214 for full_codec in split_codecs:
2215 codec = full_codec.split('.')[0]
2216 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
2217 if not vcodec:
2218 vcodec = full_codec
073ac122 2219 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
4f3c5e06 2220 if not acodec:
2221 acodec = full_codec
2222 else:
2223 write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
2224 if not vcodec and not acodec:
2225 if len(split_codecs) == 2:
2226 return {
2227 'vcodec': vcodec,
2228 'acodec': acodec,
2229 }
2230 elif len(split_codecs) == 1:
2231 return {
2232 'vcodec': 'none',
2233 'acodec': vcodec,
2234 }
2235 else:
2236 return {
2237 'vcodec': vcodec or 'none',
2238 'acodec': acodec or 'none',
2239 }
2240 return {}
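# Editorial usage sketch (not part of the original source):
#     parse_codecs('avc1.64001f, mp4a.40.2')
#         ->  {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2'}
#     parse_codecs('mp4a.40.2')  ->  {'vcodec': 'none', 'acodec': 'mp4a.40.2'}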
2241
2242
2ccd1b10 2243def urlhandle_detect_ext(url_handle):
79298173 2244 getheader = url_handle.headers.get
2ccd1b10 2245
b55ee18f
PH
2246 cd = getheader('Content-Disposition')
2247 if cd:
2248 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2249 if m:
2250 e = determine_ext(m.group('filename'), default_ext=None)
2251 if e:
2252 return e
2253
c460bdd5 2254 return mimetype2ext(getheader('Content-Type'))
05900629
PH
2255
2256
1e399778
YCH
2257def encode_data_uri(data, mime_type):
2258 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
2259
2260
05900629 2261def age_restricted(content_limit, age_limit):
6ec6cb4e 2262 """ Returns True iff the content should be blocked """
05900629
PH
2263
2264 if age_limit is None: # No limit set
2265 return False
2266 if content_limit is None:
2267 return False # Content available for everyone
2268 return age_limit < content_limit
61ca9a80
PH
2269
2270
2271def is_html(first_bytes):
2272 """ Detect whether a file contains HTML by examining its first bytes. """
2273
2274 BOMS = [
2275 (b'\xef\xbb\xbf', 'utf-8'),
2276 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2277 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2278 (b'\xff\xfe', 'utf-16-le'),
2279 (b'\xfe\xff', 'utf-16-be'),
2280 ]
2281 for bom, enc in BOMS:
2282 if first_bytes.startswith(bom):
2283 s = first_bytes[len(bom):].decode(enc, 'replace')
2284 break
2285 else:
2286 s = first_bytes.decode('utf-8', 'replace')
2287
2288 return re.match(r'^\s*<', s)
a055469f
PH
2289
2290
2291def determine_protocol(info_dict):
2292 protocol = info_dict.get('protocol')
2293 if protocol is not None:
2294 return protocol
2295
2296 url = info_dict['url']
2297 if url.startswith('rtmp'):
2298 return 'rtmp'
2299 elif url.startswith('mms'):
2300 return 'mms'
2301 elif url.startswith('rtsp'):
2302 return 'rtsp'
2303
2304 ext = determine_ext(url)
2305 if ext == 'm3u8':
2306 return 'm3u8'
2307 elif ext == 'f4m':
2308 return 'f4m'
2309
2310 return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2311
2312
2313def render_table(header_row, data):
2314 """ Render a list of rows, each as a list of values """
2315 table = [header_row] + data
2316 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
2317 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
2318 return '\n'.join(format_str % tuple(row) for row in table)
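# Editorial usage sketch (not part of the original source):
#     render_table(['format', 'note'], [['mp4', 'default'], ['webm', 'DASH']])
# yields three lines with left-aligned, space-padded columns:
#     format note
#     mp4    default
#     webm   DASH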
347de493
PH
2319
2320
2321def _match_one(filter_part, dct):
2322 COMPARISON_OPERATORS = {
2323 '<': operator.lt,
2324 '<=': operator.le,
2325 '>': operator.gt,
2326 '>=': operator.ge,
2327 '=': operator.eq,
2328 '!=': operator.ne,
2329 }
2330 operator_rex = re.compile(r'''(?x)\s*
2331 (?P<key>[a-z_]+)
2332 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2333 (?:
2334 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2335 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2336 )
2337 \s*$
2338 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2339 m = operator_rex.search(filter_part)
2340 if m:
2341 op = COMPARISON_OPERATORS[m.group('op')]
2342 if m.group('strval') is not None:
2343 if m.group('op') not in ('=', '!='):
2344 raise ValueError(
2345 'Operator %s does not support string values!' % m.group('op'))
2346 comparison_value = m.group('strval')
2347 else:
2348 try:
2349 comparison_value = int(m.group('intval'))
2350 except ValueError:
2351 comparison_value = parse_filesize(m.group('intval'))
2352 if comparison_value is None:
2353 comparison_value = parse_filesize(m.group('intval') + 'B')
2354 if comparison_value is None:
2355 raise ValueError(
2356 'Invalid integer value %r in filter part %r' % (
2357 m.group('intval'), filter_part))
2358 actual_value = dct.get(m.group('key'))
2359 if actual_value is None:
2360 return m.group('none_inclusive')
2361 return op(actual_value, comparison_value)
2362
2363 UNARY_OPERATORS = {
2364 '': lambda v: v is not None,
2365 '!': lambda v: v is None,
2366 }
2367 operator_rex = re.compile(r'''(?x)\s*
2368 (?P<op>%s)\s*(?P<key>[a-z_]+)
2369 \s*$
2370 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2371 m = operator_rex.search(filter_part)
2372 if m:
2373 op = UNARY_OPERATORS[m.group('op')]
2374 actual_value = dct.get(m.group('key'))
2375 return op(actual_value)
2376
2377 raise ValueError('Invalid filter part %r' % filter_part)
2378
2379
2380def match_str(filter_str, dct):
2381 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2382
2383 return all(
2384 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2385
2386
2387def match_filter_func(filter_str):
2388 def _match_func(info_dict):
2389 if match_str(filter_str, info_dict):
2390 return None
2391 else:
2392 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2393 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2394 return _match_func
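# Editorial usage sketch (not part of the original source); '&' joins clauses
# and a trailing '?' makes a comparison pass when the field is missing:
#     match_str('like_count > 100 & dislike_count <? 50',
#               {'like_count': 190, 'dislike_count': 10})  ->  True
#     match_str('!is_live', {'is_live': None})             ->  True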
91410c9b
PH
2395
2396
bf6427d2
YCH
2397def parse_dfxp_time_expr(time_expr):
2398 if not time_expr:
d631d5f9 2399 return
bf6427d2
YCH
2400
2401 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2402 if mobj:
2403 return float(mobj.group('time_offset'))
2404
db2fe38b 2405 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 2406 if mobj:
db2fe38b 2407 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
2408
2409
c1c924ab
YCH
2410def srt_subtitles_timecode(seconds):
2411 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
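# Editorial usage sketch (not part of the original source):
#     parse_dfxp_time_expr('00:01:02.500')  ->  62.5
#     srt_subtitles_timecode(3661.5)        ->  '01:01:01,500'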
bf6427d2
YCH
2412
2413
2414def dfxp2srt(dfxp_data):
4e335771
YCH
2415 _x = functools.partial(xpath_with_ns, ns_map={
2416 'ttml': 'http://www.w3.org/ns/ttml',
2417 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
5bf28d78 2418 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
4e335771 2419 })
bf6427d2 2420
87de7069 2421 class TTMLPElementParser(object):
2b14cb56 2422 out = ''
bf6427d2 2423
2b14cb56 2424 def start(self, tag, attrib):
2425 if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2426 self.out += '\n'
bf6427d2 2427
2b14cb56 2428 def end(self, tag):
2429 pass
bf6427d2 2430
2b14cb56 2431 def data(self, data):
2432 self.out += data
2433
2434 def close(self):
2435 return self.out.strip()
2436
2437 def parse_node(node):
2438 target = TTMLPElementParser()
2439 parser = xml.etree.ElementTree.XMLParser(target=target)
2440 parser.feed(xml.etree.ElementTree.tostring(node))
2441 return parser.close()
bf6427d2 2442
36e6f62c 2443 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
bf6427d2 2444 out = []
5bf28d78 2445 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
1b0427e6
YCH
2446
2447 if not paras:
2448 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2
YCH
2449
2450 for para, index in zip(paras, itertools.count(1)):
d631d5f9 2451 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 2452 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
2453 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2454 if begin_time is None:
2455 continue
7dff0363 2456 if not end_time:
d631d5f9
YCH
2457 if not dur:
2458 continue
2459 end_time = begin_time + dur
bf6427d2
YCH
2460 out.append('%d\n%s --> %s\n%s\n\n' % (
2461 index,
c1c924ab
YCH
2462 srt_subtitles_timecode(begin_time),
2463 srt_subtitles_timecode(end_time),
bf6427d2
YCH
2464 parse_node(para)))
2465
2466 return ''.join(out)
2467
2468
66e289ba
S
2469def cli_option(params, command_option, param):
2470 param = params.get(param)
98e698f1
RA
2471 if param:
2472 param = compat_str(param)
66e289ba
S
2473 return [command_option, param] if param is not None else []
2474
2475
2476def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2477 param = params.get(param)
2478 assert isinstance(param, bool)
2479 if separator:
2480 return [command_option + separator + (true_value if param else false_value)]
2481 return [command_option, true_value if param else false_value]
2482
2483
2484def cli_valueless_option(params, command_option, param, expected_value=True):
2485 param = params.get(param)
2486 return [command_option] if param == expected_value else []
2487
2488
2489def cli_configuration_args(params, param, default=[]):
2490 ex_args = params.get(param)
2491 if ex_args is None:
2492 return default
2493 assert isinstance(ex_args, list)
2494 return ex_args
2495
2496
39672624
YCH
2497class ISO639Utils(object):
2498 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2499 _lang_map = {
2500 'aa': 'aar',
2501 'ab': 'abk',
2502 'ae': 'ave',
2503 'af': 'afr',
2504 'ak': 'aka',
2505 'am': 'amh',
2506 'an': 'arg',
2507 'ar': 'ara',
2508 'as': 'asm',
2509 'av': 'ava',
2510 'ay': 'aym',
2511 'az': 'aze',
2512 'ba': 'bak',
2513 'be': 'bel',
2514 'bg': 'bul',
2515 'bh': 'bih',
2516 'bi': 'bis',
2517 'bm': 'bam',
2518 'bn': 'ben',
2519 'bo': 'bod',
2520 'br': 'bre',
2521 'bs': 'bos',
2522 'ca': 'cat',
2523 'ce': 'che',
2524 'ch': 'cha',
2525 'co': 'cos',
2526 'cr': 'cre',
2527 'cs': 'ces',
2528 'cu': 'chu',
2529 'cv': 'chv',
2530 'cy': 'cym',
2531 'da': 'dan',
2532 'de': 'deu',
2533 'dv': 'div',
2534 'dz': 'dzo',
2535 'ee': 'ewe',
2536 'el': 'ell',
2537 'en': 'eng',
2538 'eo': 'epo',
2539 'es': 'spa',
2540 'et': 'est',
2541 'eu': 'eus',
2542 'fa': 'fas',
2543 'ff': 'ful',
2544 'fi': 'fin',
2545 'fj': 'fij',
2546 'fo': 'fao',
2547 'fr': 'fra',
2548 'fy': 'fry',
2549 'ga': 'gle',
2550 'gd': 'gla',
2551 'gl': 'glg',
2552 'gn': 'grn',
2553 'gu': 'guj',
2554 'gv': 'glv',
2555 'ha': 'hau',
2556 'he': 'heb',
2557 'hi': 'hin',
2558 'ho': 'hmo',
2559 'hr': 'hrv',
2560 'ht': 'hat',
2561 'hu': 'hun',
2562 'hy': 'hye',
2563 'hz': 'her',
2564 'ia': 'ina',
2565 'id': 'ind',
2566 'ie': 'ile',
2567 'ig': 'ibo',
2568 'ii': 'iii',
2569 'ik': 'ipk',
2570 'io': 'ido',
2571 'is': 'isl',
2572 'it': 'ita',
2573 'iu': 'iku',
2574 'ja': 'jpn',
2575 'jv': 'jav',
2576 'ka': 'kat',
2577 'kg': 'kon',
2578 'ki': 'kik',
2579 'kj': 'kua',
2580 'kk': 'kaz',
2581 'kl': 'kal',
2582 'km': 'khm',
2583 'kn': 'kan',
2584 'ko': 'kor',
2585 'kr': 'kau',
2586 'ks': 'kas',
2587 'ku': 'kur',
2588 'kv': 'kom',
2589 'kw': 'cor',
2590 'ky': 'kir',
2591 'la': 'lat',
2592 'lb': 'ltz',
2593 'lg': 'lug',
2594 'li': 'lim',
2595 'ln': 'lin',
2596 'lo': 'lao',
2597 'lt': 'lit',
2598 'lu': 'lub',
2599 'lv': 'lav',
2600 'mg': 'mlg',
2601 'mh': 'mah',
2602 'mi': 'mri',
2603 'mk': 'mkd',
2604 'ml': 'mal',
2605 'mn': 'mon',
2606 'mr': 'mar',
2607 'ms': 'msa',
2608 'mt': 'mlt',
2609 'my': 'mya',
2610 'na': 'nau',
2611 'nb': 'nob',
2612 'nd': 'nde',
2613 'ne': 'nep',
2614 'ng': 'ndo',
2615 'nl': 'nld',
2616 'nn': 'nno',
2617 'no': 'nor',
2618 'nr': 'nbl',
2619 'nv': 'nav',
2620 'ny': 'nya',
2621 'oc': 'oci',
2622 'oj': 'oji',
2623 'om': 'orm',
2624 'or': 'ori',
2625 'os': 'oss',
2626 'pa': 'pan',
2627 'pi': 'pli',
2628 'pl': 'pol',
2629 'ps': 'pus',
2630 'pt': 'por',
2631 'qu': 'que',
2632 'rm': 'roh',
2633 'rn': 'run',
2634 'ro': 'ron',
2635 'ru': 'rus',
2636 'rw': 'kin',
2637 'sa': 'san',
2638 'sc': 'srd',
2639 'sd': 'snd',
2640 'se': 'sme',
2641 'sg': 'sag',
2642 'si': 'sin',
2643 'sk': 'slk',
2644 'sl': 'slv',
2645 'sm': 'smo',
2646 'sn': 'sna',
2647 'so': 'som',
2648 'sq': 'sqi',
2649 'sr': 'srp',
2650 'ss': 'ssw',
2651 'st': 'sot',
2652 'su': 'sun',
2653 'sv': 'swe',
2654 'sw': 'swa',
2655 'ta': 'tam',
2656 'te': 'tel',
2657 'tg': 'tgk',
2658 'th': 'tha',
2659 'ti': 'tir',
2660 'tk': 'tuk',
2661 'tl': 'tgl',
2662 'tn': 'tsn',
2663 'to': 'ton',
2664 'tr': 'tur',
2665 'ts': 'tso',
2666 'tt': 'tat',
2667 'tw': 'twi',
2668 'ty': 'tah',
2669 'ug': 'uig',
2670 'uk': 'ukr',
2671 'ur': 'urd',
2672 'uz': 'uzb',
2673 've': 'ven',
2674 'vi': 'vie',
2675 'vo': 'vol',
2676 'wa': 'wln',
2677 'wo': 'wol',
2678 'xh': 'xho',
2679 'yi': 'yid',
2680 'yo': 'yor',
2681 'za': 'zha',
2682 'zh': 'zho',
2683 'zu': 'zul',
2684 }
2685
2686 @classmethod
2687 def short2long(cls, code):
2688 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2689 return cls._lang_map.get(code[:2])
2690
2691 @classmethod
2692 def long2short(cls, code):
2693 """Convert language code from ISO 639-2/T to ISO 639-1"""
2694 for short_name, long_name in cls._lang_map.items():
2695 if long_name == code:
2696 return short_name
2697
2698
4eb10f66
YCH
2699class ISO3166Utils(object):
2700 # From http://data.okfn.org/data/core/country-list
2701 _country_map = {
2702 'AF': 'Afghanistan',
2703 'AX': 'Åland Islands',
2704 'AL': 'Albania',
2705 'DZ': 'Algeria',
2706 'AS': 'American Samoa',
2707 'AD': 'Andorra',
2708 'AO': 'Angola',
2709 'AI': 'Anguilla',
2710 'AQ': 'Antarctica',
2711 'AG': 'Antigua and Barbuda',
2712 'AR': 'Argentina',
2713 'AM': 'Armenia',
2714 'AW': 'Aruba',
2715 'AU': 'Australia',
2716 'AT': 'Austria',
2717 'AZ': 'Azerbaijan',
2718 'BS': 'Bahamas',
2719 'BH': 'Bahrain',
2720 'BD': 'Bangladesh',
2721 'BB': 'Barbados',
2722 'BY': 'Belarus',
2723 'BE': 'Belgium',
2724 'BZ': 'Belize',
2725 'BJ': 'Benin',
2726 'BM': 'Bermuda',
2727 'BT': 'Bhutan',
2728 'BO': 'Bolivia, Plurinational State of',
2729 'BQ': 'Bonaire, Sint Eustatius and Saba',
2730 'BA': 'Bosnia and Herzegovina',
2731 'BW': 'Botswana',
2732 'BV': 'Bouvet Island',
2733 'BR': 'Brazil',
2734 'IO': 'British Indian Ocean Territory',
2735 'BN': 'Brunei Darussalam',
2736 'BG': 'Bulgaria',
2737 'BF': 'Burkina Faso',
2738 'BI': 'Burundi',
2739 'KH': 'Cambodia',
2740 'CM': 'Cameroon',
2741 'CA': 'Canada',
2742 'CV': 'Cape Verde',
2743 'KY': 'Cayman Islands',
2744 'CF': 'Central African Republic',
2745 'TD': 'Chad',
2746 'CL': 'Chile',
2747 'CN': 'China',
2748 'CX': 'Christmas Island',
2749 'CC': 'Cocos (Keeling) Islands',
2750 'CO': 'Colombia',
2751 'KM': 'Comoros',
2752 'CG': 'Congo',
2753 'CD': 'Congo, the Democratic Republic of the',
2754 'CK': 'Cook Islands',
2755 'CR': 'Costa Rica',
2756 'CI': 'Côte d\'Ivoire',
2757 'HR': 'Croatia',
2758 'CU': 'Cuba',
2759 'CW': 'Curaçao',
2760 'CY': 'Cyprus',
2761 'CZ': 'Czech Republic',
2762 'DK': 'Denmark',
2763 'DJ': 'Djibouti',
2764 'DM': 'Dominica',
2765 'DO': 'Dominican Republic',
2766 'EC': 'Ecuador',
2767 'EG': 'Egypt',
2768 'SV': 'El Salvador',
2769 'GQ': 'Equatorial Guinea',
2770 'ER': 'Eritrea',
2771 'EE': 'Estonia',
2772 'ET': 'Ethiopia',
2773 'FK': 'Falkland Islands (Malvinas)',
2774 'FO': 'Faroe Islands',
2775 'FJ': 'Fiji',
2776 'FI': 'Finland',
2777 'FR': 'France',
2778 'GF': 'French Guiana',
2779 'PF': 'French Polynesia',
2780 'TF': 'French Southern Territories',
2781 'GA': 'Gabon',
2782 'GM': 'Gambia',
2783 'GE': 'Georgia',
2784 'DE': 'Germany',
2785 'GH': 'Ghana',
2786 'GI': 'Gibraltar',
2787 'GR': 'Greece',
2788 'GL': 'Greenland',
2789 'GD': 'Grenada',
2790 'GP': 'Guadeloupe',
2791 'GU': 'Guam',
2792 'GT': 'Guatemala',
2793 'GG': 'Guernsey',
2794 'GN': 'Guinea',
2795 'GW': 'Guinea-Bissau',
2796 'GY': 'Guyana',
2797 'HT': 'Haiti',
2798 'HM': 'Heard Island and McDonald Islands',
2799 'VA': 'Holy See (Vatican City State)',
2800 'HN': 'Honduras',
2801 'HK': 'Hong Kong',
2802 'HU': 'Hungary',
2803 'IS': 'Iceland',
2804 'IN': 'India',
2805 'ID': 'Indonesia',
2806 'IR': 'Iran, Islamic Republic of',
2807 'IQ': 'Iraq',
2808 'IE': 'Ireland',
2809 'IM': 'Isle of Man',
2810 'IL': 'Israel',
2811 'IT': 'Italy',
2812 'JM': 'Jamaica',
2813 'JP': 'Japan',
2814 'JE': 'Jersey',
2815 'JO': 'Jordan',
2816 'KZ': 'Kazakhstan',
2817 'KE': 'Kenya',
2818 'KI': 'Kiribati',
2819 'KP': 'Korea, Democratic People\'s Republic of',
2820 'KR': 'Korea, Republic of',
2821 'KW': 'Kuwait',
2822 'KG': 'Kyrgyzstan',
2823 'LA': 'Lao People\'s Democratic Republic',
2824 'LV': 'Latvia',
2825 'LB': 'Lebanon',
2826 'LS': 'Lesotho',
2827 'LR': 'Liberia',
2828 'LY': 'Libya',
2829 'LI': 'Liechtenstein',
2830 'LT': 'Lithuania',
2831 'LU': 'Luxembourg',
2832 'MO': 'Macao',
2833 'MK': 'Macedonia, the Former Yugoslav Republic of',
2834 'MG': 'Madagascar',
2835 'MW': 'Malawi',
2836 'MY': 'Malaysia',
2837 'MV': 'Maldives',
2838 'ML': 'Mali',
2839 'MT': 'Malta',
2840 'MH': 'Marshall Islands',
2841 'MQ': 'Martinique',
2842 'MR': 'Mauritania',
2843 'MU': 'Mauritius',
2844 'YT': 'Mayotte',
2845 'MX': 'Mexico',
2846 'FM': 'Micronesia, Federated States of',
2847 'MD': 'Moldova, Republic of',
2848 'MC': 'Monaco',
2849 'MN': 'Mongolia',
2850 'ME': 'Montenegro',
2851 'MS': 'Montserrat',
2852 'MA': 'Morocco',
2853 'MZ': 'Mozambique',
2854 'MM': 'Myanmar',
2855 'NA': 'Namibia',
2856 'NR': 'Nauru',
2857 'NP': 'Nepal',
2858 'NL': 'Netherlands',
2859 'NC': 'New Caledonia',
2860 'NZ': 'New Zealand',
2861 'NI': 'Nicaragua',
2862 'NE': 'Niger',
2863 'NG': 'Nigeria',
2864 'NU': 'Niue',
2865 'NF': 'Norfolk Island',
2866 'MP': 'Northern Mariana Islands',
2867 'NO': 'Norway',
2868 'OM': 'Oman',
2869 'PK': 'Pakistan',
2870 'PW': 'Palau',
2871 'PS': 'Palestine, State of',
2872 'PA': 'Panama',
2873 'PG': 'Papua New Guinea',
2874 'PY': 'Paraguay',
2875 'PE': 'Peru',
2876 'PH': 'Philippines',
2877 'PN': 'Pitcairn',
2878 'PL': 'Poland',
2879 'PT': 'Portugal',
2880 'PR': 'Puerto Rico',
2881 'QA': 'Qatar',
2882 'RE': 'Réunion',
2883 'RO': 'Romania',
2884 'RU': 'Russian Federation',
2885 'RW': 'Rwanda',
2886 'BL': 'Saint Barthélemy',
2887 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2888 'KN': 'Saint Kitts and Nevis',
2889 'LC': 'Saint Lucia',
2890 'MF': 'Saint Martin (French part)',
2891 'PM': 'Saint Pierre and Miquelon',
2892 'VC': 'Saint Vincent and the Grenadines',
2893 'WS': 'Samoa',
2894 'SM': 'San Marino',
2895 'ST': 'Sao Tome and Principe',
2896 'SA': 'Saudi Arabia',
2897 'SN': 'Senegal',
2898 'RS': 'Serbia',
2899 'SC': 'Seychelles',
2900 'SL': 'Sierra Leone',
2901 'SG': 'Singapore',
2902 'SX': 'Sint Maarten (Dutch part)',
2903 'SK': 'Slovakia',
2904 'SI': 'Slovenia',
2905 'SB': 'Solomon Islands',
2906 'SO': 'Somalia',
2907 'ZA': 'South Africa',
2908 'GS': 'South Georgia and the South Sandwich Islands',
2909 'SS': 'South Sudan',
2910 'ES': 'Spain',
2911 'LK': 'Sri Lanka',
2912 'SD': 'Sudan',
2913 'SR': 'Suriname',
2914 'SJ': 'Svalbard and Jan Mayen',
2915 'SZ': 'Swaziland',
2916 'SE': 'Sweden',
2917 'CH': 'Switzerland',
2918 'SY': 'Syrian Arab Republic',
2919 'TW': 'Taiwan, Province of China',
2920 'TJ': 'Tajikistan',
2921 'TZ': 'Tanzania, United Republic of',
2922 'TH': 'Thailand',
2923 'TL': 'Timor-Leste',
2924 'TG': 'Togo',
2925 'TK': 'Tokelau',
2926 'TO': 'Tonga',
2927 'TT': 'Trinidad and Tobago',
2928 'TN': 'Tunisia',
2929 'TR': 'Turkey',
2930 'TM': 'Turkmenistan',
2931 'TC': 'Turks and Caicos Islands',
2932 'TV': 'Tuvalu',
2933 'UG': 'Uganda',
2934 'UA': 'Ukraine',
2935 'AE': 'United Arab Emirates',
2936 'GB': 'United Kingdom',
2937 'US': 'United States',
2938 'UM': 'United States Minor Outlying Islands',
2939 'UY': 'Uruguay',
2940 'UZ': 'Uzbekistan',
2941 'VU': 'Vanuatu',
2942 'VE': 'Venezuela, Bolivarian Republic of',
2943 'VN': 'Viet Nam',
2944 'VG': 'Virgin Islands, British',
2945 'VI': 'Virgin Islands, U.S.',
2946 'WF': 'Wallis and Futuna',
2947 'EH': 'Western Sahara',
2948 'YE': 'Yemen',
2949 'ZM': 'Zambia',
2950 'ZW': 'Zimbabwe',
2951 }
2952
2953 @classmethod
2954 def short2full(cls, code):
2955 """Convert an ISO 3166-2 country code to the corresponding full name"""
2956 return cls._country_map.get(code.upper())
2957
2958
91410c9b 2959class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2461f79d
PH
2960 def __init__(self, proxies=None):
2961 # Set default handlers
2962 for type in ('http', 'https'):
2963 setattr(self, '%s_open' % type,
2964 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
2965 meth(r, proxy, type))
2966 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2967
91410c9b 2968 def proxy_open(self, req, proxy, type):
2461f79d 2969 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
2970 if req_proxy is not None:
2971 proxy = req_proxy
2461f79d
PH
2972 del req.headers['Ytdl-request-proxy']
2973
2974 if proxy == '__noproxy__':
2975 return None # No Proxy
51fb4995 2976 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188
YCH
2977 req.add_header('Ytdl-socks-proxy', proxy)
2978 # youtube-dl's http/https handlers take care of wrapping the socket with SOCKS
2979 return None
91410c9b
PH
2980 return compat_urllib_request.ProxyHandler.proxy_open(
2981 self, req, proxy, type)
5bc880b9
YCH
2982
2983
2984def ohdave_rsa_encrypt(data, exponent, modulus):
2985 '''
2986 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
2987
2988 Input:
2989 data: data to encrypt, bytes-like object
2990 exponent, modulus: parameter e and N of RSA algorithm, both integer
2991 Output: hex string of encrypted data
2992
2993 Limitation: supports one block encryption only
2994 '''
2995
2996 payload = int(binascii.hexlify(data[::-1]), 16)
2997 encrypted = pow(payload, exponent, modulus)
2998 return '%x' % encrypted
81bdc8fd
YCH
2999
3000
5eb6bdce 3001def encode_base_n(num, n, table=None):
59f898b7 3002 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
59f898b7
YCH
3003 if not table:
3004 table = FULL_TABLE[:n]
3005
5eb6bdce
YCH
3006 if n > len(table):
3007 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
3008
3009 if num == 0:
3010 return table[0]
3011
81bdc8fd
YCH
3012 ret = ''
3013 while num:
3014 ret = table[num % n] + ret
3015 num = num // n
3016 return ret
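# Editorial usage sketch (not part of the original source):
#     encode_base_n(255, 16)  ->  'ff'
#     encode_base_n(0, 2)     ->  '0'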
f52354a8
YCH
3017
3018
3019def decode_packed_codes(code):
3020 mobj = re.search(
680079be 3021 r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)",
f52354a8
YCH
3022 code)
3023 obfuscated_code, base, count, symbols = mobj.groups()
3024 base = int(base)
3025 count = int(count)
3026 symbols = symbols.split('|')
3027 symbol_table = {}
3028
3029 while count:
3030 count -= 1
5eb6bdce 3031 base_n_count = encode_base_n(count, base)
f52354a8
YCH
3032 symbol_table[base_n_count] = symbols[count] or base_n_count
3033
3034 return re.sub(
3035 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
3036 obfuscated_code)
e154c651 3037
3038
3039def parse_m3u8_attributes(attrib):
3040 info = {}
3041 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
3042 if val.startswith('"'):
3043 val = val[1:-1]
3044 info[key] = val
3045 return info
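# Editorial usage sketch (not part of the original source):
#     parse_m3u8_attributes('BANDWIDTH=1280000,CODECS="mp4a.40.2,avc1.4d401f"')
#         ->  {'BANDWIDTH': '1280000', 'CODECS': 'mp4a.40.2,avc1.4d401f'}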
1143535d
YCH
3046
3047
3048def urshift(val, n):
3049 return val >> n if val >= 0 else (val + 0x100000000) >> n
d3f8e038
YCH
3050
3051
3052# Based on png2str() written by @gdkchan and improved by @yokrysty
3053# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
3054def decode_png(png_data):
3055 # Reference: https://www.w3.org/TR/PNG/
3056 header = png_data[8:]
3057
3058 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
3059 raise IOError('Not a valid PNG file.')
3060
3061 int_map = {1: '>B', 2: '>H', 4: '>I'}
3062 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
3063
3064 chunks = []
3065
3066 while header:
3067 length = unpack_integer(header[:4])
3068 header = header[4:]
3069
3070 chunk_type = header[:4]
3071 header = header[4:]
3072
3073 chunk_data = header[:length]
3074 header = header[length:]
3075
3076 header = header[4:] # Skip CRC
3077
3078 chunks.append({
3079 'type': chunk_type,
3080 'length': length,
3081 'data': chunk_data
3082 })
3083
3084 ihdr = chunks[0]['data']
3085
3086 width = unpack_integer(ihdr[:4])
3087 height = unpack_integer(ihdr[4:8])
3088
3089 idat = b''
3090
3091 for chunk in chunks:
3092 if chunk['type'] == b'IDAT':
3093 idat += chunk['data']
3094
3095 if not idat:
3096 raise IOError('Unable to read PNG data.')
3097
3098 decompressed_data = bytearray(zlib.decompress(idat))
3099
3100 stride = width * 3
3101 pixels = []
3102
3103 def _get_pixel(idx):
3104 x = idx % stride
3105 y = idx // stride
3106 return pixels[y][x]
3107
3108 for y in range(height):
3109 basePos = y * (1 + stride)
3110 filter_type = decompressed_data[basePos]
3111
3112 current_row = []
3113
3114 pixels.append(current_row)
3115
3116 for x in range(stride):
3117 color = decompressed_data[1 + basePos + x]
3118 basex = y * stride + x
3119 left = 0
3120 up = 0
3121
3122 if x > 2:
3123 left = _get_pixel(basex - 3)
3124 if y > 0:
3125 up = _get_pixel(basex - stride)
3126
3127 if filter_type == 1: # Sub
3128 color = (color + left) & 0xff
3129 elif filter_type == 2: # Up
3130 color = (color + up) & 0xff
3131 elif filter_type == 3: # Average
3132 color = (color + ((left + up) >> 1)) & 0xff
3133 elif filter_type == 4: # Paeth
3134 a = left
3135 b = up
3136 c = 0
3137
3138 if x > 2 and y > 0:
3139 c = _get_pixel(basex - stride - 3)
3140
3141 p = a + b - c
3142
3143 pa = abs(p - a)
3144 pb = abs(p - b)
3145 pc = abs(p - c)
3146
3147 if pa <= pb and pa <= pc:
3148 color = (color + a) & 0xff
3149 elif pb <= pc:
3150 color = (color + b) & 0xff
3151 else:
3152 color = (color + c) & 0xff
3153
3154 current_row.append(color)
3155
3156 return width, height, pixels
efa97bdc
YCH
3157
3158
3159def write_xattr(path, key, value):
3160 # This mess below finds the best xattr tool for the job
3161 try:
3162 # try the pyxattr module...
3163 import xattr
3164
53a7e3d2
YCH
3165 if hasattr(xattr, 'set'): # pyxattr
3166 # Unicode arguments are not supported in python-pyxattr until
3167 # version 0.5.0
3168 # See https://github.com/rg3/youtube-dl/issues/5498
3169 pyxattr_required_version = '0.5.0'
3170 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
3171 # TODO: fallback to CLI tools
3172 raise XAttrUnavailableError(
3173 'python-pyxattr is detected but is too old. '
3174 'youtube-dl requires %s or above while your version is %s. '
3175 'Falling back to other xattr implementations' % (
3176 pyxattr_required_version, xattr.__version__))
3177
3178 setxattr = xattr.set
3179 else: # xattr
3180 setxattr = xattr.setxattr
efa97bdc
YCH
3181
3182 try:
53a7e3d2 3183 setxattr(path, key, value)
efa97bdc
YCH
3184 except EnvironmentError as e:
3185 raise XAttrMetadataError(e.errno, e.strerror)
3186
3187 except ImportError:
3188 if compat_os_name == 'nt':
3189 # Write xattrs to NTFS Alternate Data Streams:
3190 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
3191 assert ':' not in key
3192 assert os.path.exists(path)
3193
3194 ads_fn = path + ':' + key
3195 try:
3196 with open(ads_fn, 'wb') as f:
3197 f.write(value)
3198 except EnvironmentError as e:
3199 raise XAttrMetadataError(e.errno, e.strerror)
3200 else:
3201 user_has_setfattr = check_executable('setfattr', ['--version'])
3202 user_has_xattr = check_executable('xattr', ['-h'])
3203
3204 if user_has_setfattr or user_has_xattr:
3205
3206 value = value.decode('utf-8')
3207 if user_has_setfattr:
3208 executable = 'setfattr'
3209 opts = ['-n', key, '-v', value]
3210 elif user_has_xattr:
3211 executable = 'xattr'
3212 opts = ['-w', key, value]
3213
3214 cmd = ([encodeFilename(executable, True)] +
3215 [encodeArgument(o) for o in opts] +
3216 [encodeFilename(path, True)])
3217
3218 try:
3219 p = subprocess.Popen(
3220 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
3221 except EnvironmentError as e:
3222 raise XAttrMetadataError(e.errno, e.strerror)
3223 stdout, stderr = p.communicate()
3224 stderr = stderr.decode('utf-8', 'replace')
3225 if p.returncode != 0:
3226 raise XAttrMetadataError(p.returncode, stderr)
3227
3228 else:
3229 # On Unix, but we can't find pyxattr, setfattr, or xattr.
3230 if sys.platform.startswith('linux'):
3231 raise XAttrUnavailableError(
3232 "Couldn't find a tool to set the xattrs. "
3233 "Install either the python 'pyxattr' or 'xattr' "
3234 "modules, or the GNU 'attr' package "
3235 "(which contains the 'setfattr' tool).")
3236 else:
3237 raise XAttrUnavailableError(
3238 "Couldn't find a tool to set the xattrs. "
3239 "Install either the python 'xattr' module, "
3240 "or the 'xattr' binary.")