d77c3dfd 1#!/usr/bin/env python
dcdb292f 2# coding: utf-8
d77c3dfd 3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
1e399778 6import base64
5bc880b9 7import binascii
912b38b4 8import calendar
676eb3f2 9import codecs
62e609ab 10import contextlib
e3946f98 11import ctypes
c496ca96
PH
12import datetime
13import email.utils
f45c185f 14import errno
be4a824d 15import functools
d77c3dfd 16import gzip
03f9daab 17import io
79a2e94e 18import itertools
f4bfd65f 19import json
d77c3dfd 20import locale
02dbf93f 21import math
347de493 22import operator
d77c3dfd 23import os
4eb7f1d1 24import pipes
c496ca96 25import platform
d77c3dfd 26import re
c496ca96 27import socket
79a2e94e 28import ssl
1c088fa8 29import subprocess
d77c3dfd 30import sys
181c8655 31import tempfile
01951dda 32import traceback
bcf89ce6 33import xml.etree.ElementTree
d77c3dfd 34import zlib
d77c3dfd 35
8c25f81b 36from .compat import (
8bb56eee 37 compat_HTMLParser,
8f9312c3 38 compat_basestring,
8c25f81b 39 compat_chr,
36e6f62c 40 compat_etree_fromstring,
8c25f81b 41 compat_html_entities,
55b2f099 42 compat_html_entities_html5,
be4a824d 43 compat_http_client,
c86b6142 44 compat_kwargs,
efa97bdc 45 compat_os_name,
8c25f81b 46 compat_parse_qs,
702ccf2d 47 compat_shlex_quote,
be4a824d 48 compat_socket_create_connection,
8c25f81b 49 compat_str,
edaa23f8 50 compat_struct_pack,
d3f8e038 51 compat_struct_unpack,
8c25f81b
PH
52 compat_urllib_error,
53 compat_urllib_parse,
15707c7e 54 compat_urllib_parse_urlencode,
8c25f81b 55 compat_urllib_parse_urlparse,
7581bfc9 56 compat_urllib_parse_unquote_plus,
8c25f81b
PH
57 compat_urllib_request,
58 compat_urlparse,
810c10ba 59 compat_xpath,
8c25f81b 60)
4644ac55 61
71aff188
YCH
62from .socks import (
63 ProxyType,
64 sockssocket,
65)
66
4644ac55 67
51fb4995
YCH
68def register_socks_protocols():
69 # "Register" SOCKS protocols
d5ae6bb5
YCH
70 # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
71 # URLs with protocols not in urlparse.uses_netloc are not handled correctly
51fb4995
YCH
72 for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
73 if scheme not in compat_urlparse.uses_netloc:
74 compat_urlparse.uses_netloc.append(scheme)
75
76
468e2e92
FV
77# This is not clearly defined otherwise
78compiled_regex_type = type(re.compile(''))
79
3e669f36 80std_headers = {
15d10678 81 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
59ae15a5
PH
82 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
83 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
84 'Accept-Encoding': 'gzip, deflate',
85 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 86}
f427df17 87
5f6a1245 88
fb37eb25
S
89USER_AGENTS = {
90 'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
91}
92
93
bf42a990
S
94NO_DEFAULT = object()
95
7105440c
YCH
96ENGLISH_MONTH_NAMES = [
97 'January', 'February', 'March', 'April', 'May', 'June',
98 'July', 'August', 'September', 'October', 'November', 'December']
99
f6717dec
S
100MONTH_NAMES = {
101 'en': ENGLISH_MONTH_NAMES,
102 'fr': [
3e4185c3
S
103 'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
104 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
f6717dec 105}
a942d6cb 106
a7aaa398
S
107KNOWN_EXTENSIONS = (
108 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
109 'flv', 'f4v', 'f4a', 'f4b',
110 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
111 'mkv', 'mka', 'mk3d',
112 'avi', 'divx',
113 'mov',
114 'asf', 'wmv', 'wma',
115 '3gp', '3g2',
116 'mp3',
117 'flac',
118 'ape',
119 'wav',
120 'f4f', 'f4m', 'm3u8', 'smil')
121
c587cbb7 122# needed for sanitizing filenames in restricted mode
c8827027 123ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
124 itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
125 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))
c587cbb7 126
46f59e89
S
127DATE_FORMATS = (
128 '%d %B %Y',
129 '%d %b %Y',
130 '%B %d %Y',
131 '%b %d %Y',
132 '%b %dst %Y %I:%M',
133 '%b %dnd %Y %I:%M',
134 '%b %dth %Y %I:%M',
135 '%Y %m %d',
136 '%Y-%m-%d',
137 '%Y/%m/%d',
81c13222 138 '%Y/%m/%d %H:%M',
46f59e89
S
139 '%Y/%m/%d %H:%M:%S',
140 '%Y-%m-%d %H:%M:%S',
141 '%Y-%m-%d %H:%M:%S.%f',
142 '%d.%m.%Y %H:%M',
143 '%d.%m.%Y %H.%M',
144 '%Y-%m-%dT%H:%M:%SZ',
145 '%Y-%m-%dT%H:%M:%S.%fZ',
146 '%Y-%m-%dT%H:%M:%S.%f0Z',
147 '%Y-%m-%dT%H:%M:%S',
148 '%Y-%m-%dT%H:%M:%S.%f',
149 '%Y-%m-%dT%H:%M',
c6eed6b8
S
150 '%b %d %Y at %H:%M',
151 '%b %d %Y at %H:%M:%S',
46f59e89
S
152)
153
154DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
155DATE_FORMATS_DAY_FIRST.extend([
156 '%d-%m-%Y',
157 '%d.%m.%Y',
158 '%d.%m.%y',
159 '%d/%m/%Y',
160 '%d/%m/%y',
161 '%d/%m/%Y %H:%M:%S',
162])
163
164DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
165DATE_FORMATS_MONTH_FIRST.extend([
166 '%m-%d-%Y',
167 '%m.%d.%Y',
168 '%m/%d/%Y',
169 '%m/%d/%y',
170 '%m/%d/%Y %H:%M:%S',
171])
172
06b3fe29
S
173PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"
174
7105440c 175
d77c3dfd 176def preferredencoding():
59ae15a5 177 """Get preferred encoding.
d77c3dfd 178
59ae15a5
PH
179 Returns the best encoding scheme for the system, based on
180 locale.getpreferredencoding() and some further tweaks.
181 """
182 try:
183 pref = locale.getpreferredencoding()
28e614de 184 'TEST'.encode(pref)
70a1165b 185 except Exception:
59ae15a5 186 pref = 'UTF-8'
bae611f2 187
59ae15a5 188 return pref
d77c3dfd 189
f4bfd65f 190
181c8655 191def write_json_file(obj, fn):
1394646a 192 """ Encode obj as JSON and write it to fn, atomically if possible """
181c8655 193
92120217 194 fn = encodeFilename(fn)
61ee5aeb 195 if sys.version_info < (3, 0) and sys.platform != 'win32':
ec5f6016
JMF
196 encoding = get_filesystem_encoding()
197 # os.path.basename returns a bytes object, but NamedTemporaryFile
198 # will fail if the filename contains non-ASCII characters unless we
199 # use a unicode object
200 path_basename = lambda f: os.path.basename(fn).decode(encoding)
201 # the same for os.path.dirname
202 path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
203 else:
204 path_basename = os.path.basename
205 path_dirname = os.path.dirname
206
73159f99
S
207 args = {
208 'suffix': '.tmp',
ec5f6016
JMF
209 'prefix': path_basename(fn) + '.',
210 'dir': path_dirname(fn),
73159f99
S
211 'delete': False,
212 }
213
181c8655
PH
214 # In Python 2.x, json.dump expects a bytestream.
215 # In Python 3.x, it writes to a character stream
216 if sys.version_info < (3, 0):
73159f99 217 args['mode'] = 'wb'
181c8655 218 else:
73159f99
S
219 args.update({
220 'mode': 'w',
221 'encoding': 'utf-8',
222 })
223
c86b6142 224 tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))
181c8655
PH
225
226 try:
227 with tf:
228 json.dump(obj, tf)
1394646a
IK
229 if sys.platform == 'win32':
230 # Need to remove existing file on Windows, else os.rename raises
231 # WindowsError or FileExistsError.
232 try:
233 os.unlink(fn)
234 except OSError:
235 pass
181c8655 236 os.rename(tf.name, fn)
70a1165b 237 except Exception:
181c8655
PH
238 try:
239 os.remove(tf.name)
240 except OSError:
241 pass
242 raise
243
244
245if sys.version_info >= (2, 7):
ee114368 246 def find_xpath_attr(node, xpath, key, val=None):
59ae56fa 247 """ Find the xpath xpath[@key=val] """
5d2354f1 248 assert re.match(r'^[a-zA-Z_-]+$', key)
ee114368 249 expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
59ae56fa
PH
250 return node.find(expr)
251else:
ee114368 252 def find_xpath_attr(node, xpath, key, val=None):
810c10ba 253 for f in node.findall(compat_xpath(xpath)):
ee114368
S
254 if key not in f.attrib:
255 continue
256 if val is None or f.attrib.get(key) == val:
59ae56fa
PH
257 return f
258 return None
259
d7e66d39
JMF
260# On python2.6 the xml.etree.ElementTree.Element methods don't support
261# the namespace parameter
5f6a1245
JW
262
263
d7e66d39
JMF
264def xpath_with_ns(path, ns_map):
265 components = [c.split(':') for c in path.split('/')]
266 replaced = []
267 for c in components:
268 if len(c) == 1:
269 replaced.append(c[0])
270 else:
271 ns, tag = c
272 replaced.append('{%s}%s' % (ns_map[ns], tag))
273 return '/'.join(replaced)
274
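# Illustrative usage (editor's sketch; the namespace URL below is only an example):
#     xpath_with_ns('media:content/media:url', {'media': 'http://search.yahoo.com/mrss/'})
#     -> '{http://search.yahoo.com/mrss/}content/{http://search.yahoo.com/mrss/}url'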
d77c3dfd 275
a41fb80c 276def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
578c0745 277 def _find_xpath(xpath):
810c10ba 278 return node.find(compat_xpath(xpath))
578c0745
S
279
280 if isinstance(xpath, (str, compat_str)):
281 n = _find_xpath(xpath)
282 else:
283 for xp in xpath:
284 n = _find_xpath(xp)
285 if n is not None:
286 break
d74bebd5 287
8e636da4 288 if n is None:
bf42a990
S
289 if default is not NO_DEFAULT:
290 return default
291 elif fatal:
bf0ff932
PH
292 name = xpath if name is None else name
293 raise ExtractorError('Could not find XML element %s' % name)
294 else:
295 return None
a41fb80c
S
296 return n
297
298
299def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
8e636da4
S
300 n = xpath_element(node, xpath, name, fatal=fatal, default=default)
301 if n is None or n == default:
302 return n
303 if n.text is None:
304 if default is not NO_DEFAULT:
305 return default
306 elif fatal:
307 name = xpath if name is None else name
308 raise ExtractorError('Could not find XML element\'s text %s' % name)
309 else:
310 return None
311 return n.text
a41fb80c
S
312
313
314def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
315 n = find_xpath_attr(node, xpath, key)
316 if n is None:
317 if default is not NO_DEFAULT:
318 return default
319 elif fatal:
320 name = '%s[@%s]' % (xpath, key) if name is None else name
321 raise ExtractorError('Could not find XML attribute %s' % name)
322 else:
323 return None
324 return n.attrib[key]
bf0ff932
PH
325
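# Illustrative usage of the xpath helpers above (editor's sketch; the XML snippet is hypothetical):
#     doc = compat_etree_fromstring('<root><a x="1">foo</a></root>')
#     xpath_text(doc, 'a')                 -> 'foo'
#     xpath_attr(doc, 'a', 'x')            -> '1'
#     xpath_text(doc, 'b', default='bar')  -> 'bar'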
326
9e6dd238 327def get_element_by_id(id, html):
43e8fafd 328 """Return the content of the tag with the specified ID in the passed HTML document"""
611c1dd9 329 return get_element_by_attribute('id', id, html)
43e8fafd 330
12ea2f30 331
84c237fb
YCH
332def get_element_by_class(class_name, html):
333 return get_element_by_attribute(
334 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
335 html, escape_value=False)
336
337
338def get_element_by_attribute(attribute, value, html, escape_value=True):
43e8fafd 339 """Return the content of the tag with the specified attribute in the passed HTML document"""
9e6dd238 340
84c237fb
YCH
341 value = re.escape(value) if escape_value else value
342
38285056
PH
343 m = re.search(r'''(?xs)
344 <([a-zA-Z0-9:._-]+)
abc97b5e 345 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
38285056 346 \s+%s=['"]?%s['"]?
abc97b5e 347 (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
38285056
PH
348 \s*>
349 (?P<content>.*?)
350 </\1>
84c237fb 351 ''' % (re.escape(attribute), value), html)
38285056
PH
352
353 if not m:
354 return None
355 res = m.group('content')
356
357 if res.startswith('"') or res.startswith("'"):
358 res = res[1:-1]
a921f407 359
38285056 360 return unescapeHTML(res)
a921f407 361
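# Illustrative usage (editor's sketch; the HTML snippets are hypothetical):
#     get_element_by_attribute('id', 'intro', '<p id="intro">Hello &amp; welcome</p>')
#     -> 'Hello & welcome'
#     get_element_by_class('foo', '<div class="foo bar">x</div>')
#     -> 'x'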
c5229f39 362
8bb56eee
BF
363class HTMLAttributeParser(compat_HTMLParser):
364 """Trivial HTML parser to gather the attributes for a single element"""
365 def __init__(self):
c5229f39 366 self.attrs = {}
8bb56eee
BF
367 compat_HTMLParser.__init__(self)
368
369 def handle_starttag(self, tag, attrs):
370 self.attrs = dict(attrs)
371
c5229f39 372
8bb56eee
BF
373def extract_attributes(html_element):
374 """Given a string for an HTML element such as
375 <el
376 a="foo" B="bar" c="&98;az" d=boz
377 empty= noval entity="&amp;"
378 sq='"' dq="'"
379 >
380 Decode and return a dictionary of attributes.
381 {
382 'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
383 'empty': '', 'noval': None, 'entity': '&',
384 'sq': '"', 'dq': '\''
385 }.
386 NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
387 but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
388 """
389 parser = HTMLAttributeParser()
390 parser.feed(html_element)
391 parser.close()
392 return parser.attrs
9e6dd238 393
c5229f39 394
9e6dd238 395def clean_html(html):
59ae15a5 396 """Clean an HTML snippet into a readable string"""
dd622d7c
PH
397
398 if html is None: # Convenience for sanitizing descriptions etc.
399 return html
400
59ae15a5
PH
401 # Newline vs <br />
402 html = html.replace('\n', ' ')
6b3aef80
FV
403 html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
404 html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
59ae15a5
PH
405 # Strip html tags
406 html = re.sub('<.*?>', '', html)
407 # Replace html entities
408 html = unescapeHTML(html)
7decf895 409 return html.strip()
9e6dd238
FV
410
411
d77c3dfd 412def sanitize_open(filename, open_mode):
59ae15a5
PH
413 """Try to open the given filename, and slightly tweak it if this fails.
414
415 Attempts to open the given filename. If this fails, it tries to change
416 the filename slightly, step by step, until it's either able to open it
417 or it fails and raises a final exception, like the standard open()
418 function.
419
420 It returns the tuple (stream, definitive_file_name).
421 """
422 try:
28e614de 423 if filename == '-':
59ae15a5
PH
424 if sys.platform == 'win32':
425 import msvcrt
426 msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
898280a0 427 return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
59ae15a5
PH
428 stream = open(encodeFilename(filename), open_mode)
429 return (stream, filename)
430 except (IOError, OSError) as err:
f45c185f
PH
431 if err.errno in (errno.EACCES,):
432 raise
59ae15a5 433
f45c185f 434 # In case of error, try to remove win32 forbidden chars
d55de57b 435 alt_filename = sanitize_path(filename)
f45c185f
PH
436 if alt_filename == filename:
437 raise
438 else:
439 # An exception here should be caught in the caller
d55de57b 440 stream = open(encodeFilename(alt_filename), open_mode)
f45c185f 441 return (stream, alt_filename)
d77c3dfd
FV
442
443
444def timeconvert(timestr):
59ae15a5
PH
445 """Convert RFC 2822 defined time string into system timestamp"""
446 timestamp = None
447 timetuple = email.utils.parsedate_tz(timestr)
448 if timetuple is not None:
449 timestamp = email.utils.mktime_tz(timetuple)
450 return timestamp
1c469a94 451
5f6a1245 452
796173d0 453def sanitize_filename(s, restricted=False, is_id=False):
59ae15a5
PH
454 """Sanitizes a string so it could be used as part of a filename.
455 If restricted is set, use a stricter subset of allowed characters.
796173d0 456 Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
59ae15a5
PH
457 """
458 def replace_insane(char):
c587cbb7
AT
459 if restricted and char in ACCENT_CHARS:
460 return ACCENT_CHARS[char]
59ae15a5
PH
461 if char == '?' or ord(char) < 32 or ord(char) == 127:
462 return ''
463 elif char == '"':
464 return '' if restricted else '\''
465 elif char == ':':
466 return '_-' if restricted else ' -'
467 elif char in '\\/|*<>':
468 return '_'
627dcfff 469 if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
59ae15a5
PH
470 return '_'
471 if restricted and ord(char) > 127:
472 return '_'
473 return char
474
2aeb06d6
PH
475 # Handle timestamps
476 s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
28e614de 477 result = ''.join(map(replace_insane, s))
796173d0
PH
478 if not is_id:
479 while '__' in result:
480 result = result.replace('__', '_')
481 result = result.strip('_')
482 # Common case of "Foreign band name - English song title"
483 if restricted and result.startswith('-_'):
484 result = result[2:]
5a42414b
PH
485 if result.startswith('-'):
486 result = '_' + result[len('-'):]
a7440261 487 result = result.lstrip('.')
796173d0
PH
488 if not result:
489 result = '_'
59ae15a5 490 return result
d77c3dfd 491
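# Illustrative behaviour (editor's sketch):
#     sanitize_filename('A/B: C?')                     -> 'A_B - C'
#     sanitize_filename('New World record at 0:12:34') -> 'New World record at 0_12_34'
#     sanitize_filename('ÄBŒ', restricted=True)        -> 'ABOE'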
5f6a1245 492
a2aaf4db
S
493def sanitize_path(s):
494 """Sanitizes and normalizes path on Windows"""
495 if sys.platform != 'win32':
496 return s
be531ef1
S
497 drive_or_unc, _ = os.path.splitdrive(s)
498 if sys.version_info < (2, 7) and not drive_or_unc:
499 drive_or_unc, _ = os.path.splitunc(s)
500 norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
501 if drive_or_unc:
a2aaf4db
S
502 norm_path.pop(0)
503 sanitized_path = [
ec85ded8 504 path_part if path_part in ['.', '..'] else re.sub(r'(?:[/<>:"\|\\?\*]|[\s.]$)', '#', path_part)
a2aaf4db 505 for path_part in norm_path]
be531ef1
S
506 if drive_or_unc:
507 sanitized_path.insert(0, drive_or_unc + os.path.sep)
a2aaf4db
S
508 return os.path.join(*sanitized_path)
509
510
67dda517
S
511# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
512# unwanted failures due to missing protocol
17bcc626
S
513def sanitize_url(url):
514 return 'http:%s' % url if url.startswith('//') else url
515
516
67dda517 517def sanitized_Request(url, *args, **kwargs):
17bcc626 518 return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)
67dda517
S
519
520
d77c3dfd 521def orderedSet(iterable):
59ae15a5
PH
522 """ Remove all duplicates from the input iterable """
523 res = []
524 for el in iterable:
525 if el not in res:
526 res.append(el)
527 return res
d77c3dfd 528
912b38b4 529
55b2f099 530def _htmlentity_transform(entity_with_semicolon):
4e408e47 531 """Transforms an HTML entity to a character."""
55b2f099
YCH
532 entity = entity_with_semicolon[:-1]
533
4e408e47
PH
534 # Known non-numeric HTML entity
535 if entity in compat_html_entities.name2codepoint:
536 return compat_chr(compat_html_entities.name2codepoint[entity])
537
55b2f099
YCH
538 # TODO: HTML5 allows entities without a semicolon. For example,
539 # '&Eacuteric' should be decoded as 'Éric'.
540 if entity_with_semicolon in compat_html_entities_html5:
541 return compat_html_entities_html5[entity_with_semicolon]
542
91757b0f 543 mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
4e408e47
PH
544 if mobj is not None:
545 numstr = mobj.group(1)
28e614de 546 if numstr.startswith('x'):
4e408e47 547 base = 16
28e614de 548 numstr = '0%s' % numstr
4e408e47
PH
549 else:
550 base = 10
7aefc49c
S
551 # See https://github.com/rg3/youtube-dl/issues/7518
552 try:
553 return compat_chr(int(numstr, base))
554 except ValueError:
555 pass
4e408e47
PH
556
557 # Unknown entity in name, return its literal representation
7a3f0c00 558 return '&%s;' % entity
4e408e47
PH
559
560
d77c3dfd 561def unescapeHTML(s):
912b38b4
PH
562 if s is None:
563 return None
564 assert type(s) == compat_str
d77c3dfd 565
4e408e47 566 return re.sub(
55b2f099 567 r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
d77c3dfd 568
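# Illustrative behaviour (editor's sketch):
#     unescapeHTML('&amp;')    -> '&'
#     unescapeHTML('&eacute;') -> 'é'
#     unescapeHTML('&#47;')    -> '/'
#     unescapeHTML('&#x2F;')   -> '/'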
8bf48f23 569
aa49acd1
S
570def get_subprocess_encoding():
571 if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
572 # For subprocess calls, encode with locale encoding
573 # Refer to http://stackoverflow.com/a/9951851/35070
574 encoding = preferredencoding()
575 else:
576 encoding = sys.getfilesystemencoding()
577 if encoding is None:
578 encoding = 'utf-8'
579 return encoding
580
581
8bf48f23 582def encodeFilename(s, for_subprocess=False):
59ae15a5
PH
583 """
584 @param s The name of the file
585 """
d77c3dfd 586
8bf48f23 587 assert type(s) == compat_str
d77c3dfd 588
59ae15a5
PH
589 # Python 3 has a Unicode API
590 if sys.version_info >= (3, 0):
591 return s
0f00efed 592
aa49acd1
S
593 # Pass '' directly to use Unicode APIs on Windows 2000 and up
594 # (Detecting Windows NT 4 is tricky because 'major >= 4' would
595 # match Windows 9x series as well. Besides, NT 4 is obsolete.)
596 if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
597 return s
598
8ee239e9
YCH
599 # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
600 if sys.platform.startswith('java'):
601 return s
602
aa49acd1
S
603 return s.encode(get_subprocess_encoding(), 'ignore')
604
605
606def decodeFilename(b, for_subprocess=False):
607
608 if sys.version_info >= (3, 0):
609 return b
610
611 if not isinstance(b, bytes):
612 return b
613
614 return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 615
f07b74fc
PH
616
617def encodeArgument(s):
618 if not isinstance(s, compat_str):
619 # Legacy code that uses byte strings
620 # Uncomment the following line after fixing all post processors
7af808a5 621 # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
f07b74fc
PH
622 s = s.decode('ascii')
623 return encodeFilename(s, True)
624
625
aa49acd1
S
626def decodeArgument(b):
627 return decodeFilename(b, True)
628
629
8271226a
PH
630def decodeOption(optval):
631 if optval is None:
632 return optval
633 if isinstance(optval, bytes):
634 optval = optval.decode(preferredencoding())
635
636 assert isinstance(optval, compat_str)
637 return optval
1c256f70 638
5f6a1245 639
4539dd30
PH
640def formatSeconds(secs):
641 if secs > 3600:
642 return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
643 elif secs > 60:
644 return '%d:%02d' % (secs // 60, secs % 60)
645 else:
646 return '%d' % secs
647
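# Illustrative behaviour (editor's sketch):
#     formatSeconds(3661) -> '1:01:01'
#     formatSeconds(61)   -> '1:01'
#     formatSeconds(7)    -> '7'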
a0ddb8a2 648
be4a824d
PH
649def make_HTTPS_handler(params, **kwargs):
650 opts_no_check_certificate = params.get('nocheckcertificate', False)
0db261ba 651 if hasattr(ssl, 'create_default_context'): # Python >= 3.4 or 2.7.9
be5f2c19 652 context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
0db261ba 653 if opts_no_check_certificate:
be5f2c19 654 context.check_hostname = False
0db261ba 655 context.verify_mode = ssl.CERT_NONE
a2366922 656 try:
be4a824d 657 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
a2366922
PH
658 except TypeError:
659 # Python 2.7.8
660 # (create_default_context present but HTTPSHandler has no context=)
661 pass
662
663 if sys.version_info < (3, 2):
d7932313 664 return YoutubeDLHTTPSHandler(params, **kwargs)
aa37e3d4 665 else: # Python < 3.4
d7932313 666 context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
ea6d901e 667 context.verify_mode = (ssl.CERT_NONE
dca08720 668 if opts_no_check_certificate
ea6d901e 669 else ssl.CERT_REQUIRED)
303b479e 670 context.set_default_verify_paths()
be4a824d 671 return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 672
732ea2f0 673
08f2a92c
JMF
674def bug_reports_message():
675 if ytdl_is_updateable():
676 update_cmd = 'type youtube-dl -U to update'
677 else:
678 update_cmd = 'see https://yt-dl.org/update on how to update'
679 msg = '; please report this issue on https://yt-dl.org/bug .'
680 msg += ' Make sure you are using the latest version; %s.' % update_cmd
681 msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
682 return msg
683
684
1c256f70
PH
685class ExtractorError(Exception):
686 """Error during info extraction."""
5f6a1245 687
d11271dd 688 def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
9a82b238
PH
689 """ tb, if given, is the original traceback (so that it can be printed out).
690 If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
691 """
692
693 if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
694 expected = True
d11271dd
PH
695 if video_id is not None:
696 msg = video_id + ': ' + msg
410f3e73 697 if cause:
28e614de 698 msg += ' (caused by %r)' % cause
9a82b238 699 if not expected:
08f2a92c 700 msg += bug_reports_message()
1c256f70 701 super(ExtractorError, self).__init__(msg)
d5979c5d 702
1c256f70 703 self.traceback = tb
8cc83b8d 704 self.exc_info = sys.exc_info() # preserve original exception
2eabb802 705 self.cause = cause
d11271dd 706 self.video_id = video_id
1c256f70 707
01951dda
PH
708 def format_traceback(self):
709 if self.traceback is None:
710 return None
28e614de 711 return ''.join(traceback.format_tb(self.traceback))
01951dda 712
1c256f70 713
416c7fcb
PH
714class UnsupportedError(ExtractorError):
715 def __init__(self, url):
716 super(UnsupportedError, self).__init__(
717 'Unsupported URL: %s' % url, expected=True)
718 self.url = url
719
720
55b3e45b
JMF
721class RegexNotFoundError(ExtractorError):
722 """Error when a regex didn't match"""
723 pass
724
725
d77c3dfd 726class DownloadError(Exception):
59ae15a5 727 """Download Error exception.
d77c3dfd 728
59ae15a5
PH
729 This exception may be thrown by FileDownloader objects if they are not
730 configured to continue on errors. They will contain the appropriate
731 error message.
732 """
5f6a1245 733
8cc83b8d
FV
734 def __init__(self, msg, exc_info=None):
735 """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
736 super(DownloadError, self).__init__(msg)
737 self.exc_info = exc_info
d77c3dfd
FV
738
739
740class SameFileError(Exception):
59ae15a5 741 """Same File exception.
d77c3dfd 742
59ae15a5
PH
743 This exception will be thrown by FileDownloader objects if they detect
744 multiple files would have to be downloaded to the same file on disk.
745 """
746 pass
d77c3dfd
FV
747
748
749class PostProcessingError(Exception):
59ae15a5 750 """Post Processing exception.
d77c3dfd 751
59ae15a5
PH
752 This exception may be raised by PostProcessor's .run() method to
753 indicate an error in the postprocessing task.
754 """
5f6a1245 755
7851b379
PH
756 def __init__(self, msg):
757 self.msg = msg
d77c3dfd 758
5f6a1245 759
d77c3dfd 760class MaxDownloadsReached(Exception):
59ae15a5
PH
761 """ --max-downloads limit has been reached. """
762 pass
d77c3dfd
FV
763
764
765class UnavailableVideoError(Exception):
59ae15a5 766 """Unavailable Format exception.
d77c3dfd 767
59ae15a5
PH
768 This exception will be thrown when a video is requested
769 in a format that is not available for that video.
770 """
771 pass
d77c3dfd
FV
772
773
774class ContentTooShortError(Exception):
59ae15a5 775 """Content Too Short exception.
d77c3dfd 776
59ae15a5
PH
777 This exception may be raised by FileDownloader objects when a file they
778 download is too small for what the server announced first, indicating
779 the connection was probably interrupted.
780 """
d77c3dfd 781
59ae15a5 782 def __init__(self, downloaded, expected):
2c7ed247 783 # Both in bytes
59ae15a5
PH
784 self.downloaded = downloaded
785 self.expected = expected
d77c3dfd 786
5f6a1245 787
efa97bdc
YCH
788class XAttrMetadataError(Exception):
789 def __init__(self, code=None, msg='Unknown error'):
790 super(XAttrMetadataError, self).__init__(msg)
791 self.code = code
bd264412 792 self.msg = msg
efa97bdc
YCH
793
794 # Parsing code and msg
795 if (self.code in (errno.ENOSPC, errno.EDQUOT) or
796 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
797 self.reason = 'NO_SPACE'
798 elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
799 self.reason = 'VALUE_TOO_LONG'
800 else:
801 self.reason = 'NOT_SUPPORTED'
802
803
804class XAttrUnavailableError(Exception):
805 pass
806
807
c5a59d93 808def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
e5e78797
S
809 # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
810 # expected HTTP responses to meet HTTP/1.0 or later (see also
811 # https://github.com/rg3/youtube-dl/issues/6727)
812 if sys.version_info < (3, 0):
5a1a2e94 813 kwargs[b'strict'] = True
be4a824d
PH
814 hc = http_class(*args, **kwargs)
815 source_address = ydl_handler._params.get('source_address')
816 if source_address is not None:
817 sa = (source_address, 0)
818 if hasattr(hc, 'source_address'): # Python 2.7+
819 hc.source_address = sa
820 else: # Python 2.6
821 def _hc_connect(self, *args, **kwargs):
822 sock = compat_socket_create_connection(
823 (self.host, self.port), self.timeout, sa)
824 if is_https:
d7932313
PH
825 self.sock = ssl.wrap_socket(
826 sock, self.key_file, self.cert_file,
827 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
828 else:
829 self.sock = sock
830 hc.connect = functools.partial(_hc_connect, hc)
831
832 return hc
833
834
87f0e62d 835def handle_youtubedl_headers(headers):
992fc9d6
YCH
836 filtered_headers = headers
837
838 if 'Youtubedl-no-compression' in filtered_headers:
839 filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
87f0e62d 840 del filtered_headers['Youtubedl-no-compression']
87f0e62d 841
992fc9d6 842 return filtered_headers
87f0e62d
YCH
843
844
acebc9cd 845class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
59ae15a5
PH
846 """Handler for HTTP requests and responses.
847
848 This class, when installed with an OpenerDirector, automatically adds
849 the standard headers to every HTTP request and handles gzipped and
850 deflated responses from web servers. If compression is to be avoided in
851 a particular request, the original request in the program code only has
0424ec30 852 to include the HTTP header "Youtubedl-no-compression", which will be
59ae15a5
PH
853 removed before making the real request.
854
855 Part of this code was copied from:
856
857 http://techknack.net/python-urllib2-handlers/
858
859 Andrew Rowls, the author of that code, agreed to release it to the
860 public domain.
861 """
862
be4a824d
PH
863 def __init__(self, params, *args, **kwargs):
864 compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
865 self._params = params
866
867 def http_open(self, req):
71aff188
YCH
868 conn_class = compat_http_client.HTTPConnection
869
870 socks_proxy = req.headers.get('Ytdl-socks-proxy')
871 if socks_proxy:
872 conn_class = make_socks_conn_class(conn_class, socks_proxy)
873 del req.headers['Ytdl-socks-proxy']
874
be4a824d 875 return self.do_open(functools.partial(
71aff188 876 _create_http_connection, self, conn_class, False),
be4a824d
PH
877 req)
878
59ae15a5
PH
879 @staticmethod
880 def deflate(data):
881 try:
882 return zlib.decompress(data, -zlib.MAX_WBITS)
883 except zlib.error:
884 return zlib.decompress(data)
885
886 @staticmethod
887 def addinfourl_wrapper(stream, headers, url, code):
888 if hasattr(compat_urllib_request.addinfourl, 'getcode'):
889 return compat_urllib_request.addinfourl(stream, headers, url, code)
890 ret = compat_urllib_request.addinfourl(stream, headers, url)
891 ret.code = code
892 return ret
893
acebc9cd 894 def http_request(self, req):
51f267d9
S
895 # According to RFC 3986, URLs cannot contain non-ASCII characters; however, this is not
896 # always respected by websites: some tend to give out URLs with non-percent-encoded
897 # non-ASCII characters (see telemb.py, ard.py [#3412])
898 # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
899 # To work around aforementioned issue we will replace request's original URL with
900 # percent-encoded one
901 # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
902 # the code of this workaround has been moved here from YoutubeDL.urlopen()
903 url = req.get_full_url()
904 url_escaped = escape_url(url)
905
906 # Substitute the URL if it changed after escaping
907 if url != url_escaped:
15d260eb 908 req = update_Request(req, url=url_escaped)
51f267d9 909
33ac271b 910 for h, v in std_headers.items():
3d5f7a39
JK
911 # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
912 # urllib capitalizes the dict keys because of this bug
913 if h.capitalize() not in req.headers:
33ac271b 914 req.add_header(h, v)
87f0e62d
YCH
915
916 req.headers = handle_youtubedl_headers(req.headers)
989b4b2b
PH
917
918 if sys.version_info < (2, 7) and '#' in req.get_full_url():
919 # Python 2.6 is brain-dead when it comes to fragments
920 req._Request__original = req._Request__original.partition('#')[0]
921 req._Request__r_type = req._Request__r_type.partition('#')[0]
922
59ae15a5
PH
923 return req
924
acebc9cd 925 def http_response(self, req, resp):
59ae15a5
PH
926 old_resp = resp
927 # gzip
928 if resp.headers.get('Content-encoding', '') == 'gzip':
aa3e9507
PH
929 content = resp.read()
930 gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
931 try:
932 uncompressed = io.BytesIO(gz.read())
933 except IOError as original_ioerror:
934 # There may be junk at the end of the file
935 # See http://stackoverflow.com/q/4928560/35070 for details
936 for i in range(1, 1024):
937 try:
938 gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
939 uncompressed = io.BytesIO(gz.read())
940 except IOError:
941 continue
942 break
943 else:
944 raise original_ioerror
945 resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
59ae15a5 946 resp.msg = old_resp.msg
c047270c 947 del resp.headers['Content-encoding']
59ae15a5
PH
948 # deflate
949 if resp.headers.get('Content-encoding', '') == 'deflate':
950 gz = io.BytesIO(self.deflate(resp.read()))
951 resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
952 resp.msg = old_resp.msg
c047270c 953 del resp.headers['Content-encoding']
ad729172
S
954 # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
955 # https://github.com/rg3/youtube-dl/issues/6457).
5a4d9ddb
S
956 if 300 <= resp.code < 400:
957 location = resp.headers.get('Location')
958 if location:
959 # Per RFC 2616 the default charset is iso-8859-1, which is respected by Python 3
960 if sys.version_info >= (3, 0):
961 location = location.encode('iso-8859-1').decode('utf-8')
0ea59007
YCH
962 else:
963 location = location.decode('utf-8')
5a4d9ddb
S
964 location_escaped = escape_url(location)
965 if location != location_escaped:
966 del resp.headers['Location']
9a4aec8b
YCH
967 if sys.version_info < (3, 0):
968 location_escaped = location_escaped.encode('utf-8')
5a4d9ddb 969 resp.headers['Location'] = location_escaped
59ae15a5 970 return resp
0f8d03f8 971
acebc9cd
PH
972 https_request = http_request
973 https_response = http_response
bf50b038 974
5de90176 975
71aff188
YCH
976def make_socks_conn_class(base_class, socks_proxy):
977 assert issubclass(base_class, (
978 compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))
979
980 url_components = compat_urlparse.urlparse(socks_proxy)
981 if url_components.scheme.lower() == 'socks5':
982 socks_type = ProxyType.SOCKS5
983 elif url_components.scheme.lower() in ('socks', 'socks4'):
984 socks_type = ProxyType.SOCKS4
51fb4995
YCH
985 elif url_components.scheme.lower() == 'socks4a':
986 socks_type = ProxyType.SOCKS4A
71aff188 987
cdd94c2e
YCH
988 def unquote_if_non_empty(s):
989 if not s:
990 return s
991 return compat_urllib_parse_unquote_plus(s)
992
71aff188
YCH
993 proxy_args = (
994 socks_type,
995 url_components.hostname, url_components.port or 1080,
996 True, # Remote DNS
cdd94c2e
YCH
997 unquote_if_non_empty(url_components.username),
998 unquote_if_non_empty(url_components.password),
71aff188
YCH
999 )
1000
1001 class SocksConnection(base_class):
1002 def connect(self):
1003 self.sock = sockssocket()
1004 self.sock.setproxy(*proxy_args)
1005 if type(self.timeout) in (int, float):
1006 self.sock.settimeout(self.timeout)
1007 self.sock.connect((self.host, self.port))
1008
1009 if isinstance(self, compat_http_client.HTTPSConnection):
1010 if hasattr(self, '_context'): # Python > 2.6
1011 self.sock = self._context.wrap_socket(
1012 self.sock, server_hostname=self.host)
1013 else:
1014 self.sock = ssl.wrap_socket(self.sock)
1015
1016 return SocksConnection
1017
1018
be4a824d
PH
1019class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
1020 def __init__(self, params, https_conn_class=None, *args, **kwargs):
1021 compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
1022 self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
1023 self._params = params
1024
1025 def https_open(self, req):
4f264c02 1026 kwargs = {}
71aff188
YCH
1027 conn_class = self._https_conn_class
1028
4f264c02
JMF
1029 if hasattr(self, '_context'): # python > 2.6
1030 kwargs['context'] = self._context
1031 if hasattr(self, '_check_hostname'): # python 3.x
1032 kwargs['check_hostname'] = self._check_hostname
71aff188
YCH
1033
1034 socks_proxy = req.headers.get('Ytdl-socks-proxy')
1035 if socks_proxy:
1036 conn_class = make_socks_conn_class(conn_class, socks_proxy)
1037 del req.headers['Ytdl-socks-proxy']
1038
be4a824d 1039 return self.do_open(functools.partial(
71aff188 1040 _create_http_connection, self, conn_class, True),
4f264c02 1041 req, **kwargs)
be4a824d
PH
1042
1043
a6420bf5
S
1044class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
1045 def __init__(self, cookiejar=None):
1046 compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)
1047
1048 def http_response(self, request, response):
1049 # Python 2 will choke on next HTTP request in row if there are non-ASCII
1050 # characters in Set-Cookie HTTP header of last response (see
1051 # https://github.com/rg3/youtube-dl/issues/6769).
1052 # In order to at least prevent crashing we will percent encode Set-Cookie
1053 # header before HTTPCookieProcessor starts processing it.
e28034c5
S
1054 # if sys.version_info < (3, 0) and response.headers:
1055 # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
1056 # set_cookie = response.headers.get(set_cookie_header)
1057 # if set_cookie:
1058 # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
1059 # if set_cookie != set_cookie_escaped:
1060 # del response.headers[set_cookie_header]
1061 # response.headers[set_cookie_header] = set_cookie_escaped
a6420bf5
S
1062 return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)
1063
1064 https_request = compat_urllib_request.HTTPCookieProcessor.http_request
1065 https_response = http_response
1066
1067
46f59e89
S
1068def extract_timezone(date_str):
1069 m = re.search(
1070 r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
1071 date_str)
1072 if not m:
1073 timezone = datetime.timedelta()
1074 else:
1075 date_str = date_str[:-len(m.group('tz'))]
1076 if not m.group('sign'):
1077 timezone = datetime.timedelta()
1078 else:
1079 sign = 1 if m.group('sign') == '+' else -1
1080 timezone = datetime.timedelta(
1081 hours=sign * int(m.group('hours')),
1082 minutes=sign * int(m.group('minutes')))
1083 return timezone, date_str
1084
1085
08b38d54 1086def parse_iso8601(date_str, delimiter='T', timezone=None):
912b38b4
PH
1087 """ Return a UNIX timestamp from the given date """
1088
1089 if date_str is None:
1090 return None
1091
52c3a6e4
S
1092 date_str = re.sub(r'\.[0-9]+', '', date_str)
1093
08b38d54 1094 if timezone is None:
46f59e89
S
1095 timezone, date_str = extract_timezone(date_str)
1096
52c3a6e4
S
1097 try:
1098 date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
1099 dt = datetime.datetime.strptime(date_str, date_format) - timezone
1100 return calendar.timegm(dt.timetuple())
1101 except ValueError:
1102 pass
912b38b4
PH
1103
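# Illustrative behaviour (editor's sketch; both inputs denote the same instant):
#     parse_iso8601('2014-03-23T23:04:26+0100') -> 1395612266
#     parse_iso8601('2014-03-23T22:04:26Z')     -> 1395612266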
1104
46f59e89
S
1105def date_formats(day_first=True):
1106 return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST
1107
1108
42bdd9d0 1109def unified_strdate(date_str, day_first=True):
bf50b038 1110 """Return a string with the date in the format YYYYMMDD"""
64e7ad60
PH
1111
1112 if date_str is None:
1113 return None
bf50b038 1114 upload_date = None
5f6a1245 1115 # Replace commas
026fcc04 1116 date_str = date_str.replace(',', ' ')
42bdd9d0 1117 # Remove AM/PM + timezone
9bb8e0a3 1118 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
46f59e89 1119 _, date_str = extract_timezone(date_str)
42bdd9d0 1120
46f59e89 1121 for expression in date_formats(day_first):
bf50b038
JMF
1122 try:
1123 upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
5de90176 1124 except ValueError:
bf50b038 1125 pass
42393ce2
PH
1126 if upload_date is None:
1127 timetuple = email.utils.parsedate_tz(date_str)
1128 if timetuple:
c6b9cf05
S
1129 try:
1130 upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
1131 except ValueError:
1132 pass
6a750402
JMF
1133 if upload_date is not None:
1134 return compat_str(upload_date)
bf50b038 1135
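# Illustrative behaviour (editor's sketch):
#     unified_strdate('December 21, 2010')         -> '20101221'
#     unified_strdate('8/7/2009')                  -> '20090708'  (day-first by default)
#     unified_strdate('8/7/2009', day_first=False) -> '20090807'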
5f6a1245 1136
46f59e89
S
1137def unified_timestamp(date_str, day_first=True):
1138 if date_str is None:
1139 return None
1140
1141 date_str = date_str.replace(',', ' ')
1142
7dc2a74e 1143 pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
46f59e89
S
1144 timezone, date_str = extract_timezone(date_str)
1145
1146 # Remove AM/PM + timezone
1147 date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
1148
1149 for expression in date_formats(day_first):
1150 try:
7dc2a74e 1151 dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
46f59e89
S
1152 return calendar.timegm(dt.timetuple())
1153 except ValueError:
1154 pass
1155 timetuple = email.utils.parsedate_tz(date_str)
1156 if timetuple:
7dc2a74e 1157 return calendar.timegm(timetuple) + pm_delta * 3600
46f59e89
S
1158
1159
28e614de 1160def determine_ext(url, default_ext='unknown_video'):
f4776371
S
1161 if url is None:
1162 return default_ext
9cb9a5df 1163 guess = url.partition('?')[0].rpartition('.')[2]
73e79f2a
PH
1164 if re.match(r'^[A-Za-z0-9]+$', guess):
1165 return guess
a7aaa398
S
1166 # Try to extract ext from URLs like http://example.com/foo/bar.mp4/?download
1167 elif guess.rstrip('/') in KNOWN_EXTENSIONS:
9cb9a5df 1168 return guess.rstrip('/')
73e79f2a 1169 else:
cbdbb766 1170 return default_ext
73e79f2a 1171
5f6a1245 1172
d4051a8e 1173def subtitles_filename(filename, sub_lang, sub_format):
28e614de 1174 return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format
d4051a8e 1175
5f6a1245 1176
bd558525 1177def date_from_str(date_str):
37254abc
JMF
1178 """
1179 Return a datetime object from a string in the format YYYYMMDD or
1180 (now|today)[+-][0-9](day|week|month|year)(s)?"""
1181 today = datetime.date.today()
f8795e10 1182 if date_str in ('now', 'today'):
37254abc 1183 return today
f8795e10
PH
1184 if date_str == 'yesterday':
1185 return today - datetime.timedelta(days=1)
ec85ded8 1186 match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
37254abc
JMF
1187 if match is not None:
1188 sign = match.group('sign')
1189 time = int(match.group('time'))
1190 if sign == '-':
1191 time = -time
1192 unit = match.group('unit')
dfb1b146 1193 # A bad approximation?
37254abc
JMF
1194 if unit == 'month':
1195 unit = 'day'
1196 time *= 30
1197 elif unit == 'year':
1198 unit = 'day'
1199 time *= 365
1200 unit += 's'
1201 delta = datetime.timedelta(**{unit: time})
1202 return today + delta
611c1dd9 1203 return datetime.datetime.strptime(date_str, '%Y%m%d').date()
5f6a1245
JW
1204
1205
e63fc1be 1206def hyphenate_date(date_str):
1207 """
1208 Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
1209 match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
1210 if match is not None:
1211 return '-'.join(match.groups())
1212 else:
1213 return date_str
1214
5f6a1245 1215
bd558525
JMF
1216class DateRange(object):
1217 """Represents a time interval between two dates"""
5f6a1245 1218
bd558525
JMF
1219 def __init__(self, start=None, end=None):
1220 """start and end must be strings in the format accepted by date"""
1221 if start is not None:
1222 self.start = date_from_str(start)
1223 else:
1224 self.start = datetime.datetime.min.date()
1225 if end is not None:
1226 self.end = date_from_str(end)
1227 else:
1228 self.end = datetime.datetime.max.date()
37254abc 1229 if self.start > self.end:
bd558525 1230 raise ValueError('Date range: "%s" , the start date must be before the end date' % self)
5f6a1245 1231
bd558525
JMF
1232 @classmethod
1233 def day(cls, day):
1234 """Returns a range that only contains the given day"""
5f6a1245
JW
1235 return cls(day, day)
1236
bd558525
JMF
1237 def __contains__(self, date):
1238 """Check if the date is in the range"""
37254abc
JMF
1239 if not isinstance(date, datetime.date):
1240 date = date_from_str(date)
1241 return self.start <= date <= self.end
5f6a1245 1242
bd558525 1243 def __str__(self):
5f6a1245 1244 return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
1245
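# Illustrative usage (editor's sketch):
#     '20150115' in DateRange('20150101', '20150131') -> True
#     '20150201' in DateRange('20150101', '20150131') -> False
#     DateRange.day('20150101')  # range containing only that single day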
1246
1247def platform_name():
1248 """ Returns the platform name as a compat_str """
1249 res = platform.platform()
1250 if isinstance(res, bytes):
1251 res = res.decode(preferredencoding())
1252
1253 assert isinstance(res, compat_str)
1254 return res
c257baff
PH
1255
1256
b58ddb32
PH
1257def _windows_write_string(s, out):
1258 """ Returns True if the string was written using special methods,
1259 False if it has yet to be written out."""
1260 # Adapted from http://stackoverflow.com/a/3259271/35070
1261
1262 import ctypes
1263 import ctypes.wintypes
1264
1265 WIN_OUTPUT_IDS = {
1266 1: -11,
1267 2: -12,
1268 }
1269
a383a98a
PH
1270 try:
1271 fileno = out.fileno()
1272 except AttributeError:
1273 # If the output stream doesn't have a fileno, it's virtual
1274 return False
aa42e873
PH
1275 except io.UnsupportedOperation:
1276 # Some strange Windows pseudo files?
1277 return False
b58ddb32
PH
1278 if fileno not in WIN_OUTPUT_IDS:
1279 return False
1280
e2f89ec7 1281 GetStdHandle = ctypes.WINFUNCTYPE(
b58ddb32 1282 ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
611c1dd9 1283 (b'GetStdHandle', ctypes.windll.kernel32))
b58ddb32
PH
1284 h = GetStdHandle(WIN_OUTPUT_IDS[fileno])
1285
e2f89ec7 1286 WriteConsoleW = ctypes.WINFUNCTYPE(
b58ddb32
PH
1287 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
1288 ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
611c1dd9 1289 ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
b58ddb32
PH
1290 written = ctypes.wintypes.DWORD(0)
1291
611c1dd9 1292 GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
b58ddb32
PH
1293 FILE_TYPE_CHAR = 0x0002
1294 FILE_TYPE_REMOTE = 0x8000
e2f89ec7 1295 GetConsoleMode = ctypes.WINFUNCTYPE(
b58ddb32
PH
1296 ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
1297 ctypes.POINTER(ctypes.wintypes.DWORD))(
611c1dd9 1298 (b'GetConsoleMode', ctypes.windll.kernel32))
b58ddb32
PH
1299 INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value
1300
1301 def not_a_console(handle):
1302 if handle == INVALID_HANDLE_VALUE or handle is None:
1303 return True
8fb3ac36
PH
1304 return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
1305 GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)
b58ddb32
PH
1306
1307 if not_a_console(h):
1308 return False
1309
d1b9c912
PH
1310 def next_nonbmp_pos(s):
1311 try:
1312 return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
1313 except StopIteration:
1314 return len(s)
1315
1316 while s:
1317 count = min(next_nonbmp_pos(s), 1024)
1318
b58ddb32 1319 ret = WriteConsoleW(
d1b9c912 1320 h, s, count if count else 2, ctypes.byref(written), None)
b58ddb32
PH
1321 if ret == 0:
1322 raise OSError('Failed to write string')
d1b9c912
PH
1323 if not count: # We just wrote a non-BMP character
1324 assert written.value == 2
1325 s = s[1:]
1326 else:
1327 assert written.value > 0
1328 s = s[written.value:]
b58ddb32
PH
1329 return True
1330
1331
734f90bb 1332def write_string(s, out=None, encoding=None):
7459e3a2
PH
1333 if out is None:
1334 out = sys.stderr
8bf48f23 1335 assert type(s) == compat_str
7459e3a2 1336
b58ddb32
PH
1337 if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
1338 if _windows_write_string(s, out):
1339 return
1340
7459e3a2
PH
1341 if ('b' in getattr(out, 'mode', '') or
1342 sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr
104aa738
PH
1343 byt = s.encode(encoding or preferredencoding(), 'ignore')
1344 out.write(byt)
1345 elif hasattr(out, 'buffer'):
1346 enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
1347 byt = s.encode(enc, 'ignore')
1348 out.buffer.write(byt)
1349 else:
8bf48f23 1350 out.write(s)
7459e3a2
PH
1351 out.flush()
1352
1353
48ea9cea
PH
1354def bytes_to_intlist(bs):
1355 if not bs:
1356 return []
1357 if isinstance(bs[0], int): # Python 3
1358 return list(bs)
1359 else:
1360 return [ord(c) for c in bs]
1361
c257baff 1362
cba892fa 1363def intlist_to_bytes(xs):
1364 if not xs:
1365 return b''
edaa23f8 1366 return compat_struct_pack('%dB' % len(xs), *xs)
c38b1e77
PH
1367
1368
c1c9a79c
PH
1369# Cross-platform file locking
1370if sys.platform == 'win32':
1371 import ctypes.wintypes
1372 import msvcrt
1373
1374 class OVERLAPPED(ctypes.Structure):
1375 _fields_ = [
1376 ('Internal', ctypes.wintypes.LPVOID),
1377 ('InternalHigh', ctypes.wintypes.LPVOID),
1378 ('Offset', ctypes.wintypes.DWORD),
1379 ('OffsetHigh', ctypes.wintypes.DWORD),
1380 ('hEvent', ctypes.wintypes.HANDLE),
1381 ]
1382
1383 kernel32 = ctypes.windll.kernel32
1384 LockFileEx = kernel32.LockFileEx
1385 LockFileEx.argtypes = [
1386 ctypes.wintypes.HANDLE, # hFile
1387 ctypes.wintypes.DWORD, # dwFlags
1388 ctypes.wintypes.DWORD, # dwReserved
1389 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1390 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1391 ctypes.POINTER(OVERLAPPED) # Overlapped
1392 ]
1393 LockFileEx.restype = ctypes.wintypes.BOOL
1394 UnlockFileEx = kernel32.UnlockFileEx
1395 UnlockFileEx.argtypes = [
1396 ctypes.wintypes.HANDLE, # hFile
1397 ctypes.wintypes.DWORD, # dwReserved
1398 ctypes.wintypes.DWORD, # nNumberOfBytesToLockLow
1399 ctypes.wintypes.DWORD, # nNumberOfBytesToLockHigh
1400 ctypes.POINTER(OVERLAPPED) # Overlapped
1401 ]
1402 UnlockFileEx.restype = ctypes.wintypes.BOOL
1403 whole_low = 0xffffffff
1404 whole_high = 0x7fffffff
1405
1406 def _lock_file(f, exclusive):
1407 overlapped = OVERLAPPED()
1408 overlapped.Offset = 0
1409 overlapped.OffsetHigh = 0
1410 overlapped.hEvent = 0
1411 f._lock_file_overlapped_p = ctypes.pointer(overlapped)
1412 handle = msvcrt.get_osfhandle(f.fileno())
1413 if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
1414 whole_low, whole_high, f._lock_file_overlapped_p):
1415 raise OSError('Locking file failed: %r' % ctypes.FormatError())
1416
1417 def _unlock_file(f):
1418 assert f._lock_file_overlapped_p
1419 handle = msvcrt.get_osfhandle(f.fileno())
1420 if not UnlockFileEx(handle, 0,
1421 whole_low, whole_high, f._lock_file_overlapped_p):
1422 raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
1423
1424else:
399a76e6
YCH
1425 # Some platforms, such as Jython, are missing fcntl
1426 try:
1427 import fcntl
c1c9a79c 1428
399a76e6
YCH
1429 def _lock_file(f, exclusive):
1430 fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)
c1c9a79c 1431
399a76e6
YCH
1432 def _unlock_file(f):
1433 fcntl.flock(f, fcntl.LOCK_UN)
1434 except ImportError:
1435 UNSUPPORTED_MSG = 'file locking is not supported on this platform'
1436
1437 def _lock_file(f, exclusive):
1438 raise IOError(UNSUPPORTED_MSG)
1439
1440 def _unlock_file(f):
1441 raise IOError(UNSUPPORTED_MSG)
c1c9a79c
PH
1442
1443
1444class locked_file(object):
1445 def __init__(self, filename, mode, encoding=None):
1446 assert mode in ['r', 'a', 'w']
1447 self.f = io.open(filename, mode, encoding=encoding)
1448 self.mode = mode
1449
1450 def __enter__(self):
1451 exclusive = self.mode != 'r'
1452 try:
1453 _lock_file(self.f, exclusive)
1454 except IOError:
1455 self.f.close()
1456 raise
1457 return self
1458
1459 def __exit__(self, etype, value, traceback):
1460 try:
1461 _unlock_file(self.f)
1462 finally:
1463 self.f.close()
1464
1465 def __iter__(self):
1466 return iter(self.f)
1467
1468 def write(self, *args):
1469 return self.f.write(*args)
1470
1471 def read(self, *args):
1472 return self.f.read(*args)
4eb7f1d1
JMF
1473
1474
4644ac55
S
1475def get_filesystem_encoding():
1476 encoding = sys.getfilesystemencoding()
1477 return encoding if encoding is not None else 'utf-8'
1478
1479
4eb7f1d1 1480def shell_quote(args):
a6a173c2 1481 quoted_args = []
4644ac55 1482 encoding = get_filesystem_encoding()
a6a173c2
JMF
1483 for a in args:
1484 if isinstance(a, bytes):
1485 # We may get a filename encoded with 'encodeFilename'
1486 a = a.decode(encoding)
1487 quoted_args.append(pipes.quote(a))
28e614de 1488 return ' '.join(quoted_args)
9d4660ca
PH
1489
1490
1491def smuggle_url(url, data):
1492 """ Pass additional data in a URL for internal use. """
1493
81953d1a
RA
1494 url, idata = unsmuggle_url(url, {})
1495 data.update(idata)
15707c7e 1496 sdata = compat_urllib_parse_urlencode(
28e614de
PH
1497 {'__youtubedl_smuggle': json.dumps(data)})
1498 return url + '#' + sdata
9d4660ca
PH
1499
1500
79f82953 1501def unsmuggle_url(smug_url, default=None):
83e865a3 1502 if '#__youtubedl_smuggle' not in smug_url:
79f82953 1503 return smug_url, default
28e614de
PH
1504 url, _, sdata = smug_url.rpartition('#')
1505 jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
9d4660ca
PH
1506 data = json.loads(jsond)
1507 return url, data
02dbf93f
PH
1508
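# Illustrative usage (editor's sketch; the URL and payload are hypothetical):
#     url = smuggle_url('http://example.com/v', {'f': 'dash'})
#     # url is now 'http://example.com/v#__youtubedl_smuggle=...'
#     unsmuggle_url(url)                     -> ('http://example.com/v', {'f': 'dash'})
#     unsmuggle_url('http://example.com/v')  -> ('http://example.com/v', None)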
1509
02dbf93f
PH
1510def format_bytes(bytes):
1511 if bytes is None:
28e614de 1512 return 'N/A'
02dbf93f
PH
1513 if type(bytes) is str:
1514 bytes = float(bytes)
1515 if bytes == 0.0:
1516 exponent = 0
1517 else:
1518 exponent = int(math.log(bytes, 1024.0))
28e614de 1519 suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
02dbf93f 1520 converted = float(bytes) / float(1024 ** exponent)
28e614de 1521 return '%.2f%s' % (converted, suffix)
f53c966a 1522
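# Illustrative behaviour (editor's sketch):
#     format_bytes(1024)    -> '1.00KiB'
#     format_bytes(1536)    -> '1.50KiB'
#     format_bytes(10 ** 9) -> '953.67MiB'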
1c088fa8 1523
fb47597b
S
1524def lookup_unit_table(unit_table, s):
1525 units_re = '|'.join(re.escape(u) for u in unit_table)
1526 m = re.match(
782b1b5b 1527 r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
fb47597b
S
1528 if not m:
1529 return None
1530 num_str = m.group('num').replace(',', '.')
1531 mult = unit_table[m.group('unit')]
1532 return int(float(num_str) * mult)
1533
1534
be64b5b0
PH
1535def parse_filesize(s):
1536 if s is None:
1537 return None
1538
dfb1b146 1539 # The lower-case forms are of course incorrect and unofficial,
be64b5b0
PH
1540 # but we support those too
1541 _UNIT_TABLE = {
1542 'B': 1,
1543 'b': 1,
70852b47 1544 'bytes': 1,
be64b5b0
PH
1545 'KiB': 1024,
1546 'KB': 1000,
1547 'kB': 1024,
1548 'Kb': 1000,
13585d76 1549 'kb': 1000,
70852b47
YCH
1550 'kilobytes': 1000,
1551 'kibibytes': 1024,
be64b5b0
PH
1552 'MiB': 1024 ** 2,
1553 'MB': 1000 ** 2,
1554 'mB': 1024 ** 2,
1555 'Mb': 1000 ** 2,
13585d76 1556 'mb': 1000 ** 2,
70852b47
YCH
1557 'megabytes': 1000 ** 2,
1558 'mebibytes': 1024 ** 2,
be64b5b0
PH
1559 'GiB': 1024 ** 3,
1560 'GB': 1000 ** 3,
1561 'gB': 1024 ** 3,
1562 'Gb': 1000 ** 3,
13585d76 1563 'gb': 1000 ** 3,
70852b47
YCH
1564 'gigabytes': 1000 ** 3,
1565 'gibibytes': 1024 ** 3,
be64b5b0
PH
1566 'TiB': 1024 ** 4,
1567 'TB': 1000 ** 4,
1568 'tB': 1024 ** 4,
1569 'Tb': 1000 ** 4,
13585d76 1570 'tb': 1000 ** 4,
70852b47
YCH
1571 'terabytes': 1000 ** 4,
1572 'tebibytes': 1024 ** 4,
be64b5b0
PH
1573 'PiB': 1024 ** 5,
1574 'PB': 1000 ** 5,
1575 'pB': 1024 ** 5,
1576 'Pb': 1000 ** 5,
13585d76 1577 'pb': 1000 ** 5,
70852b47
YCH
1578 'petabytes': 1000 ** 5,
1579 'pebibytes': 1024 ** 5,
be64b5b0
PH
1580 'EiB': 1024 ** 6,
1581 'EB': 1000 ** 6,
1582 'eB': 1024 ** 6,
1583 'Eb': 1000 ** 6,
13585d76 1584 'eb': 1000 ** 6,
70852b47
YCH
1585 'exabytes': 1000 ** 6,
1586 'exbibytes': 1024 ** 6,
be64b5b0
PH
1587 'ZiB': 1024 ** 7,
1588 'ZB': 1000 ** 7,
1589 'zB': 1024 ** 7,
1590 'Zb': 1000 ** 7,
13585d76 1591 'zb': 1000 ** 7,
70852b47
YCH
1592 'zettabytes': 1000 ** 7,
1593 'zebibytes': 1024 ** 7,
be64b5b0
PH
1594 'YiB': 1024 ** 8,
1595 'YB': 1000 ** 8,
1596 'yB': 1024 ** 8,
1597 'Yb': 1000 ** 8,
13585d76 1598 'yb': 1000 ** 8,
70852b47
YCH
1599 'yottabytes': 1000 ** 8,
1600 'yobibytes': 1024 ** 8,
be64b5b0
PH
1601 }
1602
fb47597b
S
1603 return lookup_unit_table(_UNIT_TABLE, s)
1604
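# Illustrative behaviour (editor's sketch):
#     parse_filesize('5 GiB')   -> 5368709120
#     parse_filesize('1,24 KB') -> 1240
#     parse_filesize('1.2Tb')   -> 1200000000000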
1605
1606def parse_count(s):
1607 if s is None:
be64b5b0
PH
1608 return None
1609
fb47597b
S
1610 s = s.strip()
1611
1612 if re.match(r'^[\d,.]+$', s):
1613 return str_to_int(s)
1614
1615 _UNIT_TABLE = {
1616 'k': 1000,
1617 'K': 1000,
1618 'm': 1000 ** 2,
1619 'M': 1000 ** 2,
1620 'kk': 1000 ** 2,
1621 'KK': 1000 ** 2,
1622 }
be64b5b0 1623
fb47597b 1624 return lookup_unit_table(_UNIT_TABLE, s)
be64b5b0 1625
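# Illustrative behaviour (editor's sketch):
#     parse_count('123,456') -> 123456
#     parse_count('1.8M')    -> 1800000
#     parse_count('100k')    -> 100000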
2f7ae819 1626
a942d6cb 1627def month_by_name(name, lang='en'):
caefb1de
PH
1628 """ Return the number of a month by (locale-independently) English name """
1629
f6717dec 1630 month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])
a942d6cb 1631
caefb1de 1632 try:
f6717dec 1633 return month_names.index(name) + 1
7105440c
YCH
1634 except ValueError:
1635 return None
1636
1637
1638def month_by_abbreviation(abbrev):
1639 """ Return the number of a month by (locale-independently) English
1640 abbreviations """
1641
1642 try:
1643 return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
caefb1de
PH
1644 except ValueError:
1645 return None
18258362
JMF
1646
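# Illustrative behaviour (editor's sketch):
#     month_by_name('March')       -> 3
#     month_by_name('mars', 'fr')  -> 3
#     month_by_abbreviation('Sep') -> 9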
1647
5aafe895 1648def fix_xml_ampersands(xml_str):
18258362 1649 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
1650 return re.sub(
1651 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 1652 '&amp;',
5aafe895 1653 xml_str)
e3946f98
PH
1654
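# A minimal usage sketch (illustrative only, not part of the original
# module): only bare '&' are escaped, existing entities are left untouched.
def _fix_xml_ampersands_example():
    assert fix_xml_ampersands('<a>Fish & Chips</a>') == '<a>Fish &amp; Chips</a>'
    assert fix_xml_ampersands('already &amp; fine &#38;') == 'already &amp; fine &#38;'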
1655
1656def setproctitle(title):
8bf48f23 1657 assert isinstance(title, compat_str)
c1c05c67
YCH
1658
1659 # ctypes in Jython is not complete
1660 # http://bugs.jython.org/issue2148
1661 if sys.platform.startswith('java'):
1662 return
1663
e3946f98 1664 try:
611c1dd9 1665 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
1666 except OSError:
1667 return
6eefe533
PH
1668 title_bytes = title.encode('utf-8')
1669 buf = ctypes.create_string_buffer(len(title_bytes))
1670 buf.value = title_bytes
e3946f98 1671 try:
6eefe533 1672 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
1673 except AttributeError:
1674 return # Strange libc, just skip this
d7dda168
PH
1675
1676
1677def remove_start(s, start):
46bc9b7d 1678 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
1679
1680
2b9faf55 1681def remove_end(s, end):
46bc9b7d 1682 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
1683
1684
31b2051e
S
1685def remove_quotes(s):
1686 if s is None or len(s) < 2:
1687 return s
1688 for quote in ('"', "'", ):
1689 if s[0] == quote and s[-1] == quote:
1690 return s[1:-1]
1691 return s
1692
1693
29eb5174 1694def url_basename(url):
9b8aaeed 1695 path = compat_urlparse.urlparse(url).path
28e614de 1696 return path.strip('/').split('/')[-1]
aa94a6d3
PH
1697
1698
02dc0a36
S
1699def base_url(url):
1700 return re.match(r'https?://[^?#&]+/', url).group()
1701
1702
e34c3361
S
1703def urljoin(base, path):
1704 if not isinstance(path, compat_str) or not path:
1705 return None
b0c65c67 1706 if re.match(r'^(?:https?:)?//', path):
e34c3361 1707 return path
b0c65c67 1708 if not isinstance(base, compat_str) or not re.match(r'^(?:https?:)?//', base):
e34c3361
S
1709 return None
1710 return compat_urlparse.urljoin(base, path)
1711
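# A minimal usage sketch (illustrative only, not part of the original
# module) of the URL helpers above; urljoin() returns None instead of
# raising on unusable input.
def _url_helpers_example():
    assert url_basename('https://example.com/path/video.mp4?x=1') == 'video.mp4'
    assert base_url('https://example.com/a/b/c.html') == 'https://example.com/a/b/'
    assert urljoin('https://example.com/a/', 'b/c.mp4') == 'https://example.com/a/b/c.mp4'
    assert urljoin(None, 'b/c.mp4') is None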
1712
aa94a6d3
PH
1713class HEADRequest(compat_urllib_request.Request):
1714 def get_method(self):
611c1dd9 1715 return 'HEAD'
7217e148
PH
1716
1717
95cf60e8
S
1718class PUTRequest(compat_urllib_request.Request):
1719 def get_method(self):
1720 return 'PUT'
1721
1722
9732d77e 1723def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
28746fbd
PH
1724 if get_attr:
1725 if v is not None:
1726 v = getattr(v, get_attr, None)
9572013d
PH
1727 if v == '':
1728 v = None
1812afb7
S
1729 if v is None:
1730 return default
1731 try:
1732 return int(v) * invscale // scale
1733 except ValueError:
af98f8ff 1734 return default
9732d77e 1735
9572013d 1736
40a90862
JMF
1737def str_or_none(v, default=None):
1738 return default if v is None else compat_str(v)
1739
9732d77e
PH
1740
1741def str_to_int(int_str):
48d4681e 1742 """ A more relaxed version of int_or_none """
9732d77e
PH
1743 if int_str is None:
1744 return None
28e614de 1745 int_str = re.sub(r'[,\.\+]', '', int_str)
9732d77e 1746 return int(int_str)
608d11f5
PH
1747
1748
9732d77e 1749def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
1750 if v is None:
1751 return default
1752 try:
1753 return float(v) * invscale / scale
1754 except ValueError:
1755 return default
43f775e4
PH
1756
1757
b72b4431
S
1758def strip_or_none(v):
1759 return None if v is None else v.strip()
1760
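# A minimal usage sketch (illustrative only, not part of the original
# module): the coercion helpers above return a default instead of raising.
def _coercion_helpers_example():
    assert int_or_none('42') == 42
    assert int_or_none('', default=0) == 0
    assert int_or_none('1500', scale=1000) == 1
    assert str_to_int('1,234,567') == 1234567
    assert float_or_none('broken') is None
    assert strip_or_none('  x ') == 'x'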
1761
608d11f5 1762def parse_duration(s):
8f9312c3 1763 if not isinstance(s, compat_basestring):
608d11f5
PH
1764 return None
1765
ca7b3246
S
1766 s = s.strip()
1767
acaff495 1768 days, hours, mins, secs, ms = [None] * 5
1769 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
1770 if m:
1771 days, hours, mins, secs, ms = m.groups()
1772 else:
1773 m = re.match(
1774 r'''(?ix)(?:P?T)?
8f4b58d7 1775 (?:
acaff495 1776 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
8f4b58d7 1777 )?
acaff495 1778 (?:
1779 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1780 )?
1781 (?:
1782 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1783 )?
1784 (?:
1785 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1786 )?$''', s)
1787 if m:
1788 days, hours, mins, secs, ms = m.groups()
1789 else:
1790 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
1791 if m:
1792 hours, mins = m.groups()
1793 else:
1794 return None
1795
1796 duration = 0
1797 if secs:
1798 duration += float(secs)
1799 if mins:
1800 duration += float(mins) * 60
1801 if hours:
1802 duration += float(hours) * 60 * 60
1803 if days:
1804 duration += float(days) * 24 * 60 * 60
1805 if ms:
1806 duration += float(ms)
1807 return duration
91d7d0b3
JMF
1808
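# A minimal usage sketch (illustrative only, not part of the original
# module): parse_duration() accepts clock-style, unit-suffixed and
# ISO-8601-like strings and returns seconds.
def _parse_duration_example():
    assert parse_duration('1:02:03.5') == 3723.5
    assert parse_duration('3 min 30 s') == 210.0
    assert parse_duration('PT1H30M') == 5400.0
    assert parse_duration('soon') is None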
1809
e65e4c88 1810def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 1811 name, real_ext = os.path.splitext(filename)
e65e4c88
S
1812 return (
1813 '{0}.{1}{2}'.format(name, ext, real_ext)
1814 if not expected_real_ext or real_ext[1:] == expected_real_ext
1815 else '{0}.{1}'.format(filename, ext))
d70ad093
PH
1816
1817
b3ed15b7
S
1818def replace_extension(filename, ext, expected_real_ext=None):
1819 name, real_ext = os.path.splitext(filename)
1820 return '{0}.{1}'.format(
1821 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1822 ext)
1823
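# A minimal usage sketch (illustrative only, not part of the original
# module): expected_real_ext guards against renaming files whose actual
# extension is not the one anticipated.
def _extension_helpers_example():
    assert prepend_extension('video.mp4', 'temp') == 'video.temp.mp4'
    assert prepend_extension('video.f4m', 'temp', expected_real_ext='mp4') == 'video.f4m.temp'
    assert replace_extension('video.mp4', 'mkv') == 'video.mkv'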
1824
d70ad093
PH
1825def check_executable(exe, args=[]):
1826 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1827 args can be a list of arguments for a short output (like -version) """
1828 try:
1829 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1830 except OSError:
1831 return False
1832 return exe
b7ab0590
PH
1833
1834
95807118 1835def get_exe_version(exe, args=['--version'],
cae97f65 1836 version_re=None, unrecognized='present'):
95807118
PH
1837 """ Returns the version of the specified executable,
1838 or False if the executable is not present """
1839 try:
b64d04c1
YCH
1840 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1841 # SIGTTOU if youtube-dl is run in the background.
1842 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
cae97f65 1843 out, _ = subprocess.Popen(
54116803 1844 [encodeArgument(exe)] + args,
00ca7552 1845 stdin=subprocess.PIPE,
95807118
PH
1846 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1847 except OSError:
1848 return False
cae97f65
PH
1849 if isinstance(out, bytes): # Python 2.x
1850 out = out.decode('ascii', 'ignore')
1851 return detect_exe_version(out, version_re, unrecognized)
1852
1853
1854def detect_exe_version(output, version_re=None, unrecognized='present'):
1855 assert isinstance(output, compat_str)
1856 if version_re is None:
1857 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1858 m = re.search(version_re, output)
95807118
PH
1859 if m:
1860 return m.group(1)
1861 else:
1862 return unrecognized
1863
1864
b7ab0590 1865class PagedList(object):
dd26ced1
PH
1866 def __len__(self):
1867 # This is only useful for tests
1868 return len(self.getslice())
1869
9c44d242
PH
1870
1871class OnDemandPagedList(PagedList):
b95dc034 1872 def __init__(self, pagefunc, pagesize, use_cache=False):
9c44d242
PH
1873 self._pagefunc = pagefunc
1874 self._pagesize = pagesize
b95dc034
YCH
1875 self._use_cache = use_cache
1876 if use_cache:
1877 self._cache = {}
9c44d242 1878
b7ab0590
PH
1879 def getslice(self, start=0, end=None):
1880 res = []
1881 for pagenum in itertools.count(start // self._pagesize):
1882 firstid = pagenum * self._pagesize
1883 nextfirstid = pagenum * self._pagesize + self._pagesize
1884 if start >= nextfirstid:
1885 continue
1886
b95dc034
YCH
1887 page_results = None
1888 if self._use_cache:
1889 page_results = self._cache.get(pagenum)
1890 if page_results is None:
1891 page_results = list(self._pagefunc(pagenum))
1892 if self._use_cache:
1893 self._cache[pagenum] = page_results
b7ab0590
PH
1894
1895 startv = (
1896 start % self._pagesize
1897 if firstid <= start < nextfirstid
1898 else 0)
1899
1900 endv = (
1901 ((end - 1) % self._pagesize) + 1
1902 if (end is not None and firstid <= end <= nextfirstid)
1903 else None)
1904
1905 if startv != 0 or endv is not None:
1906 page_results = page_results[startv:endv]
1907 res.extend(page_results)
1908
1909 # A little optimization - if the current page is not "full", i.e. does
1910 # not contain page_size videos, then we can assume that this page
1911 # is the last one - there are no more ids on further pages -
1912 # so there is no need to query again.
1913 if len(page_results) + startv < self._pagesize:
1914 break
1915
1916 # If we got the whole page, but the next page is not interesting,
1917 # break out early as well
1918 if end == nextfirstid:
1919 break
1920 return res
81c2f20b
PH
1921
1922
9c44d242
PH
1923class InAdvancePagedList(PagedList):
1924 def __init__(self, pagefunc, pagecount, pagesize):
1925 self._pagefunc = pagefunc
1926 self._pagecount = pagecount
1927 self._pagesize = pagesize
1928
1929 def getslice(self, start=0, end=None):
1930 res = []
1931 start_page = start // self._pagesize
1932 end_page = (
1933 self._pagecount if end is None else (end // self._pagesize + 1))
1934 skip_elems = start - start_page * self._pagesize
1935 only_more = None if end is None else end - start
1936 for pagenum in range(start_page, end_page):
1937 page = list(self._pagefunc(pagenum))
1938 if skip_elems:
1939 page = page[skip_elems:]
1940 skip_elems = None
1941 if only_more is not None:
1942 if len(page) < only_more:
1943 only_more -= len(page)
1944 else:
1945 page = page[:only_more]
1946 res.extend(page)
1947 break
1948 res.extend(page)
1949 return res
1950
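# A minimal usage sketch (illustrative only, not part of the original
# module): pages are fetched lazily and only as far as the requested
# slice requires.
def _paged_list_example():
    pages = OnDemandPagedList(lambda n: range(n * 10, (n + 1) * 10), 10)
    assert pages.getslice(5, 15) == list(range(5, 15))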
1951
81c2f20b 1952def uppercase_escape(s):
676eb3f2 1953 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 1954 return re.sub(
a612753d 1955 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
1956 lambda m: unicode_escape(m.group(0))[0],
1957 s)
0fe2ff78
YCH
1958
1959
1960def lowercase_escape(s):
1961 unicode_escape = codecs.getdecoder('unicode_escape')
1962 return re.sub(
1963 r'\\u[0-9a-fA-F]{4}',
1964 lambda m: unicode_escape(m.group(0))[0],
1965 s)
b53466e1 1966
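# A minimal usage sketch (illustrative only, not part of the original
# module): these undo literal \uXXXX / \UXXXXXXXX escapes that appear in
# scraped JavaScript.
def _escape_helpers_example():
    assert lowercase_escape('it\\u0027s') == "it's"
    assert uppercase_escape('\\U0001F600') == '\U0001F600'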
d05cfe06
S
1967
1968def escape_rfc3986(s):
1969 """Escape non-ASCII characters as suggested by RFC 3986"""
8f9312c3 1970 if sys.version_info < (3, 0) and isinstance(s, compat_str):
d05cfe06 1971 s = s.encode('utf-8')
ecc0c5ee 1972 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1973
1974
1975def escape_url(url):
1976 """Escape URL as suggested by RFC 3986"""
1977 url_parsed = compat_urllib_parse_urlparse(url)
1978 return url_parsed._replace(
efbed08d 1979 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
1980 path=escape_rfc3986(url_parsed.path),
1981 params=escape_rfc3986(url_parsed.params),
1982 query=escape_rfc3986(url_parsed.query),
1983 fragment=escape_rfc3986(url_parsed.fragment)
1984 ).geturl()
1985
62e609ab
PH
1986
1987def read_batch_urls(batch_fd):
1988 def fixup(url):
1989 if not isinstance(url, compat_str):
1990 url = url.decode('utf-8', 'replace')
28e614de 1991 BOM_UTF8 = '\xef\xbb\xbf'
62e609ab
PH
1992 if url.startswith(BOM_UTF8):
1993 url = url[len(BOM_UTF8):]
1994 url = url.strip()
1995 if url.startswith(('#', ';', ']')):
1996 return False
1997 return url
1998
1999 with contextlib.closing(batch_fd) as fd:
2000 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
2001
2002
2003def urlencode_postdata(*args, **kargs):
15707c7e 2004 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
2005
2006
38f9ef31 2007def update_url_query(url, query):
cacd9966
YCH
2008 if not query:
2009 return url
38f9ef31 2010 parsed_url = compat_urlparse.urlparse(url)
2011 qs = compat_parse_qs(parsed_url.query)
2012 qs.update(query)
2013 return compat_urlparse.urlunparse(parsed_url._replace(
15707c7e 2014 query=compat_urllib_parse_urlencode(qs, True)))
16392824 2015
8e60dc75 2016
ed0291d1
S
2017def update_Request(req, url=None, data=None, headers={}, query={}):
2018 req_headers = req.headers.copy()
2019 req_headers.update(headers)
2020 req_data = data or req.data
2021 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
2022 req_get_method = req.get_method()
2023 if req_get_method == 'HEAD':
2024 req_type = HEADRequest
2025 elif req_get_method == 'PUT':
2026 req_type = PUTRequest
2027 else:
2028 req_type = compat_urllib_request.Request
ed0291d1
S
2029 new_req = req_type(
2030 req_url, data=req_data, headers=req_headers,
2031 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2032 if hasattr(req, 'timeout'):
2033 new_req.timeout = req.timeout
2034 return new_req
2035
2036
86296ad2 2037def dict_get(d, key_or_keys, default=None, skip_false_values=True):
cbecc9b9
S
2038 if isinstance(key_or_keys, (list, tuple)):
2039 for key in key_or_keys:
86296ad2
S
2040 if key not in d or d[key] is None or skip_false_values and not d[key]:
2041 continue
2042 return d[key]
cbecc9b9
S
2043 return default
2044 return d.get(key_or_keys, default)
2045
2046
329ca3be
S
2047def try_get(src, getter, expected_type=None):
2048 try:
2049 v = getter(src)
2050 except (AttributeError, KeyError, TypeError, IndexError):
2051 pass
2052 else:
2053 if expected_type is None or isinstance(v, expected_type):
2054 return v
2055
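# A minimal usage sketch (illustrative only, not part of the original
# module): try_get() swallows the usual lookup errors and dict_get()
# skips falsy values by default.
def _safe_lookup_example():
    meta = {'title': '', 'fulltitle': 'Example', 'formats': [{'url': 'http://e.com/v.mp4'}]}
    assert dict_get(meta, ('title', 'fulltitle')) == 'Example'
    assert try_get(meta, lambda x: x['formats'][0]['url'], compat_str) == 'http://e.com/v.mp4'
    assert try_get(meta, lambda x: x['formats'][5], dict) is None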
2056
8e60dc75
S
2057def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2058 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2059
16392824 2060
a1a530b0
PH
2061US_RATINGS = {
2062 'G': 0,
2063 'PG': 10,
2064 'PG-13': 13,
2065 'R': 16,
2066 'NC': 18,
2067}
fac55558
PH
2068
2069
a8795327
S
2070TV_PARENTAL_GUIDELINES = {
2071 'TV-Y': 0,
2072 'TV-Y7': 7,
2073 'TV-G': 0,
2074 'TV-PG': 0,
2075 'TV-14': 14,
2076 'TV-MA': 17,
2077}
2078
2079
146c80e2 2080def parse_age_limit(s):
a8795327
S
2081 if type(s) == int:
2082 return s if 0 <= s <= 21 else None
2083 if not isinstance(s, compat_basestring):
d838b1bd 2084 return None
146c80e2 2085 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
2086 if m:
2087 return int(m.group('age'))
2088 if s in US_RATINGS:
2089 return US_RATINGS[s]
2090 return TV_PARENTAL_GUIDELINES.get(s)
146c80e2
S
2091
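# A minimal usage sketch (illustrative only, not part of the original
# module): integers, 'NN+' strings, US movie ratings and TV parental
# guidelines are all understood.
def _parse_age_limit_example():
    assert parse_age_limit(18) == 18
    assert parse_age_limit('16+') == 16
    assert parse_age_limit('PG-13') == 13
    assert parse_age_limit('TV-MA') == 17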
2092
fac55558 2093def strip_jsonp(code):
609a61e3 2094 return re.sub(
5950cb1d 2095 r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
478c2c61
PH
2096
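# A minimal usage sketch (illustrative only, not part of the original
# module): the surrounding JSONP callback wrapper is stripped and the
# payload is returned unchanged.
def _strip_jsonp_example():
    assert strip_jsonp('callback({"status": "ok"});') == '{"status": "ok"}'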
2097
e05f6939
PH
2098def js_to_json(code):
2099 def fix_kv(m):
e7b6d122
PH
2100 v = m.group(0)
2101 if v in ('true', 'false', 'null'):
2102 return v
bd1e4844 2103 elif v.startswith('/*') or v == ',':
2104 return ""
2105
2106 if v[0] in ("'", '"'):
2107 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 2108 '"': '\\"',
bd1e4844 2109 "\\'": "'",
2110 '\\\n': '',
2111 '\\x': '\\u00',
2112 }.get(m.group(0), m.group(0)), v[1:-1])
2113
89ac4a19 2114 INTEGER_TABLE = (
e4659b45
YCH
2115 (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
2116 (r'^(0+[0-7]+)\s*:?$', 8),
89ac4a19
S
2117 )
2118
2119 for regex, base in INTEGER_TABLE:
2120 im = re.match(regex, v)
2121 if im:
e4659b45 2122 i = int(im.group(1), base)
89ac4a19
S
2123 return '"%d":' % i if v.endswith(':') else '%d' % i
2124
e7b6d122 2125 return '"%s"' % v
e05f6939 2126
bd1e4844 2127 return re.sub(r'''(?sx)
2128 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2129 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
2130 /\*.*?\*/|,(?=\s*[\]}])|
2131 [a-zA-Z_][.a-zA-Z_0-9]*|
47212f7b 2132 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
bd1e4844 2133 [0-9]+(?=\s*:)
e05f6939 2134 ''', fix_kv, code)
e05f6939
PH
2135
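# A minimal usage sketch (illustrative only, not part of the original
# module): unquoted keys, single quotes, hex literals and trailing commas
# are converted into valid JSON.
def _js_to_json_example():
    assert js_to_json("{foo: 'bar', num: 0x10,}") == '{"foo": "bar", "num": 16}'
    assert json.loads(js_to_json("{foo: 'bar', num: 0x10,}")) == {'foo': 'bar', 'num': 16}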
2136
478c2c61
PH
2137def qualities(quality_ids):
2138 """ Get a numeric quality value out of a list of possible values """
2139 def q(qid):
2140 try:
2141 return quality_ids.index(qid)
2142 except ValueError:
2143 return -1
2144 return q
2145
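# A minimal usage sketch (illustrative only, not part of the original
# module): a higher index in the preference list means higher quality and
# unknown ids sort below everything.
def _qualities_example():
    q = qualities(['240p', '360p', '720p', '1080p'])
    assert q('720p') == 2
    assert q('4320p') == -1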
acd69589
PH
2146
2147DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 2148
a020a0dc
PH
2149
2150def limit_length(s, length):
2151 """ Add ellipses to overly long strings """
2152 if s is None:
2153 return None
2154 ELLIPSES = '...'
2155 if len(s) > length:
2156 return s[:length - len(ELLIPSES)] + ELLIPSES
2157 return s
48844745
PH
2158
2159
2160def version_tuple(v):
5f9b8394 2161 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
2162
2163
2164def is_outdated_version(version, limit, assume_new=True):
2165 if not version:
2166 return not assume_new
2167 try:
2168 return version_tuple(version) < version_tuple(limit)
2169 except ValueError:
2170 return not assume_new
732ea2f0
PH
2171
2172
2173def ytdl_is_updateable():
2174 """ Returns True if youtube-dl can be updated with -U """
2175 from zipimport import zipimporter
2176
2177 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
7d4111ed
PH
2178
2179
2180def args_to_str(args):
2181 # Get a short string representation for a subprocess command
702ccf2d 2182 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
2183
2184
9b9c5355 2185def error_to_compat_str(err):
fdae2358
S
2186 err_str = str(err)
2187 # On Python 2, the error byte string must be decoded with the proper
2188 # encoding rather than ASCII
2189 if sys.version_info[0] < 3:
2190 err_str = err_str.decode(preferredencoding())
2191 return err_str
2192
2193
c460bdd5 2194def mimetype2ext(mt):
eb9ee194
S
2195 if mt is None:
2196 return None
2197
765ac263
JMF
2198 ext = {
2199 'audio/mp4': 'm4a',
6c33d24b
YCH
2200 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2201 # it's the most popular one
2202 'audio/mpeg': 'mp3',
765ac263
JMF
2203 }.get(mt)
2204 if ext is not None:
2205 return ext
2206
c460bdd5 2207 _, _, res = mt.rpartition('/')
6562d34a 2208 res = res.split(';')[0].strip().lower()
c460bdd5
PH
2209
2210 return {
f6861ec9 2211 '3gpp': '3gp',
cafcf657 2212 'smptett+xml': 'tt',
2213 'srt': 'srt',
2214 'ttaf+xml': 'dfxp',
a0d8d704 2215 'ttml+xml': 'ttml',
cafcf657 2216 'vtt': 'vtt',
f6861ec9 2217 'x-flv': 'flv',
a0d8d704
YCH
2218 'x-mp4-fragmented': 'mp4',
2219 'x-ms-wmv': 'wmv',
b4173f15
RA
2220 'mpegurl': 'm3u8',
2221 'x-mpegurl': 'm3u8',
2222 'vnd.apple.mpegurl': 'm3u8',
2223 'dash+xml': 'mpd',
2224 'f4m': 'f4m',
2225 'f4m+xml': 'f4m',
f164b971 2226 'hds+xml': 'f4m',
e910fe2f 2227 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 2228 'quicktime': 'mov',
c460bdd5
PH
2229 }.get(res, res)
2230
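# A minimal usage sketch (illustrative only, not part of the original
# module): parameters after ';' are ignored and unknown subtypes fall
# through unchanged.
def _mimetype2ext_example():
    assert mimetype2ext('application/x-mpegURL; charset=UTF-8') == 'm3u8'
    assert mimetype2ext('video/webm') == 'webm'
    assert mimetype2ext(None) is None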
2231
4f3c5e06 2232def parse_codecs(codecs_str):
2233 # http://tools.ietf.org/html/rfc6381
2234 if not codecs_str:
2235 return {}
2236 split_codecs = list(filter(None, map(
2237 lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
2238 vcodec, acodec = None, None
2239 for full_codec in split_codecs:
2240 codec = full_codec.split('.')[0]
2241 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
2242 if not vcodec:
2243 vcodec = full_codec
073ac122 2244 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
4f3c5e06 2245 if not acodec:
2246 acodec = full_codec
2247 else:
2248 write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
2249 if not vcodec and not acodec:
2250 if len(split_codecs) == 2:
2251 return {
2252 'vcodec': vcodec,
2253 'acodec': acodec,
2254 }
2255 elif len(split_codecs) == 1:
2256 return {
2257 'vcodec': 'none',
2258 'acodec': vcodec,
2259 }
2260 else:
2261 return {
2262 'vcodec': vcodec or 'none',
2263 'acodec': acodec or 'none',
2264 }
2265 return {}
2266
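# A minimal usage sketch (illustrative only, not part of the original
# module): typical DASH/HLS CODECS attributes are split into a video and
# an audio codec.
def _parse_codecs_example():
    assert parse_codecs('avc1.42E01E, mp4a.40.2') == {
        'vcodec': 'avc1.42E01E', 'acodec': 'mp4a.40.2'}
    assert parse_codecs('opus') == {'vcodec': 'none', 'acodec': 'opus'}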
2267
2ccd1b10 2268def urlhandle_detect_ext(url_handle):
79298173 2269 getheader = url_handle.headers.get
2ccd1b10 2270
b55ee18f
PH
2271 cd = getheader('Content-Disposition')
2272 if cd:
2273 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2274 if m:
2275 e = determine_ext(m.group('filename'), default_ext=None)
2276 if e:
2277 return e
2278
c460bdd5 2279 return mimetype2ext(getheader('Content-Type'))
05900629
PH
2280
2281
1e399778
YCH
2282def encode_data_uri(data, mime_type):
2283 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
2284
2285
05900629 2286def age_restricted(content_limit, age_limit):
6ec6cb4e 2287 """ Returns True iff the content should be blocked """
05900629
PH
2288
2289 if age_limit is None: # No limit set
2290 return False
2291 if content_limit is None:
2292 return False # Content available for everyone
2293 return age_limit < content_limit
61ca9a80
PH
2294
2295
2296def is_html(first_bytes):
2297 """ Detect whether a file contains HTML by examining its first bytes. """
2298
2299 BOMS = [
2300 (b'\xef\xbb\xbf', 'utf-8'),
2301 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2302 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2303 (b'\xff\xfe', 'utf-16-le'),
2304 (b'\xfe\xff', 'utf-16-be'),
2305 ]
2306 for bom, enc in BOMS:
2307 if first_bytes.startswith(bom):
2308 s = first_bytes[len(bom):].decode(enc, 'replace')
2309 break
2310 else:
2311 s = first_bytes.decode('utf-8', 'replace')
2312
2313 return re.match(r'^\s*<', s)
a055469f
PH
2314
2315
2316def determine_protocol(info_dict):
2317 protocol = info_dict.get('protocol')
2318 if protocol is not None:
2319 return protocol
2320
2321 url = info_dict['url']
2322 if url.startswith('rtmp'):
2323 return 'rtmp'
2324 elif url.startswith('mms'):
2325 return 'mms'
2326 elif url.startswith('rtsp'):
2327 return 'rtsp'
2328
2329 ext = determine_ext(url)
2330 if ext == 'm3u8':
2331 return 'm3u8'
2332 elif ext == 'f4m':
2333 return 'f4m'
2334
2335 return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2336
2337
2338def render_table(header_row, data):
2339 """ Render a list of rows, each as a list of values """
2340 table = [header_row] + data
2341 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
2342 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
2343 return '\n'.join(format_str % tuple(row) for row in table)
347de493
PH
2344
2345
2346def _match_one(filter_part, dct):
2347 COMPARISON_OPERATORS = {
2348 '<': operator.lt,
2349 '<=': operator.le,
2350 '>': operator.gt,
2351 '>=': operator.ge,
2352 '=': operator.eq,
2353 '!=': operator.ne,
2354 }
2355 operator_rex = re.compile(r'''(?x)\s*
2356 (?P<key>[a-z_]+)
2357 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2358 (?:
2359 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2360 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2361 )
2362 \s*$
2363 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2364 m = operator_rex.search(filter_part)
2365 if m:
2366 op = COMPARISON_OPERATORS[m.group('op')]
e5a088dc
S
2367 actual_value = dct.get(m.group('key'))
2368 if (m.group('strval') is not None or
2369 # If the original field is a string and the matching comparison value is
2370 # a number, we should respect the origin of the original field
2371 # and process the comparison value as a string (see
2372 # https://github.com/rg3/youtube-dl/issues/11082).
2373 actual_value is not None and m.group('intval') is not None and
2374 isinstance(actual_value, compat_str)):
347de493
PH
2375 if m.group('op') not in ('=', '!='):
2376 raise ValueError(
2377 'Operator %s does not support string values!' % m.group('op'))
e5a088dc 2378 comparison_value = m.group('strval') or m.group('intval')
347de493
PH
2379 else:
2380 try:
2381 comparison_value = int(m.group('intval'))
2382 except ValueError:
2383 comparison_value = parse_filesize(m.group('intval'))
2384 if comparison_value is None:
2385 comparison_value = parse_filesize(m.group('intval') + 'B')
2386 if comparison_value is None:
2387 raise ValueError(
2388 'Invalid integer value %r in filter part %r' % (
2389 m.group('intval'), filter_part))
347de493
PH
2390 if actual_value is None:
2391 return m.group('none_inclusive')
2392 return op(actual_value, comparison_value)
2393
2394 UNARY_OPERATORS = {
2395 '': lambda v: v is not None,
2396 '!': lambda v: v is None,
2397 }
2398 operator_rex = re.compile(r'''(?x)\s*
2399 (?P<op>%s)\s*(?P<key>[a-z_]+)
2400 \s*$
2401 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2402 m = operator_rex.search(filter_part)
2403 if m:
2404 op = UNARY_OPERATORS[m.group('op')]
2405 actual_value = dct.get(m.group('key'))
2406 return op(actual_value)
2407
2408 raise ValueError('Invalid filter part %r' % filter_part)
2409
2410
2411def match_str(filter_str, dct):
2412 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or False """
2413
2414 return all(
2415 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2416
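# A minimal usage sketch (illustrative only, not part of the original
# module) of the filter mini-language parsed by _match_one() above and
# used by --match-filter.
def _match_str_example():
    video = {'like_count': 190, 'dislike_count': 10, 'title': 'foo'}
    assert match_str('like_count > 100 & dislike_count <? 50', video)
    assert not match_str('like_count > 500', video)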
2417
2418def match_filter_func(filter_str):
2419 def _match_func(info_dict):
2420 if match_str(filter_str, info_dict):
2421 return None
2422 else:
2423 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2424 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2425 return _match_func
91410c9b
PH
2426
2427
bf6427d2
YCH
2428def parse_dfxp_time_expr(time_expr):
2429 if not time_expr:
d631d5f9 2430 return
bf6427d2
YCH
2431
2432 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2433 if mobj:
2434 return float(mobj.group('time_offset'))
2435
db2fe38b 2436 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 2437 if mobj:
db2fe38b 2438 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
2439
2440
c1c924ab
YCH
2441def srt_subtitles_timecode(seconds):
2442 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
bf6427d2
YCH
2443
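# A minimal usage sketch (illustrative only, not part of the original
# module): DFXP time expressions parse to seconds and seconds render as
# SRT 'HH:MM:SS,mmm' timecodes.
def _subtitle_time_example():
    assert parse_dfxp_time_expr('00:01:00.5') == 60.5
    assert srt_subtitles_timecode(60.5) == '00:01:00,500'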
2444
2445def dfxp2srt(dfxp_data):
4e335771
YCH
2446 _x = functools.partial(xpath_with_ns, ns_map={
2447 'ttml': 'http://www.w3.org/ns/ttml',
2448 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
5bf28d78 2449 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
4e335771 2450 })
bf6427d2 2451
87de7069 2452 class TTMLPElementParser(object):
2b14cb56 2453 out = ''
bf6427d2 2454
2b14cb56 2455 def start(self, tag, attrib):
2456 if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2457 self.out += '\n'
bf6427d2 2458
2b14cb56 2459 def end(self, tag):
2460 pass
bf6427d2 2461
2b14cb56 2462 def data(self, data):
2463 self.out += data
2464
2465 def close(self):
2466 return self.out.strip()
2467
2468 def parse_node(node):
2469 target = TTMLPElementParser()
2470 parser = xml.etree.ElementTree.XMLParser(target=target)
2471 parser.feed(xml.etree.ElementTree.tostring(node))
2472 return parser.close()
bf6427d2 2473
36e6f62c 2474 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
bf6427d2 2475 out = []
5bf28d78 2476 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
1b0427e6
YCH
2477
2478 if not paras:
2479 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2
YCH
2480
2481 for para, index in zip(paras, itertools.count(1)):
d631d5f9 2482 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 2483 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
2484 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2485 if begin_time is None:
2486 continue
7dff0363 2487 if not end_time:
d631d5f9
YCH
2488 if not dur:
2489 continue
2490 end_time = begin_time + dur
bf6427d2
YCH
2491 out.append('%d\n%s --> %s\n%s\n\n' % (
2492 index,
c1c924ab
YCH
2493 srt_subtitles_timecode(begin_time),
2494 srt_subtitles_timecode(end_time),
bf6427d2
YCH
2495 parse_node(para)))
2496
2497 return ''.join(out)
2498
2499
66e289ba
S
2500def cli_option(params, command_option, param):
2501 param = params.get(param)
98e698f1
RA
2502 if param:
2503 param = compat_str(param)
66e289ba
S
2504 return [command_option, param] if param is not None else []
2505
2506
2507def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2508 param = params.get(param)
2509 assert isinstance(param, bool)
2510 if separator:
2511 return [command_option + separator + (true_value if param else false_value)]
2512 return [command_option, true_value if param else false_value]
2513
2514
2515def cli_valueless_option(params, command_option, param, expected_value=True):
2516 param = params.get(param)
2517 return [command_option] if param == expected_value else []
2518
2519
2520def cli_configuration_args(params, param, default=[]):
2521 ex_args = params.get(param)
2522 if ex_args is None:
2523 return default
2524 assert isinstance(ex_args, list)
2525 return ex_args
2526
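# A minimal usage sketch (illustrative only, not part of the original
# module): these helpers turn youtube-dl params into argument lists for
# external downloaders and postprocessors.
def _cli_helpers_example():
    params = {'username': 'foo', 'nocheckcertificate': True, 'ratelimit': None}
    assert cli_option(params, '--user', 'username') == ['--user', 'foo']
    assert cli_option(params, '--limit-rate', 'ratelimit') == []
    assert cli_bool_option(params, '--check-certificate',
                           'nocheckcertificate', 'false', 'true') == ['--check-certificate', 'false']
    assert cli_valueless_option(params, '--no-check-certificate', 'nocheckcertificate') == ['--no-check-certificate']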
2527
39672624
YCH
2528class ISO639Utils(object):
2529 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2530 _lang_map = {
2531 'aa': 'aar',
2532 'ab': 'abk',
2533 'ae': 'ave',
2534 'af': 'afr',
2535 'ak': 'aka',
2536 'am': 'amh',
2537 'an': 'arg',
2538 'ar': 'ara',
2539 'as': 'asm',
2540 'av': 'ava',
2541 'ay': 'aym',
2542 'az': 'aze',
2543 'ba': 'bak',
2544 'be': 'bel',
2545 'bg': 'bul',
2546 'bh': 'bih',
2547 'bi': 'bis',
2548 'bm': 'bam',
2549 'bn': 'ben',
2550 'bo': 'bod',
2551 'br': 'bre',
2552 'bs': 'bos',
2553 'ca': 'cat',
2554 'ce': 'che',
2555 'ch': 'cha',
2556 'co': 'cos',
2557 'cr': 'cre',
2558 'cs': 'ces',
2559 'cu': 'chu',
2560 'cv': 'chv',
2561 'cy': 'cym',
2562 'da': 'dan',
2563 'de': 'deu',
2564 'dv': 'div',
2565 'dz': 'dzo',
2566 'ee': 'ewe',
2567 'el': 'ell',
2568 'en': 'eng',
2569 'eo': 'epo',
2570 'es': 'spa',
2571 'et': 'est',
2572 'eu': 'eus',
2573 'fa': 'fas',
2574 'ff': 'ful',
2575 'fi': 'fin',
2576 'fj': 'fij',
2577 'fo': 'fao',
2578 'fr': 'fra',
2579 'fy': 'fry',
2580 'ga': 'gle',
2581 'gd': 'gla',
2582 'gl': 'glg',
2583 'gn': 'grn',
2584 'gu': 'guj',
2585 'gv': 'glv',
2586 'ha': 'hau',
2587 'he': 'heb',
2588 'hi': 'hin',
2589 'ho': 'hmo',
2590 'hr': 'hrv',
2591 'ht': 'hat',
2592 'hu': 'hun',
2593 'hy': 'hye',
2594 'hz': 'her',
2595 'ia': 'ina',
2596 'id': 'ind',
2597 'ie': 'ile',
2598 'ig': 'ibo',
2599 'ii': 'iii',
2600 'ik': 'ipk',
2601 'io': 'ido',
2602 'is': 'isl',
2603 'it': 'ita',
2604 'iu': 'iku',
2605 'ja': 'jpn',
2606 'jv': 'jav',
2607 'ka': 'kat',
2608 'kg': 'kon',
2609 'ki': 'kik',
2610 'kj': 'kua',
2611 'kk': 'kaz',
2612 'kl': 'kal',
2613 'km': 'khm',
2614 'kn': 'kan',
2615 'ko': 'kor',
2616 'kr': 'kau',
2617 'ks': 'kas',
2618 'ku': 'kur',
2619 'kv': 'kom',
2620 'kw': 'cor',
2621 'ky': 'kir',
2622 'la': 'lat',
2623 'lb': 'ltz',
2624 'lg': 'lug',
2625 'li': 'lim',
2626 'ln': 'lin',
2627 'lo': 'lao',
2628 'lt': 'lit',
2629 'lu': 'lub',
2630 'lv': 'lav',
2631 'mg': 'mlg',
2632 'mh': 'mah',
2633 'mi': 'mri',
2634 'mk': 'mkd',
2635 'ml': 'mal',
2636 'mn': 'mon',
2637 'mr': 'mar',
2638 'ms': 'msa',
2639 'mt': 'mlt',
2640 'my': 'mya',
2641 'na': 'nau',
2642 'nb': 'nob',
2643 'nd': 'nde',
2644 'ne': 'nep',
2645 'ng': 'ndo',
2646 'nl': 'nld',
2647 'nn': 'nno',
2648 'no': 'nor',
2649 'nr': 'nbl',
2650 'nv': 'nav',
2651 'ny': 'nya',
2652 'oc': 'oci',
2653 'oj': 'oji',
2654 'om': 'orm',
2655 'or': 'ori',
2656 'os': 'oss',
2657 'pa': 'pan',
2658 'pi': 'pli',
2659 'pl': 'pol',
2660 'ps': 'pus',
2661 'pt': 'por',
2662 'qu': 'que',
2663 'rm': 'roh',
2664 'rn': 'run',
2665 'ro': 'ron',
2666 'ru': 'rus',
2667 'rw': 'kin',
2668 'sa': 'san',
2669 'sc': 'srd',
2670 'sd': 'snd',
2671 'se': 'sme',
2672 'sg': 'sag',
2673 'si': 'sin',
2674 'sk': 'slk',
2675 'sl': 'slv',
2676 'sm': 'smo',
2677 'sn': 'sna',
2678 'so': 'som',
2679 'sq': 'sqi',
2680 'sr': 'srp',
2681 'ss': 'ssw',
2682 'st': 'sot',
2683 'su': 'sun',
2684 'sv': 'swe',
2685 'sw': 'swa',
2686 'ta': 'tam',
2687 'te': 'tel',
2688 'tg': 'tgk',
2689 'th': 'tha',
2690 'ti': 'tir',
2691 'tk': 'tuk',
2692 'tl': 'tgl',
2693 'tn': 'tsn',
2694 'to': 'ton',
2695 'tr': 'tur',
2696 'ts': 'tso',
2697 'tt': 'tat',
2698 'tw': 'twi',
2699 'ty': 'tah',
2700 'ug': 'uig',
2701 'uk': 'ukr',
2702 'ur': 'urd',
2703 'uz': 'uzb',
2704 've': 'ven',
2705 'vi': 'vie',
2706 'vo': 'vol',
2707 'wa': 'wln',
2708 'wo': 'wol',
2709 'xh': 'xho',
2710 'yi': 'yid',
2711 'yo': 'yor',
2712 'za': 'zha',
2713 'zh': 'zho',
2714 'zu': 'zul',
2715 }
2716
2717 @classmethod
2718 def short2long(cls, code):
2719 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2720 return cls._lang_map.get(code[:2])
2721
2722 @classmethod
2723 def long2short(cls, code):
2724 """Convert language code from ISO 639-2/T to ISO 639-1"""
2725 for short_name, long_name in cls._lang_map.items():
2726 if long_name == code:
2727 return short_name
2728
2729
4eb10f66
YCH
2730class ISO3166Utils(object):
2731 # From http://data.okfn.org/data/core/country-list
2732 _country_map = {
2733 'AF': 'Afghanistan',
2734 'AX': 'Åland Islands',
2735 'AL': 'Albania',
2736 'DZ': 'Algeria',
2737 'AS': 'American Samoa',
2738 'AD': 'Andorra',
2739 'AO': 'Angola',
2740 'AI': 'Anguilla',
2741 'AQ': 'Antarctica',
2742 'AG': 'Antigua and Barbuda',
2743 'AR': 'Argentina',
2744 'AM': 'Armenia',
2745 'AW': 'Aruba',
2746 'AU': 'Australia',
2747 'AT': 'Austria',
2748 'AZ': 'Azerbaijan',
2749 'BS': 'Bahamas',
2750 'BH': 'Bahrain',
2751 'BD': 'Bangladesh',
2752 'BB': 'Barbados',
2753 'BY': 'Belarus',
2754 'BE': 'Belgium',
2755 'BZ': 'Belize',
2756 'BJ': 'Benin',
2757 'BM': 'Bermuda',
2758 'BT': 'Bhutan',
2759 'BO': 'Bolivia, Plurinational State of',
2760 'BQ': 'Bonaire, Sint Eustatius and Saba',
2761 'BA': 'Bosnia and Herzegovina',
2762 'BW': 'Botswana',
2763 'BV': 'Bouvet Island',
2764 'BR': 'Brazil',
2765 'IO': 'British Indian Ocean Territory',
2766 'BN': 'Brunei Darussalam',
2767 'BG': 'Bulgaria',
2768 'BF': 'Burkina Faso',
2769 'BI': 'Burundi',
2770 'KH': 'Cambodia',
2771 'CM': 'Cameroon',
2772 'CA': 'Canada',
2773 'CV': 'Cape Verde',
2774 'KY': 'Cayman Islands',
2775 'CF': 'Central African Republic',
2776 'TD': 'Chad',
2777 'CL': 'Chile',
2778 'CN': 'China',
2779 'CX': 'Christmas Island',
2780 'CC': 'Cocos (Keeling) Islands',
2781 'CO': 'Colombia',
2782 'KM': 'Comoros',
2783 'CG': 'Congo',
2784 'CD': 'Congo, the Democratic Republic of the',
2785 'CK': 'Cook Islands',
2786 'CR': 'Costa Rica',
2787 'CI': 'Côte d\'Ivoire',
2788 'HR': 'Croatia',
2789 'CU': 'Cuba',
2790 'CW': 'Curaçao',
2791 'CY': 'Cyprus',
2792 'CZ': 'Czech Republic',
2793 'DK': 'Denmark',
2794 'DJ': 'Djibouti',
2795 'DM': 'Dominica',
2796 'DO': 'Dominican Republic',
2797 'EC': 'Ecuador',
2798 'EG': 'Egypt',
2799 'SV': 'El Salvador',
2800 'GQ': 'Equatorial Guinea',
2801 'ER': 'Eritrea',
2802 'EE': 'Estonia',
2803 'ET': 'Ethiopia',
2804 'FK': 'Falkland Islands (Malvinas)',
2805 'FO': 'Faroe Islands',
2806 'FJ': 'Fiji',
2807 'FI': 'Finland',
2808 'FR': 'France',
2809 'GF': 'French Guiana',
2810 'PF': 'French Polynesia',
2811 'TF': 'French Southern Territories',
2812 'GA': 'Gabon',
2813 'GM': 'Gambia',
2814 'GE': 'Georgia',
2815 'DE': 'Germany',
2816 'GH': 'Ghana',
2817 'GI': 'Gibraltar',
2818 'GR': 'Greece',
2819 'GL': 'Greenland',
2820 'GD': 'Grenada',
2821 'GP': 'Guadeloupe',
2822 'GU': 'Guam',
2823 'GT': 'Guatemala',
2824 'GG': 'Guernsey',
2825 'GN': 'Guinea',
2826 'GW': 'Guinea-Bissau',
2827 'GY': 'Guyana',
2828 'HT': 'Haiti',
2829 'HM': 'Heard Island and McDonald Islands',
2830 'VA': 'Holy See (Vatican City State)',
2831 'HN': 'Honduras',
2832 'HK': 'Hong Kong',
2833 'HU': 'Hungary',
2834 'IS': 'Iceland',
2835 'IN': 'India',
2836 'ID': 'Indonesia',
2837 'IR': 'Iran, Islamic Republic of',
2838 'IQ': 'Iraq',
2839 'IE': 'Ireland',
2840 'IM': 'Isle of Man',
2841 'IL': 'Israel',
2842 'IT': 'Italy',
2843 'JM': 'Jamaica',
2844 'JP': 'Japan',
2845 'JE': 'Jersey',
2846 'JO': 'Jordan',
2847 'KZ': 'Kazakhstan',
2848 'KE': 'Kenya',
2849 'KI': 'Kiribati',
2850 'KP': 'Korea, Democratic People\'s Republic of',
2851 'KR': 'Korea, Republic of',
2852 'KW': 'Kuwait',
2853 'KG': 'Kyrgyzstan',
2854 'LA': 'Lao People\'s Democratic Republic',
2855 'LV': 'Latvia',
2856 'LB': 'Lebanon',
2857 'LS': 'Lesotho',
2858 'LR': 'Liberia',
2859 'LY': 'Libya',
2860 'LI': 'Liechtenstein',
2861 'LT': 'Lithuania',
2862 'LU': 'Luxembourg',
2863 'MO': 'Macao',
2864 'MK': 'Macedonia, the Former Yugoslav Republic of',
2865 'MG': 'Madagascar',
2866 'MW': 'Malawi',
2867 'MY': 'Malaysia',
2868 'MV': 'Maldives',
2869 'ML': 'Mali',
2870 'MT': 'Malta',
2871 'MH': 'Marshall Islands',
2872 'MQ': 'Martinique',
2873 'MR': 'Mauritania',
2874 'MU': 'Mauritius',
2875 'YT': 'Mayotte',
2876 'MX': 'Mexico',
2877 'FM': 'Micronesia, Federated States of',
2878 'MD': 'Moldova, Republic of',
2879 'MC': 'Monaco',
2880 'MN': 'Mongolia',
2881 'ME': 'Montenegro',
2882 'MS': 'Montserrat',
2883 'MA': 'Morocco',
2884 'MZ': 'Mozambique',
2885 'MM': 'Myanmar',
2886 'NA': 'Namibia',
2887 'NR': 'Nauru',
2888 'NP': 'Nepal',
2889 'NL': 'Netherlands',
2890 'NC': 'New Caledonia',
2891 'NZ': 'New Zealand',
2892 'NI': 'Nicaragua',
2893 'NE': 'Niger',
2894 'NG': 'Nigeria',
2895 'NU': 'Niue',
2896 'NF': 'Norfolk Island',
2897 'MP': 'Northern Mariana Islands',
2898 'NO': 'Norway',
2899 'OM': 'Oman',
2900 'PK': 'Pakistan',
2901 'PW': 'Palau',
2902 'PS': 'Palestine, State of',
2903 'PA': 'Panama',
2904 'PG': 'Papua New Guinea',
2905 'PY': 'Paraguay',
2906 'PE': 'Peru',
2907 'PH': 'Philippines',
2908 'PN': 'Pitcairn',
2909 'PL': 'Poland',
2910 'PT': 'Portugal',
2911 'PR': 'Puerto Rico',
2912 'QA': 'Qatar',
2913 'RE': 'Réunion',
2914 'RO': 'Romania',
2915 'RU': 'Russian Federation',
2916 'RW': 'Rwanda',
2917 'BL': 'Saint Barthélemy',
2918 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2919 'KN': 'Saint Kitts and Nevis',
2920 'LC': 'Saint Lucia',
2921 'MF': 'Saint Martin (French part)',
2922 'PM': 'Saint Pierre and Miquelon',
2923 'VC': 'Saint Vincent and the Grenadines',
2924 'WS': 'Samoa',
2925 'SM': 'San Marino',
2926 'ST': 'Sao Tome and Principe',
2927 'SA': 'Saudi Arabia',
2928 'SN': 'Senegal',
2929 'RS': 'Serbia',
2930 'SC': 'Seychelles',
2931 'SL': 'Sierra Leone',
2932 'SG': 'Singapore',
2933 'SX': 'Sint Maarten (Dutch part)',
2934 'SK': 'Slovakia',
2935 'SI': 'Slovenia',
2936 'SB': 'Solomon Islands',
2937 'SO': 'Somalia',
2938 'ZA': 'South Africa',
2939 'GS': 'South Georgia and the South Sandwich Islands',
2940 'SS': 'South Sudan',
2941 'ES': 'Spain',
2942 'LK': 'Sri Lanka',
2943 'SD': 'Sudan',
2944 'SR': 'Suriname',
2945 'SJ': 'Svalbard and Jan Mayen',
2946 'SZ': 'Swaziland',
2947 'SE': 'Sweden',
2948 'CH': 'Switzerland',
2949 'SY': 'Syrian Arab Republic',
2950 'TW': 'Taiwan, Province of China',
2951 'TJ': 'Tajikistan',
2952 'TZ': 'Tanzania, United Republic of',
2953 'TH': 'Thailand',
2954 'TL': 'Timor-Leste',
2955 'TG': 'Togo',
2956 'TK': 'Tokelau',
2957 'TO': 'Tonga',
2958 'TT': 'Trinidad and Tobago',
2959 'TN': 'Tunisia',
2960 'TR': 'Turkey',
2961 'TM': 'Turkmenistan',
2962 'TC': 'Turks and Caicos Islands',
2963 'TV': 'Tuvalu',
2964 'UG': 'Uganda',
2965 'UA': 'Ukraine',
2966 'AE': 'United Arab Emirates',
2967 'GB': 'United Kingdom',
2968 'US': 'United States',
2969 'UM': 'United States Minor Outlying Islands',
2970 'UY': 'Uruguay',
2971 'UZ': 'Uzbekistan',
2972 'VU': 'Vanuatu',
2973 'VE': 'Venezuela, Bolivarian Republic of',
2974 'VN': 'Viet Nam',
2975 'VG': 'Virgin Islands, British',
2976 'VI': 'Virgin Islands, U.S.',
2977 'WF': 'Wallis and Futuna',
2978 'EH': 'Western Sahara',
2979 'YE': 'Yemen',
2980 'ZM': 'Zambia',
2981 'ZW': 'Zimbabwe',
2982 }
2983
2984 @classmethod
2985 def short2full(cls, code):
2986 """Convert an ISO 3166-1 alpha-2 country code to the corresponding full name"""
2987 return cls._country_map.get(code.upper())
2988
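# A minimal usage sketch (illustrative only, not part of the original
# module): language and country code conversions based on the static
# tables above.
def _iso_utils_example():
    assert ISO639Utils.short2long('en') == 'eng'
    assert ISO639Utils.long2short('deu') == 'de'
    assert ISO3166Utils.short2full('de') == 'Germany'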
2989
91410c9b 2990class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2461f79d
PH
2991 def __init__(self, proxies=None):
2992 # Set default handlers
2993 for type in ('http', 'https'):
2994 setattr(self, '%s_open' % type,
2995 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
2996 meth(r, proxy, type))
2997 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2998
91410c9b 2999 def proxy_open(self, req, proxy, type):
2461f79d 3000 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
3001 if req_proxy is not None:
3002 proxy = req_proxy
2461f79d
PH
3003 del req.headers['Ytdl-request-proxy']
3004
3005 if proxy == '__noproxy__':
3006 return None # No Proxy
51fb4995 3007 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188
YCH
3008 req.add_header('Ytdl-socks-proxy', proxy)
3009 # youtube-dl's http/https handlers take care of wrapping the socket with SOCKS
3010 return None
91410c9b
PH
3011 return compat_urllib_request.ProxyHandler.proxy_open(
3012 self, req, proxy, type)
5bc880b9
YCH
3013
3014
3015def ohdave_rsa_encrypt(data, exponent, modulus):
3016 '''
3017 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
3018
3019 Input:
3020 data: data to encrypt, bytes-like object
3021 exponent, modulus: parameters e and N of the RSA algorithm, both integers
3022 Output: hex string of encrypted data
3023
3024 Limitation: only single-block encryption is supported
3025 '''
3026
3027 payload = int(binascii.hexlify(data[::-1]), 16)
3028 encrypted = pow(payload, exponent, modulus)
3029 return '%x' % encrypted
81bdc8fd
YCH
3030
3031
5eb6bdce 3032def encode_base_n(num, n, table=None):
59f898b7 3033 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
59f898b7
YCH
3034 if not table:
3035 table = FULL_TABLE[:n]
3036
5eb6bdce
YCH
3037 if n > len(table):
3038 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
3039
3040 if num == 0:
3041 return table[0]
3042
81bdc8fd
YCH
3043 ret = ''
3044 while num:
3045 ret = table[num % n] + ret
3046 num = num // n
3047 return ret
f52354a8
YCH
3048
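# A minimal usage sketch (illustrative only, not part of the original
# module): encode_base_n() is the inverse of int(x, base) for bases
# covered by the default digit table.
def _encode_base_n_example():
    assert encode_base_n(255, 16) == 'ff'
    assert encode_base_n(0, 36) == '0'
    assert int(encode_base_n(123456, 36), 36) == 123456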
3049
3050def decode_packed_codes(code):
06b3fe29 3051 mobj = re.search(PACKED_CODES_RE, code)
f52354a8
YCH
3052 obfuscated_code, base, count, symbols = mobj.groups()
3053 base = int(base)
3054 count = int(count)
3055 symbols = symbols.split('|')
3056 symbol_table = {}
3057
3058 while count:
3059 count -= 1
5eb6bdce 3060 base_n_count = encode_base_n(count, base)
f52354a8
YCH
3061 symbol_table[base_n_count] = symbols[count] or base_n_count
3062
3063 return re.sub(
3064 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
3065 obfuscated_code)
e154c651 3066
3067
3068def parse_m3u8_attributes(attrib):
3069 info = {}
3070 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
3071 if val.startswith('"'):
3072 val = val[1:-1]
3073 info[key] = val
3074 return info
1143535d
YCH
3075
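# A minimal usage sketch (illustrative only, not part of the original
# module): quoted and unquoted attribute values from an EXT-X-STREAM-INF
# line are returned as a dict.
def _parse_m3u8_attributes_example():
    assert parse_m3u8_attributes('BANDWIDTH=540000,CODECS="avc1.42001e,mp4a.40.2"') == {
        'BANDWIDTH': '540000',
        'CODECS': 'avc1.42001e,mp4a.40.2',
    }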
3076
3077def urshift(val, n):
3078 return val >> n if val >= 0 else (val + 0x100000000) >> n
d3f8e038
YCH
3079
3080
3081# Based on png2str() written by @gdkchan and improved by @yokrysty
3082# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
3083def decode_png(png_data):
3084 # Reference: https://www.w3.org/TR/PNG/
3085 header = png_data[8:]
3086
3087 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
3088 raise IOError('Not a valid PNG file.')
3089
3090 int_map = {1: '>B', 2: '>H', 4: '>I'}
3091 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
3092
3093 chunks = []
3094
3095 while header:
3096 length = unpack_integer(header[:4])
3097 header = header[4:]
3098
3099 chunk_type = header[:4]
3100 header = header[4:]
3101
3102 chunk_data = header[:length]
3103 header = header[length:]
3104
3105 header = header[4:] # Skip CRC
3106
3107 chunks.append({
3108 'type': chunk_type,
3109 'length': length,
3110 'data': chunk_data
3111 })
3112
3113 ihdr = chunks[0]['data']
3114
3115 width = unpack_integer(ihdr[:4])
3116 height = unpack_integer(ihdr[4:8])
3117
3118 idat = b''
3119
3120 for chunk in chunks:
3121 if chunk['type'] == b'IDAT':
3122 idat += chunk['data']
3123
3124 if not idat:
3125 raise IOError('Unable to read PNG data.')
3126
3127 decompressed_data = bytearray(zlib.decompress(idat))
3128
3129 stride = width * 3
3130 pixels = []
3131
3132 def _get_pixel(idx):
3133 x = idx % stride
3134 y = idx // stride
3135 return pixels[y][x]
3136
3137 for y in range(height):
3138 basePos = y * (1 + stride)
3139 filter_type = decompressed_data[basePos]
3140
3141 current_row = []
3142
3143 pixels.append(current_row)
3144
3145 for x in range(stride):
3146 color = decompressed_data[1 + basePos + x]
3147 basex = y * stride + x
3148 left = 0
3149 up = 0
3150
3151 if x > 2:
3152 left = _get_pixel(basex - 3)
3153 if y > 0:
3154 up = _get_pixel(basex - stride)
3155
3156 if filter_type == 1: # Sub
3157 color = (color + left) & 0xff
3158 elif filter_type == 2: # Up
3159 color = (color + up) & 0xff
3160 elif filter_type == 3: # Average
3161 color = (color + ((left + up) >> 1)) & 0xff
3162 elif filter_type == 4: # Paeth
3163 a = left
3164 b = up
3165 c = 0
3166
3167 if x > 2 and y > 0:
3168 c = _get_pixel(basex - stride - 3)
3169
3170 p = a + b - c
3171
3172 pa = abs(p - a)
3173 pb = abs(p - b)
3174 pc = abs(p - c)
3175
3176 if pa <= pb and pa <= pc:
3177 color = (color + a) & 0xff
3178 elif pb <= pc:
3179 color = (color + b) & 0xff
3180 else:
3181 color = (color + c) & 0xff
3182
3183 current_row.append(color)
3184
3185 return width, height, pixels
efa97bdc
YCH
3186
3187
3188def write_xattr(path, key, value):
3189 # This mess below finds the best xattr tool for the job
3190 try:
3191 # try the pyxattr module...
3192 import xattr
3193
53a7e3d2
YCH
3194 if hasattr(xattr, 'set'): # pyxattr
3195 # Unicode arguments are not supported in python-pyxattr until
3196 # version 0.5.0
3197 # See https://github.com/rg3/youtube-dl/issues/5498
3198 pyxattr_required_version = '0.5.0'
3199 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
3200 # TODO: fallback to CLI tools
3201 raise XAttrUnavailableError(
3202 'python-pyxattr is detected but is too old. '
3203 'youtube-dl requires %s or above while your version is %s. '
3204 'Falling back to other xattr implementations' % (
3205 pyxattr_required_version, xattr.__version__))
3206
3207 setxattr = xattr.set
3208 else: # xattr
3209 setxattr = xattr.setxattr
efa97bdc
YCH
3210
3211 try:
53a7e3d2 3212 setxattr(path, key, value)
efa97bdc
YCH
3213 except EnvironmentError as e:
3214 raise XAttrMetadataError(e.errno, e.strerror)
3215
3216 except ImportError:
3217 if compat_os_name == 'nt':
3218 # Write xattrs to NTFS Alternate Data Streams:
3219 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
3220 assert ':' not in key
3221 assert os.path.exists(path)
3222
3223 ads_fn = path + ':' + key
3224 try:
3225 with open(ads_fn, 'wb') as f:
3226 f.write(value)
3227 except EnvironmentError as e:
3228 raise XAttrMetadataError(e.errno, e.strerror)
3229 else:
3230 user_has_setfattr = check_executable('setfattr', ['--version'])
3231 user_has_xattr = check_executable('xattr', ['-h'])
3232
3233 if user_has_setfattr or user_has_xattr:
3234
3235 value = value.decode('utf-8')
3236 if user_has_setfattr:
3237 executable = 'setfattr'
3238 opts = ['-n', key, '-v', value]
3239 elif user_has_xattr:
3240 executable = 'xattr'
3241 opts = ['-w', key, value]
3242
3243 cmd = ([encodeFilename(executable, True)] +
3244 [encodeArgument(o) for o in opts] +
3245 [encodeFilename(path, True)])
3246
3247 try:
3248 p = subprocess.Popen(
3249 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
3250 except EnvironmentError as e:
3251 raise XAttrMetadataError(e.errno, e.strerror)
3252 stdout, stderr = p.communicate()
3253 stderr = stderr.decode('utf-8', 'replace')
3254 if p.returncode != 0:
3255 raise XAttrMetadataError(p.returncode, stderr)
3256
3257 else:
3258 # On Unix, if we can't find pyxattr, setfattr, or xattr.
3259 if sys.platform.startswith('linux'):
3260 raise XAttrUnavailableError(
3261 "Couldn't find a tool to set the xattrs. "
3262 "Install either the python 'pyxattr' or 'xattr' "
3263 "modules, or the GNU 'attr' package "
3264 "(which contains the 'setfattr' tool).")
3265 else:
3266 raise XAttrUnavailableError(
3267 "Couldn't find a tool to set the xattrs. "
3268 "Install either the python 'xattr' module, "
3269 "or the 'xattr' binary.")