#!/usr/bin/env python
# coding: utf-8

from __future__ import unicode_literals

import base64
import binascii
import calendar
import codecs
import contextlib
import ctypes
import datetime
import email.utils
import errno
import functools
import gzip
import io
import itertools
import json
import locale
import math
import operator
import os
import pipes
import platform
import re
import socket
import ssl
import subprocess
import sys
import tempfile
import traceback
import xml.etree.ElementTree
import zlib

from .compat import (
    compat_HTMLParser,
    compat_basestring,
    compat_chr,
    compat_etree_fromstring,
    compat_html_entities,
    compat_html_entities_html5,
    compat_http_client,
    compat_kwargs,
    compat_os_name,
    compat_parse_qs,
    compat_shlex_quote,
    compat_socket_create_connection,
    compat_str,
    compat_struct_pack,
    compat_struct_unpack,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlencode,
    compat_urllib_parse_urlparse,
    compat_urllib_parse_unquote_plus,
    compat_urllib_request,
    compat_urlparse,
    compat_xpath,
)

from .socks import (
    ProxyType,
    sockssocket,
)


def register_socks_protocols():
    # "Register" SOCKS protocols
    # In Python < 2.6.5, urlsplit() suffers from bug https://bugs.python.org/issue7904
    # URLs with protocols not in urlparse.uses_netloc are not handled correctly
    for scheme in ('socks', 'socks4', 'socks4a', 'socks5'):
        if scheme not in compat_urlparse.uses_netloc:
            compat_urlparse.uses_netloc.append(scheme)


# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


USER_AGENTS = {
    'Safari': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27',
}


NO_DEFAULT = object()

ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']

MONTH_NAMES = {
    'en': ENGLISH_MONTH_NAMES,
    'fr': [
        'janvier', 'février', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'],
}

KNOWN_EXTENSIONS = (
    'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac',
    'flv', 'f4v', 'f4a', 'f4b',
    'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus',
    'mkv', 'mka', 'mk3d',
    'avi', 'divx',
    'mov',
    'asf', 'wmv', 'wma',
    '3gp', '3g2',
    'mp3',
    'flac',
    'ape',
    'wav',
    'f4f', 'f4m', 'm3u8', 'smil')

# needed for sanitizing filenames in restricted mode
ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ',
                        itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'],
                                        'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy')))

DATE_FORMATS = (
    '%d %B %Y',
    '%d %b %Y',
    '%B %d %Y',
    '%b %d %Y',
    '%b %dst %Y %I:%M',
    '%b %dnd %Y %I:%M',
    '%b %dth %Y %I:%M',
    '%Y %m %d',
    '%Y-%m-%d',
    '%Y/%m/%d',
    '%Y/%m/%d %H:%M',
    '%Y/%m/%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S',
    '%Y-%m-%d %H:%M:%S.%f',
    '%d.%m.%Y %H:%M',
    '%d.%m.%Y %H.%M',
    '%Y-%m-%dT%H:%M:%SZ',
    '%Y-%m-%dT%H:%M:%S.%fZ',
    '%Y-%m-%dT%H:%M:%S.%f0Z',
    '%Y-%m-%dT%H:%M:%S',
    '%Y-%m-%dT%H:%M:%S.%f',
    '%Y-%m-%dT%H:%M',
    '%b %d %Y at %H:%M',
    '%b %d %Y at %H:%M:%S',
)

DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS)
DATE_FORMATS_DAY_FIRST.extend([
    '%d-%m-%Y',
    '%d.%m.%Y',
    '%d.%m.%y',
    '%d/%m/%Y',
    '%d/%m/%y',
    '%d/%m/%Y %H:%M:%S',
])

DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS)
DATE_FORMATS_MONTH_FIRST.extend([
    '%m-%d-%Y',
    '%m.%d.%Y',
    '%m/%d/%Y',
    '%m/%d/%y',
    '%m/%d/%Y %H:%M:%S',
])

PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)"

def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks.
    """
    try:
        pref = locale.getpreferredencoding()
        'TEST'.encode(pref)
    except Exception:
        pref = 'UTF-8'

    return pref


def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        encoding = get_filesystem_encoding()
        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        path_basename = lambda f: os.path.basename(fn).decode(encoding)
        # the same for os.path.dirname
        path_dirname = lambda f: os.path.dirname(fn).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise


if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val=None):
        """ Find the xpath xpath[@key=val] """
        assert re.match(r'^[a-zA-Z_-]+$', key)
        expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val))
        return node.find(expr)
else:
    def find_xpath_attr(node, xpath, key, val=None):
        for f in node.findall(compat_xpath(xpath)):
            if key not in f.attrib:
                continue
            if val is None or f.attrib.get(key) == val:
                return f
        return None

# On python2.6 the xml.etree.ElementTree.Element methods don't support
# the namespace parameter


def xpath_with_ns(path, ns_map):
    components = [c.split(':') for c in path.split('/')]
    replaced = []
    for c in components:
        if len(c) == 1:
            replaced.append(c[0])
        else:
            ns, tag = c
            replaced.append('{%s}%s' % (ns_map[ns], tag))
    return '/'.join(replaced)


def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    def _find_xpath(xpath):
        return node.find(compat_xpath(xpath))

    if isinstance(xpath, (str, compat_str)):
        n = _find_xpath(xpath)
    else:
        for xp in xpath:
            n = _find_xpath(xp)
            if n is not None:
                break

    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element %s' % name)
        else:
            return None
    return n


def xpath_text(node, xpath, name=None, fatal=False, default=NO_DEFAULT):
    n = xpath_element(node, xpath, name, fatal=fatal, default=default)
    if n is None or n == default:
        return n
    if n.text is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = xpath if name is None else name
            raise ExtractorError('Could not find XML element\'s text %s' % name)
        else:
            return None
    return n.text


def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
    n = find_xpath_attr(node, xpath, key)
    if n is None:
        if default is not NO_DEFAULT:
            return default
        elif fatal:
            name = '%s[@%s]' % (xpath, key) if name is None else name
            raise ExtractorError('Could not find XML attribute %s' % name)
        else:
            return None
    return n.attrib[key]
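
# Illustrative usage of the xpath helpers above (example only, not part of the
# original module; the element names are made up):
#   doc = compat_etree_fromstring('<root><item id="a">text</item></root>')
#   xpath_text(doc, './item')                    # -> 'text'
#   xpath_attr(doc, './item', 'id')              # -> 'a'
#   xpath_text(doc, './missing', default=None)   # -> None (no exception raised)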


def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    return get_element_by_attribute('id', id, html)


def get_element_by_class(class_name, html):
    return get_element_by_attribute(
        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
        html, escape_value=False)


def get_element_by_attribute(attribute, value, html, escape_value=True):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    value = re.escape(value) if escape_value else value

    m = re.search(r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), value), html)

    if not m:
        return None
    res = m.group('content')

    if res.startswith('"') or res.startswith("'"):
        res = res[1:-1]

    return unescapeHTML(res)
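
# Example (illustrative only, not part of the original module):
#   get_element_by_id('main', '<div id="main">Hello &amp; bye</div>')
#   # -> 'Hello & bye'  (the tag content is returned with HTML entities decoded)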


class HTMLAttributeParser(compat_HTMLParser):
    """Trivial HTML parser to gather the attributes for a single element"""
    def __init__(self):
        self.attrs = {}
        compat_HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        self.attrs = dict(attrs)


def extract_attributes(html_element):
    """Given a string for an HTML element such as
    <el
         a="foo" B="bar" c="&98;az" d=boz
         empty= noval entity="&amp;"
         sq='"' dq="'"
    >
    Decode and return a dictionary of attributes.
    {
        'a': 'foo', 'b': 'bar', c: 'baz', d: 'boz',
        'empty': '', 'noval': None, 'entity': '&',
        'sq': '"', 'dq': '\''
    }.
    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
    """
    parser = HTMLAttributeParser()
    parser.feed(html_element)
    parser.close()
    return parser.attrs
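
# Example (illustrative only, not part of the original module):
#   extract_attributes('<a href="/watch?v=abc" class="yt-link" disabled>')
#   # -> {'href': '/watch?v=abc', 'class': 'yt-link', 'disabled': None}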


def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newline vs <br />
    html = html.replace('\n', ' ')
    html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
    html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
    # Strip html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)
    return html.strip()


def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it tries to change
    the filename slightly, step by step, until it's either able to open it
    or it fails and raises a final exception, like the standard open()
    function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        else:
            # An exception here should be caught in the caller
            stream = open(encodeFilename(alt_filename), open_mode)
            return (stream, alt_filename)


def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    timestamp = None
    timetuple = email.utils.parsedate_tz(timestr)
    if timetuple is not None:
        timestamp = email.utils.mktime_tz(timetuple)
    return timestamp


def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        if restricted and char in ACCENT_CHARS:
            return ACCENT_CHARS[char]
        if char == '?' or ord(char) < 32 or ord(char) == 127:
            return ''
        elif char == '"':
            return '' if restricted else '\''
        elif char == ':':
            return '_-' if restricted else ' -'
        elif char in '\\/|*<>':
            return '_'
        if restricted and (char in '!&\'()[]{}$;`^,#' or char.isspace()):
            return '_'
        if restricted and ord(char) > 127:
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(map(replace_insane, s))
    if not is_id:
        while '__' in result:
            result = result.replace('__', '_')
        result = result.strip('_')
        # Common case of "Foreign band name - English song title"
        if restricted and result.startswith('-_'):
            result = result[2:]
        if result.startswith('-'):
            result = '_' + result[len('-'):]
        result = result.lstrip('.')
        if not result:
            result = '_'
    return result
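
# Example (illustrative only, not part of the original module):
#   sanitize_filename('A/B: C?')                   # -> 'A_B - C'
#   sanitize_filename('A/B: C?', restricted=True)  # -> 'A_B_-_C'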


def sanitize_path(s):
    """Sanitizes and normalizes path on Windows"""
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        norm_path.pop(0)
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub('(?:[/<>:"\\|\\\\?\\*]|[\s.]$)', '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)


# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of
# unwanted failures due to missing protocol
def sanitize_url(url):
    return 'http:%s' % url if url.startswith('//') else url


def sanitized_Request(url, *args, **kwargs):
    return compat_urllib_request.Request(sanitize_url(url), *args, **kwargs)


def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    res = []
    for el in iterable:
        if el not in res:
            res.append(el)
    return res


def _htmlentity_transform(entity_with_semicolon):
    """Transforms an HTML entity to a character."""
    entity = entity_with_semicolon[:-1]

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # TODO: HTML5 allows entities without a semicolon. For example,
    # '&Eacuteric' should be decoded as 'Éric'.
    if entity_with_semicolon in compat_html_entities_html5:
        return compat_html_entities_html5[entity_with_semicolon]

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # See https://github.com/rg3/youtube-dl/issues/7518
        try:
            return compat_chr(int(numstr, base))
        except ValueError:
            pass

    # Unknown entity in name, return its literal representation
    return '&%s;' % entity


def unescapeHTML(s):
    if s is None:
        return None
    assert type(s) == compat_str

    return re.sub(
        r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
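
# Example (illustrative only, not part of the original module):
#   unescapeHTML('Tom &amp; Jerry &#233;pisode')  # -> 'Tom & Jerry épisode'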


def get_subprocess_encoding():
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    if encoding is None:
        encoding = 'utf-8'
    return encoding


def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible
    if sys.platform.startswith('java'):
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')


def decodeFilename(b, for_subprocess=False):

    if sys.version_info >= (3, 0):
        return b

    if not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')


def encodeArgument(s):
    if not isinstance(s, compat_str):
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        s = s.decode('ascii')
    return encodeFilename(s, True)


def decodeArgument(b):
    return decodeFilename(b, True)


def decodeOption(optval):
    if optval is None:
        return optval
    if isinstance(optval, bytes):
        optval = optval.decode(preferredencoding())

    assert isinstance(optval, compat_str)
    return optval


def formatSeconds(secs):
    if secs > 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs > 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs


def make_HTTPS_handler(params, **kwargs):
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)


def bug_reports_message():
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    msg = '; please report this issue on https://yt-dl.org/bug .'
    msg += ' Make sure you are using the latest version; %s.' % update_cmd
    msg += ' Be sure to call youtube-dl with the --verbose flag and include its complete output.'
    return msg


class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        if sys.exc_info()[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))


class UnsupportedError(ExtractorError):
    def __init__(self, url):
        super(UnsupportedError, self).__init__(
            'Unsupported URL: %s' % url, expected=True)
        self.url = url


class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
    pass


class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        super(DownloadError, self).__init__(msg)
        self.exc_info = exc_info


class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
    pass


class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        self.msg = msg


class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
    pass


class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
    pass


class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """

    def __init__(self, downloaded, expected):
        # Both in bytes
        self.downloaded = downloaded
        self.expected = expected


class XAttrMetadataError(Exception):
    def __init__(self, code=None, msg='Unknown error'):
        super(XAttrMetadataError, self).__init__(msg)
        self.code = code
        self.msg = msg

        # Parsing code and msg
        if (self.code in (errno.ENOSPC, errno.EDQUOT) or
                'No space left' in self.msg or 'Disk quota excedded' in self.msg):
            self.reason = 'NO_SPACE'
        elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
            self.reason = 'VALUE_TOO_LONG'
        else:
            self.reason = 'NOT_SUPPORTED'


class XAttrUnavailableError(Exception):
    pass


def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
    # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting
    # expected HTTP responses to meet HTTP/1.0 or later (see also
    # https://github.com/rg3/youtube-dl/issues/6727)
    if sys.version_info < (3, 0):
        kwargs[b'strict'] = True
    hc = http_class(*args, **kwargs)
    source_address = ydl_handler._params.get('source_address')
    if source_address is not None:
        sa = (source_address, 0)
        if hasattr(hc, 'source_address'):  # Python 2.7+
            hc.source_address = sa
        else:  # Python 2.6
            def _hc_connect(self, *args, **kwargs):
                sock = compat_socket_create_connection(
                    (self.host, self.port), self.timeout, sa)
                if is_https:
                    self.sock = ssl.wrap_socket(
                        sock, self.key_file, self.cert_file,
                        ssl_version=ssl.PROTOCOL_TLSv1)
                else:
                    self.sock = sock
            hc.connect = functools.partial(_hc_connect, hc)

    return hc


def handle_youtubedl_headers(headers):
    filtered_headers = headers

    if 'Youtubedl-no-compression' in filtered_headers:
        filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding')
        del filtered_headers['Youtubedl-no-compression']

    return filtered_headers


class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-no-compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        conn_class = compat_http_client.HTTPConnection

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, False),
            req)

    @staticmethod
    def deflate(data):
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        # According to RFC 3986, URLs can not contain non-ASCII characters, however this is not
        # always respected by websites, some tend to give out URLs with non percent-encoded
        # non-ASCII characters (see telemb.py, ard.py [#3412])
        # urllib chokes on URLs with non-ASCII characters (see http://bugs.python.org/issue3991)
        # To work around aforementioned issue we will replace request's original URL with
        # percent-encoded one
        # Since redirects are also affected (e.g. http://www.southpark.de/alle-episoden/s18e09)
        # the code of this workaround has been moved here from YoutubeDL.urlopen()
        url = req.get_full_url()
        url_escaped = escape_url(url)

        # Substitute URL if any change after escaping
        if url != url_escaped:
            req = update_Request(req, url=url_escaped)

        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)

        req.headers = handle_youtubedl_headers(req.headers)

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
            del resp.headers['Content-encoding']
        # Percent-encode redirect URL of Location HTTP header to satisfy RFC 3986 (see
        # https://github.com/rg3/youtube-dl/issues/6457).
        if 300 <= resp.code < 400:
            location = resp.headers.get('Location')
            if location:
                # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
                if sys.version_info >= (3, 0):
                    location = location.encode('iso-8859-1').decode('utf-8')
                else:
                    location = location.decode('utf-8')
                location_escaped = escape_url(location)
                if location != location_escaped:
                    del resp.headers['Location']
                    if sys.version_info < (3, 0):
                        location_escaped = location_escaped.encode('utf-8')
                    resp.headers['Location'] = location_escaped
        return resp

    https_request = http_request
    https_response = http_response


def make_socks_conn_class(base_class, socks_proxy):
    assert issubclass(base_class, (
        compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection))

    url_components = compat_urlparse.urlparse(socks_proxy)
    if url_components.scheme.lower() == 'socks5':
        socks_type = ProxyType.SOCKS5
    elif url_components.scheme.lower() in ('socks', 'socks4'):
        socks_type = ProxyType.SOCKS4
    elif url_components.scheme.lower() == 'socks4a':
        socks_type = ProxyType.SOCKS4A

    def unquote_if_non_empty(s):
        if not s:
            return s
        return compat_urllib_parse_unquote_plus(s)

    proxy_args = (
        socks_type,
        url_components.hostname, url_components.port or 1080,
        True,  # Remote DNS
        unquote_if_non_empty(url_components.username),
        unquote_if_non_empty(url_components.password),
    )

    class SocksConnection(base_class):
        def connect(self):
            self.sock = sockssocket()
            self.sock.setproxy(*proxy_args)
            if type(self.timeout) in (int, float):
                self.sock.settimeout(self.timeout)
            self.sock.connect((self.host, self.port))

            if isinstance(self, compat_http_client.HTTPSConnection):
                if hasattr(self, '_context'):  # Python > 2.6
                    self.sock = self._context.wrap_socket(
                        self.sock, server_hostname=self.host)
                else:
                    self.sock = ssl.wrap_socket(self.sock)

    return SocksConnection


class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        kwargs = {}
        conn_class = self._https_conn_class

        if hasattr(self, '_context'):  # python > 2.6
            kwargs['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            kwargs['check_hostname'] = self._check_hostname

        socks_proxy = req.headers.get('Ytdl-socks-proxy')
        if socks_proxy:
            conn_class = make_socks_conn_class(conn_class, socks_proxy)
            del req.headers['Ytdl-socks-proxy']

        return self.do_open(functools.partial(
            _create_http_connection, self, conn_class, True),
            req, **kwargs)


class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor):
    def __init__(self, cookiejar=None):
        compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar)

    def http_response(self, request, response):
        # Python 2 will choke on next HTTP request in row if there are non-ASCII
        # characters in Set-Cookie HTTP header of last response (see
        # https://github.com/rg3/youtube-dl/issues/6769).
        # In order to at least prevent crashing we will percent encode Set-Cookie
        # header before HTTPCookieProcessor starts processing it.
        # if sys.version_info < (3, 0) and response.headers:
        #     for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'):
        #         set_cookie = response.headers.get(set_cookie_header)
        #         if set_cookie:
        #             set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ")
        #             if set_cookie != set_cookie_escaped:
        #                 del response.headers[set_cookie_header]
        #                 response.headers[set_cookie_header] = set_cookie_escaped
        return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response)

    https_request = compat_urllib_request.HTTPCookieProcessor.http_request
    https_response = http_response


def extract_timezone(date_str):
    m = re.search(
        r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
        date_str)
    if not m:
        timezone = datetime.timedelta()
    else:
        date_str = date_str[:-len(m.group('tz'))]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    return timezone, date_str


def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date """

    if date_str is None:
        return None

    date_str = re.sub(r'\.[0-9]+', '', date_str)

    if timezone is None:
        timezone, date_str = extract_timezone(date_str)

    try:
        date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
        return calendar.timegm(dt.timetuple())
    except ValueError:
        pass
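
# Example (illustrative only, not part of the original module):
#   parse_iso8601('2014-12-01T14:00:00+01:00')
#   # -> UNIX timestamp for 2014-12-01 13:00:00 UTC (the +01:00 offset is subtracted)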


def date_formats(day_first=True):
    return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST


def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD"""

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
    _, date_str = extract_timezone(date_str)

    for expression in date_formats(day_first):
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            pass
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    if upload_date is not None:
        return compat_str(upload_date)
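
# Example (illustrative only, not part of the original module):
#   unified_strdate('December 25, 2010')  # -> '20101225'
#   unified_strdate('25/12/2010')         # -> '20101225' (day_first=True by default)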


def unified_timestamp(date_str, day_first=True):
    if date_str is None:
        return None

    date_str = date_str.replace(',', ' ')

    pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0
    timezone, date_str = extract_timezone(date_str)

    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    for expression in date_formats(day_first):
        try:
            dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
            return calendar.timegm(dt.timetuple())
        except ValueError:
            pass
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        return calendar.timegm(timetuple) + pm_delta * 3600


def determine_ext(url, default_ext='unknown_video'):
    if url is None:
        return default_ext
    guess = url.partition('?')[0].rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    # Try extract ext from URLs like http://example.com/foo/bar.mp4/?download
    elif guess.rstrip('/') in KNOWN_EXTENSIONS:
        return guess.rstrip('/')
    else:
        return default_ext
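
# Example (illustrative only, not part of the original module):
#   determine_ext('http://example.com/foo/bar.mp4?download=1')  # -> 'mp4'
#   determine_ext('http://example.com/page')                    # -> 'unknown_video'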


def subtitles_filename(filename, sub_lang, sub_format):
    return filename.rsplit('.', 1)[0] + '.' + sub_lang + '.' + sub_format


def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, '%Y%m%d').date()


def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    match = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if match is not None:
        return '-'.join(match.groups())
    else:
        return date_str


class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date"""
        if start is not None:
            self.start = date_from_str(start)
        else:
            self.start = datetime.datetime.min.date()
        if end is not None:
            self.end = date_from_str(end)
        else:
            self.end = datetime.datetime.max.date()
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
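
# Example (illustrative only, not part of the original module):
#   date_from_str('now-1week')                        # -> date one week before today
#   '20170115' in DateRange('20170101', '20170131')   # -> True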


def platform_name():
    """ Returns the platform name as a compat_str """
    res = platform.platform()
    if isinstance(res, bytes):
        res = res.decode(preferredencoding())

    assert isinstance(res, compat_str)
    return res


def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b'GetStdHandle', ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b'WriteConsoleW', ctypes.windll.kernel32))
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b'GetFileType', ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b'GetConsoleMode', ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        count = min(next_nonbmp_pos(s), 1024)

        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            s = s[written.value:]
    return True


def write_string(s, out=None, encoding=None):
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'):
        if _windows_write_string(s, out):
            return

    if ('b' in getattr(out, 'mode', '') or
            sys.version_info[0] < 3):  # Python 2 lies about mode of sys.stderr
        byt = s.encode(encoding or preferredencoding(), 'ignore')
        out.write(byt)
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        byt = s.encode(enc, 'ignore')
        out.buffer.write(byt)
    else:
        out.write(s)
    out.flush()


def bytes_to_intlist(bs):
    if not bs:
        return []
    if isinstance(bs[0], int):  # Python 3
        return list(bs)
    else:
        return [ord(c) for c in bs]


def intlist_to_bytes(xs):
    if not xs:
        return b''
    return compat_struct_pack('%dB' % len(xs), *xs)


# Cross-platform file locking
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    # Some platforms, such as Jython, are missing fcntl
    try:
        import fcntl

        def _lock_file(f, exclusive):
            fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

        def _unlock_file(f):
            fcntl.flock(f, fcntl.LOCK_UN)
    except ImportError:
        UNSUPPORTED_MSG = 'file locking is not supported on this platform'

        def _lock_file(f, exclusive):
            raise IOError(UNSUPPORTED_MSG)

        def _unlock_file(f):
            raise IOError(UNSUPPORTED_MSG)


class locked_file(object):
    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, exclusive)
        except IOError:
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)


def get_filesystem_encoding():
    encoding = sys.getfilesystemencoding()
    return encoding if encoding is not None else 'utf-8'


def shell_quote(args):
    quoted_args = []
    encoding = get_filesystem_encoding()
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(encoding)
        quoted_args.append(pipes.quote(a))
    return ' '.join(quoted_args)


def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    url, idata = unsmuggle_url(url, {})
    data.update(idata)
    sdata = compat_urllib_parse_urlencode(
        {'__youtubedl_smuggle': json.dumps(data)})
    return url + '#' + sdata


def unsmuggle_url(smug_url, default=None):
    if '#__youtubedl_smuggle' not in smug_url:
        return smug_url, default
    url, _, sdata = smug_url.rpartition('#')
    jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
    data = json.loads(jsond)
    return url, data
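
# Example (illustrative only, not part of the original module; URL is made up):
#   smuggled = smuggle_url('http://example.com/video', {'referer': 'http://example.com/'})
#   unsmuggle_url(smuggled)
#   # -> ('http://example.com/video', {'referer': 'http://example.com/'})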


def format_bytes(bytes):
    if bytes is None:
        return 'N/A'
    if type(bytes) is str:
        bytes = float(bytes)
    if bytes == 0.0:
        exponent = 0
    else:
        exponent = int(math.log(bytes, 1024.0))
    suffix = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB'][exponent]
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffix)
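
# Example (illustrative only, not part of the original module):
#   format_bytes(1048576)  # -> '1.00MiB'
#   format_bytes(None)     # -> 'N/A'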


def lookup_unit_table(unit_table, s):
    units_re = '|'.join(re.escape(u) for u in unit_table)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s)
    if not m:
        return None
    num_str = m.group('num').replace(',', '.')
    mult = unit_table[m.group('unit')]
    return int(float(num_str) * mult)


def parse_filesize(s):
    if s is None:
        return None

    # The lower-case forms are of course incorrect and unofficial,
    # but we support those too
    _UNIT_TABLE = {
        'B': 1,
        'b': 1,
        'bytes': 1,
        'KiB': 1024,
        'KB': 1000,
        'kB': 1024,
        'Kb': 1000,
        'kb': 1000,
        'kilobytes': 1000,
        'kibibytes': 1024,
        'MiB': 1024 ** 2,
        'MB': 1000 ** 2,
        'mB': 1024 ** 2,
        'Mb': 1000 ** 2,
        'mb': 1000 ** 2,
        'megabytes': 1000 ** 2,
        'mebibytes': 1024 ** 2,
        'GiB': 1024 ** 3,
        'GB': 1000 ** 3,
        'gB': 1024 ** 3,
        'Gb': 1000 ** 3,
        'gb': 1000 ** 3,
        'gigabytes': 1000 ** 3,
        'gibibytes': 1024 ** 3,
        'TiB': 1024 ** 4,
        'TB': 1000 ** 4,
        'tB': 1024 ** 4,
        'Tb': 1000 ** 4,
        'tb': 1000 ** 4,
        'terabytes': 1000 ** 4,
        'tebibytes': 1024 ** 4,
        'PiB': 1024 ** 5,
        'PB': 1000 ** 5,
        'pB': 1024 ** 5,
        'Pb': 1000 ** 5,
        'pb': 1000 ** 5,
        'petabytes': 1000 ** 5,
        'pebibytes': 1024 ** 5,
        'EiB': 1024 ** 6,
        'EB': 1000 ** 6,
        'eB': 1024 ** 6,
        'Eb': 1000 ** 6,
        'eb': 1000 ** 6,
        'exabytes': 1000 ** 6,
        'exbibytes': 1024 ** 6,
        'ZiB': 1024 ** 7,
        'ZB': 1000 ** 7,
        'zB': 1024 ** 7,
        'Zb': 1000 ** 7,
        'zb': 1000 ** 7,
        'zettabytes': 1000 ** 7,
        'zebibytes': 1024 ** 7,
        'YiB': 1024 ** 8,
        'YB': 1000 ** 8,
        'yB': 1024 ** 8,
        'Yb': 1000 ** 8,
        'yb': 1000 ** 8,
        'yottabytes': 1000 ** 8,
        'yobibytes': 1024 ** 8,
    }

    return lookup_unit_table(_UNIT_TABLE, s)


def parse_count(s):
    if s is None:
        return None

    s = s.strip()

    if re.match(r'^[\d,.]+$', s):
        return str_to_int(s)

    _UNIT_TABLE = {
        'k': 1000,
        'K': 1000,
        'm': 1000 ** 2,
        'M': 1000 ** 2,
        'kk': 1000 ** 2,
        'KK': 1000 ** 2,
    }

    return lookup_unit_table(_UNIT_TABLE, s)
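
# Example (illustrative only, not part of the original module):
#   parse_filesize('5 MiB')   # -> 5242880
#   parse_count('17k')        # -> 17000
#   parse_count('1,000')      # -> 1000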


def month_by_name(name, lang='en'):
    """ Return the number of a month by (locale-independently) English name """

    month_names = MONTH_NAMES.get(lang, MONTH_NAMES['en'])

    try:
        return month_names.index(name) + 1
    except ValueError:
        return None


def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """

    try:
        return [s[:3] for s in ENGLISH_MONTH_NAMES].index(abbrev) + 1
    except ValueError:
        return None
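
# Example (illustrative only, not part of the original module):
#   month_by_name('March')            # -> 3
#   month_by_name('mars', lang='fr')  # -> 3
#   month_by_abbreviation('Mar')      # -> 3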
18258362
JMF
1646
1647
5aafe895 1648def fix_xml_ampersands(xml_str):
18258362 1649 """Replace all the '&' by '&amp;' in XML"""
5aafe895
PH
1650 return re.sub(
1651 r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)',
28e614de 1652 '&amp;',
5aafe895 1653 xml_str)
e3946f98
PH
1654
1655
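# Illustrative usage sketch (not part of the upstream file): only bare '&'
# characters are escaped; existing entities are left untouched.
def _fix_xml_ampersands_example():  # hypothetical helper, never called
    assert fix_xml_ampersands('a & b &amp; c &lt;d&gt;') == 'a &amp; b &amp; c &lt;d&gt;'
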
1656def setproctitle(title):
8bf48f23 1657 assert isinstance(title, compat_str)
c1c05c67
YCH
1658
1659 # ctypes in Jython is not complete
1660 # http://bugs.jython.org/issue2148
1661 if sys.platform.startswith('java'):
1662 return
1663
e3946f98 1664 try:
611c1dd9 1665 libc = ctypes.cdll.LoadLibrary('libc.so.6')
e3946f98
PH
1666 except OSError:
1667 return
6eefe533
PH
1668 title_bytes = title.encode('utf-8')
1669 buf = ctypes.create_string_buffer(len(title_bytes))
1670 buf.value = title_bytes
e3946f98 1671 try:
6eefe533 1672 libc.prctl(15, buf, 0, 0, 0)
e3946f98
PH
1673 except AttributeError:
1674 return # Strange libc, just skip this
d7dda168
PH
1675
1676
1677def remove_start(s, start):
46bc9b7d 1678 return s[len(start):] if s is not None and s.startswith(start) else s
29eb5174
PH
1679
1680
2b9faf55 1681def remove_end(s, end):
46bc9b7d 1682 return s[:-len(end)] if s is not None and s.endswith(end) else s
2b9faf55
PH
1683
1684
31b2051e
S
1685def remove_quotes(s):
1686 if s is None or len(s) < 2:
1687 return s
1688 for quote in ('"', "'", ):
1689 if s[0] == quote and s[-1] == quote:
1690 return s[1:-1]
1691 return s
1692
1693
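# Illustrative usage sketch (not part of the upstream file): these helpers are
# no-ops when the prefix/suffix/quotes are absent or the input is None.
def _remove_helpers_example():  # hypothetical helper, never called
    assert remove_start('www.example.com', 'www.') == 'example.com'
    assert remove_end('video.mp4', '.mp4') == 'video'
    assert remove_quotes('"quoted"') == 'quoted'
    assert remove_start(None, 'www.') is None
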
29eb5174 1694def url_basename(url):
9b8aaeed 1695 path = compat_urlparse.urlparse(url).path
28e614de 1696 return path.strip('/').split('/')[-1]
aa94a6d3
PH
1697
1698
02dc0a36
S
1699def base_url(url):
1700 return re.match(r'https?://[^?#&]+/', url).group()
1701
1702
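# Illustrative usage sketch (not part of the upstream file): url_basename()
# keeps only the last path component, base_url() everything up to the last '/'.
def _url_helpers_example():  # hypothetical helper, never called
    assert url_basename('http://example.com/a/b/c.mp4?x=1') == 'c.mp4'
    assert base_url('http://example.com/a/b/c.mp4?x=1') == 'http://example.com/a/b/'
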
aa94a6d3
PH
1703class HEADRequest(compat_urllib_request.Request):
1704 def get_method(self):
611c1dd9 1705 return 'HEAD'
7217e148
PH
1706
1707
95cf60e8
S
1708class PUTRequest(compat_urllib_request.Request):
1709 def get_method(self):
1710 return 'PUT'
1711
1712
9732d77e 1713def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
28746fbd
PH
1714 if get_attr:
1715 if v is not None:
1716 v = getattr(v, get_attr, None)
9572013d
PH
1717 if v == '':
1718 v = None
1812afb7
S
1719 if v is None:
1720 return default
1721 try:
1722 return int(v) * invscale // scale
1723 except ValueError:
af98f8ff 1724 return default
9732d77e 1725
9572013d 1726
40a90862
JMF
1727def str_or_none(v, default=None):
1728 return default if v is None else compat_str(v)
1729
9732d77e
PH
1730
1731def str_to_int(int_str):
48d4681e 1732 """ A more relaxed version of int_or_none """
9732d77e
PH
1733 if int_str is None:
1734 return None
28e614de 1735 int_str = re.sub(r'[,\.\+]', '', int_str)
9732d77e 1736 return int(int_str)
608d11f5
PH
1737
1738
9732d77e 1739def float_or_none(v, scale=1, invscale=1, default=None):
caf80631
S
1740 if v is None:
1741 return default
1742 try:
1743 return float(v) * invscale / scale
1744 except ValueError:
1745 return default
43f775e4
PH
1746
1747
b72b4431
S
1748def strip_or_none(v):
1749 return None if v is None else v.strip()
1750
1751
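# Illustrative usage sketch (not part of the upstream file): the *_or_none
# helpers coerce loosely-typed metadata without raising on bad input.
def _numeric_helpers_example():  # hypothetical helper, never called
    assert int_or_none('42') == 42
    assert int_or_none('', default=0) == 0
    assert int_or_none('n/a') is None
    assert str_to_int('1,234,567') == 1234567
    assert float_or_none('1.5', invscale=1000) == 1500.0
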
608d11f5 1752def parse_duration(s):
8f9312c3 1753 if not isinstance(s, compat_basestring):
608d11f5
PH
1754 return None
1755
ca7b3246
S
1756 s = s.strip()
1757
acaff495 1758 days, hours, mins, secs, ms = [None] * 5
1759 m = re.match(r'(?:(?:(?:(?P<days>[0-9]+):)?(?P<hours>[0-9]+):)?(?P<mins>[0-9]+):)?(?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?$', s)
1760 if m:
1761 days, hours, mins, secs, ms = m.groups()
1762 else:
1763 m = re.match(
1764 r'''(?ix)(?:P?T)?
8f4b58d7 1765 (?:
acaff495 1766 (?P<days>[0-9]+)\s*d(?:ays?)?\s*
8f4b58d7 1767 )?
acaff495 1768 (?:
1769 (?P<hours>[0-9]+)\s*h(?:ours?)?\s*
1770 )?
1771 (?:
1772 (?P<mins>[0-9]+)\s*m(?:in(?:ute)?s?)?\s*
1773 )?
1774 (?:
1775 (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s*
1776 )?$''', s)
1777 if m:
1778 days, hours, mins, secs, ms = m.groups()
1779 else:
1780 m = re.match(r'(?i)(?:(?P<hours>[0-9.]+)\s*(?:hours?)|(?P<mins>[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)$', s)
1781 if m:
1782 hours, mins = m.groups()
1783 else:
1784 return None
1785
1786 duration = 0
1787 if secs:
1788 duration += float(secs)
1789 if mins:
1790 duration += float(mins) * 60
1791 if hours:
1792 duration += float(hours) * 60 * 60
1793 if days:
1794 duration += float(days) * 24 * 60 * 60
1795 if ms:
1796 duration += float(ms)
1797 return duration
91d7d0b3
JMF
1798
1799
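# Illustrative usage sketch (not part of the upstream file): parse_duration()
# accepts clock-style, ISO-8601-like and spelled-out forms, returning seconds.
def _parse_duration_example():  # hypothetical helper, never called
    assert parse_duration('1:02:03.5') == 3723.5
    assert parse_duration('PT2H30M') == 9000
    assert parse_duration('2h 30m') == 9000
    assert parse_duration('bogus') is None
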
e65e4c88 1800def prepend_extension(filename, ext, expected_real_ext=None):
5f6a1245 1801 name, real_ext = os.path.splitext(filename)
e65e4c88
S
1802 return (
1803 '{0}.{1}{2}'.format(name, ext, real_ext)
1804 if not expected_real_ext or real_ext[1:] == expected_real_ext
1805 else '{0}.{1}'.format(filename, ext))
d70ad093
PH
1806
1807
b3ed15b7
S
1808def replace_extension(filename, ext, expected_real_ext=None):
1809 name, real_ext = os.path.splitext(filename)
1810 return '{0}.{1}'.format(
1811 name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename,
1812 ext)
1813
1814
d70ad093
PH
1815def check_executable(exe, args=[]):
1816 """ Checks if the given binary is installed somewhere in PATH, and returns its name.
1817 args can be a list of arguments for a short output (like -version) """
1818 try:
1819 subprocess.Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
1820 except OSError:
1821 return False
1822 return exe
b7ab0590
PH
1823
1824
95807118 1825def get_exe_version(exe, args=['--version'],
cae97f65 1826 version_re=None, unrecognized='present'):
95807118
PH
1827 """ Returns the version of the specified executable,
1828 or False if the executable is not present """
1829 try:
b64d04c1
YCH
1830 # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers
1831 # SIGTTOU if youtube-dl is run in the background.
1832 # See https://github.com/rg3/youtube-dl/issues/955#issuecomment-209789656
cae97f65 1833 out, _ = subprocess.Popen(
54116803 1834 [encodeArgument(exe)] + args,
00ca7552 1835 stdin=subprocess.PIPE,
95807118
PH
1836 stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate()
1837 except OSError:
1838 return False
cae97f65
PH
1839 if isinstance(out, bytes): # Python 2.x
1840 out = out.decode('ascii', 'ignore')
1841 return detect_exe_version(out, version_re, unrecognized)
1842
1843
1844def detect_exe_version(output, version_re=None, unrecognized='present'):
1845 assert isinstance(output, compat_str)
1846 if version_re is None:
1847 version_re = r'version\s+([-0-9._a-zA-Z]+)'
1848 m = re.search(version_re, output)
95807118
PH
1849 if m:
1850 return m.group(1)
1851 else:
1852 return unrecognized
1853
1854
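# Illustrative usage sketch (not part of the upstream file): the default
# version_re picks the token following the word 'version'.
def _detect_exe_version_example():  # hypothetical helper, never called
    assert detect_exe_version('ffmpeg version 3.4.1 Copyright (c) 2000-2017') == '3.4.1'
    assert detect_exe_version('no recognizable output') == 'present'
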
b7ab0590 1855class PagedList(object):
dd26ced1
PH
1856 def __len__(self):
1857 # This is only useful for tests
1858 return len(self.getslice())
1859
9c44d242
PH
1860
1861class OnDemandPagedList(PagedList):
b95dc034 1862 def __init__(self, pagefunc, pagesize, use_cache=False):
9c44d242
PH
1863 self._pagefunc = pagefunc
1864 self._pagesize = pagesize
b95dc034
YCH
1865 self._use_cache = use_cache
1866 if use_cache:
1867 self._cache = {}
9c44d242 1868
b7ab0590
PH
1869 def getslice(self, start=0, end=None):
1870 res = []
1871 for pagenum in itertools.count(start // self._pagesize):
1872 firstid = pagenum * self._pagesize
1873 nextfirstid = pagenum * self._pagesize + self._pagesize
1874 if start >= nextfirstid:
1875 continue
1876
b95dc034
YCH
1877 page_results = None
1878 if self._use_cache:
1879 page_results = self._cache.get(pagenum)
1880 if page_results is None:
1881 page_results = list(self._pagefunc(pagenum))
1882 if self._use_cache:
1883 self._cache[pagenum] = page_results
b7ab0590
PH
1884
1885 startv = (
1886 start % self._pagesize
1887 if firstid <= start < nextfirstid
1888 else 0)
1889
1890 endv = (
1891 ((end - 1) % self._pagesize) + 1
1892 if (end is not None and firstid <= end <= nextfirstid)
1893 else None)
1894
1895 if startv != 0 or endv is not None:
1896 page_results = page_results[startv:endv]
1897 res.extend(page_results)
1898
1899 # A little optimization - if the current page is not "full", i.e. does
1900 # not contain page_size videos, then we can assume that this page
1901 # is the last one - there are no more ids on further pages -
1902 # so there is no need to query again.
1903 if len(page_results) + startv < self._pagesize:
1904 break
1905
1906 # If we got the whole page, but the next page is not interesting,
1907 # break out early as well
1908 if end == nextfirstid:
1909 break
1910 return res
81c2f20b
PH
1911
1912
9c44d242
PH
1913class InAdvancePagedList(PagedList):
1914 def __init__(self, pagefunc, pagecount, pagesize):
1915 self._pagefunc = pagefunc
1916 self._pagecount = pagecount
1917 self._pagesize = pagesize
1918
1919 def getslice(self, start=0, end=None):
1920 res = []
1921 start_page = start // self._pagesize
1922 end_page = (
1923 self._pagecount if end is None else (end // self._pagesize + 1))
1924 skip_elems = start - start_page * self._pagesize
1925 only_more = None if end is None else end - start
1926 for pagenum in range(start_page, end_page):
1927 page = list(self._pagefunc(pagenum))
1928 if skip_elems:
1929 page = page[skip_elems:]
1930 skip_elems = None
1931 if only_more is not None:
1932 if len(page) < only_more:
1933 only_more -= len(page)
1934 else:
1935 page = page[:only_more]
1936 res.extend(page)
1937 break
1938 res.extend(page)
1939 return res
1940
1941
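# Illustrative usage sketch (not part of the upstream file): pages are fetched
# lazily and sliced so that only the requested index range is materialized.
def _paged_list_example():  # hypothetical helper, never called
    pages = OnDemandPagedList(lambda n: range(n * 3, n * 3 + 3), 3)
    assert pages.getslice(2, 7) == [2, 3, 4, 5, 6]
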
81c2f20b 1942def uppercase_escape(s):
676eb3f2 1943 unicode_escape = codecs.getdecoder('unicode_escape')
81c2f20b 1944 return re.sub(
a612753d 1945 r'\\U[0-9a-fA-F]{8}',
676eb3f2
PH
1946 lambda m: unicode_escape(m.group(0))[0],
1947 s)
0fe2ff78
YCH
1948
1949
1950def lowercase_escape(s):
1951 unicode_escape = codecs.getdecoder('unicode_escape')
1952 return re.sub(
1953 r'\\u[0-9a-fA-F]{4}',
1954 lambda m: unicode_escape(m.group(0))[0],
1955 s)
b53466e1 1956
d05cfe06
S
1957
1958def escape_rfc3986(s):
1959 """Escape non-ASCII characters as suggested by RFC 3986"""
8f9312c3 1960 if sys.version_info < (3, 0) and isinstance(s, compat_str):
d05cfe06 1961 s = s.encode('utf-8')
ecc0c5ee 1962 return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1963
1964
1965def escape_url(url):
1966 """Escape URL as suggested by RFC 3986"""
1967 url_parsed = compat_urllib_parse_urlparse(url)
1968 return url_parsed._replace(
efbed08d 1969 netloc=url_parsed.netloc.encode('idna').decode('ascii'),
d05cfe06
S
1970 path=escape_rfc3986(url_parsed.path),
1971 params=escape_rfc3986(url_parsed.params),
1972 query=escape_rfc3986(url_parsed.query),
1973 fragment=escape_rfc3986(url_parsed.fragment)
1974 ).geturl()
1975
62e609ab
PH
1976
1977def read_batch_urls(batch_fd):
1978 def fixup(url):
1979 if not isinstance(url, compat_str):
1980 url = url.decode('utf-8', 'replace')
28e614de 1981 BOM_UTF8 = '\xef\xbb\xbf'
62e609ab
PH
1982 if url.startswith(BOM_UTF8):
1983 url = url[len(BOM_UTF8):]
1984 url = url.strip()
1985 if url.startswith(('#', ';', ']')):
1986 return False
1987 return url
1988
1989 with contextlib.closing(batch_fd) as fd:
1990 return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1991
1992
1993def urlencode_postdata(*args, **kargs):
15707c7e 1994 return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii')
bcf89ce6
PH
1995
1996
38f9ef31 1997def update_url_query(url, query):
cacd9966
YCH
1998 if not query:
1999 return url
38f9ef31 2000 parsed_url = compat_urlparse.urlparse(url)
2001 qs = compat_parse_qs(parsed_url.query)
2002 qs.update(query)
2003 return compat_urlparse.urlunparse(parsed_url._replace(
15707c7e 2004 query=compat_urllib_parse_urlencode(qs, True)))
16392824 2005
8e60dc75 2006
ed0291d1
S
2007def update_Request(req, url=None, data=None, headers={}, query={}):
2008 req_headers = req.headers.copy()
2009 req_headers.update(headers)
2010 req_data = data or req.data
2011 req_url = update_url_query(url or req.get_full_url(), query)
95cf60e8
S
2012 req_get_method = req.get_method()
2013 if req_get_method == 'HEAD':
2014 req_type = HEADRequest
2015 elif req_get_method == 'PUT':
2016 req_type = PUTRequest
2017 else:
2018 req_type = compat_urllib_request.Request
ed0291d1
S
2019 new_req = req_type(
2020 req_url, data=req_data, headers=req_headers,
2021 origin_req_host=req.origin_req_host, unverifiable=req.unverifiable)
2022 if hasattr(req, 'timeout'):
2023 new_req.timeout = req.timeout
2024 return new_req
2025
2026
86296ad2 2027def dict_get(d, key_or_keys, default=None, skip_false_values=True):
cbecc9b9
S
2028 if isinstance(key_or_keys, (list, tuple)):
2029 for key in key_or_keys:
86296ad2
S
2030 if key not in d or d[key] is None or skip_false_values and not d[key]:
2031 continue
2032 return d[key]
cbecc9b9
S
2033 return default
2034 return d.get(key_or_keys, default)
2035
2036
329ca3be
S
2037def try_get(src, getter, expected_type=None):
2038 try:
2039 v = getter(src)
2040 except (AttributeError, KeyError, TypeError, IndexError):
2041 pass
2042 else:
2043 if expected_type is None or isinstance(v, expected_type):
2044 return v
2045
2046
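# Illustrative usage sketch (not part of the upstream file): both helpers make
# digging through sparsely populated metadata dictionaries safe.
def _dict_helpers_example():  # hypothetical helper, never called
    meta = {'title': '', 'alt_title': 'Fallback', 'tags': [1, 2]}
    assert dict_get(meta, ('title', 'alt_title')) == 'Fallback'
    assert try_get(meta, lambda x: x['tags'][5], int) is None
    assert try_get(meta, lambda x: x['tags'][1], int) == 2
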
8e60dc75
S
2047def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
2048 return string if isinstance(string, compat_str) else compat_str(string, encoding, errors)
2049
16392824 2050
a1a530b0
PH
2051US_RATINGS = {
2052 'G': 0,
2053 'PG': 10,
2054 'PG-13': 13,
2055 'R': 16,
2056 'NC': 18,
2057}
fac55558
PH
2058
2059
a8795327
S
2060TV_PARENTAL_GUIDELINES = {
2061 'TV-Y': 0,
2062 'TV-Y7': 7,
2063 'TV-G': 0,
2064 'TV-PG': 0,
2065 'TV-14': 14,
2066 'TV-MA': 17,
2067}
2068
2069
146c80e2 2070def parse_age_limit(s):
a8795327
S
2071 if type(s) == int:
2072 return s if 0 <= s <= 21 else None
2073 if not isinstance(s, compat_basestring):
d838b1bd 2074 return None
146c80e2 2075 m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
a8795327
S
2076 if m:
2077 return int(m.group('age'))
2078 if s in US_RATINGS:
2079 return US_RATINGS[s]
2080 return TV_PARENTAL_GUIDELINES.get(s)
146c80e2
S
2081
2082
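# Illustrative usage sketch (not part of the upstream file): MPAA, '18+' style
# and TV parental guideline ratings are all mapped onto a numeric age.
def _parse_age_limit_example():  # hypothetical helper, never called
    assert parse_age_limit('PG-13') == 13
    assert parse_age_limit('18+') == 18
    assert parse_age_limit('TV-MA') == 17
    assert parse_age_limit('unrated') is None
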
fac55558 2083def strip_jsonp(code):
609a61e3 2084 return re.sub(
5950cb1d 2085 r'(?s)^[a-zA-Z0-9_.$]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$', r'\1', code)
478c2c61
PH
2086
2087
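# Illustrative usage sketch (not part of the upstream file): the JSONP padding
# (callback name, parentheses and trailing comments) is stripped away.
def _strip_jsonp_example():  # hypothetical helper, never called
    assert strip_jsonp('callback({"id": 42}); // comment') == '{"id": 42}'
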
e05f6939
PH
2088def js_to_json(code):
2089 def fix_kv(m):
e7b6d122
PH
2090 v = m.group(0)
2091 if v in ('true', 'false', 'null'):
2092 return v
bd1e4844 2093 elif v.startswith('/*') or v == ',':
2094 return ""
2095
2096 if v[0] in ("'", '"'):
2097 v = re.sub(r'(?s)\\.|"', lambda m: {
e7b6d122 2098 '"': '\\"',
bd1e4844 2099 "\\'": "'",
2100 '\\\n': '',
2101 '\\x': '\\u00',
2102 }.get(m.group(0), m.group(0)), v[1:-1])
2103
89ac4a19 2104 INTEGER_TABLE = (
e4659b45
YCH
2105 (r'^(0[xX][0-9a-fA-F]+)\s*:?$', 16),
2106 (r'^(0+[0-7]+)\s*:?$', 8),
89ac4a19
S
2107 )
2108
2109 for regex, base in INTEGER_TABLE:
2110 im = re.match(regex, v)
2111 if im:
e4659b45 2112 i = int(im.group(1), base)
89ac4a19
S
2113 return '"%d":' % i if v.endswith(':') else '%d' % i
2114
e7b6d122 2115 return '"%s"' % v
e05f6939 2116
bd1e4844 2117 return re.sub(r'''(?sx)
2118 "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"|
2119 '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'|
2120 /\*.*?\*/|,(?=\s*[\]}])|
2121 [a-zA-Z_][.a-zA-Z_0-9]*|
47212f7b 2122 \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:\s*:)?|
bd1e4844 2123 [0-9]+(?=\s*:)
e05f6939 2124 ''', fix_kv, code)
e05f6939
PH
2125
2126
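# Illustrative usage sketch (not part of the upstream file): unquoted keys,
# single quotes, hex literals and trailing commas are converted so that the
# result can be fed to json.loads() (json is imported at the top of this file).
def _js_to_json_example():  # hypothetical helper, never called
    converted = js_to_json("{foo: 'bar', baz: 0x10,}")
    assert json.loads(converted) == {'foo': 'bar', 'baz': 16}
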
478c2c61
PH
2127def qualities(quality_ids):
2128 """ Get a numeric quality value out of a list of possible values """
2129 def q(qid):
2130 try:
2131 return quality_ids.index(qid)
2132 except ValueError:
2133 return -1
2134 return q
2135
acd69589
PH
2136
2137DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 2138
a020a0dc
PH
2139
2140def limit_length(s, length):
2141 """ Add ellipses to overly long strings """
2142 if s is None:
2143 return None
2144 ELLIPSES = '...'
2145 if len(s) > length:
2146 return s[:length - len(ELLIPSES)] + ELLIPSES
2147 return s
48844745
PH
2148
2149
2150def version_tuple(v):
5f9b8394 2151 return tuple(int(e) for e in re.split(r'[-.]', v))
48844745
PH
2152
2153
2154def is_outdated_version(version, limit, assume_new=True):
2155 if not version:
2156 return not assume_new
2157 try:
2158 return version_tuple(version) < version_tuple(limit)
2159 except ValueError:
2160 return not assume_new
732ea2f0
PH
2161
2162
2163def ytdl_is_updateable():
2164 """ Returns if youtube-dl can be updated with -U """
2165 from zipimport import zipimporter
2166
2167 return isinstance(globals().get('__loader__'), zipimporter) or hasattr(sys, 'frozen')
7d4111ed
PH
2168
2169
2170def args_to_str(args):
2171 # Get a short string representation for a subprocess command
702ccf2d 2172 return ' '.join(compat_shlex_quote(a) for a in args)
2ccd1b10
PH
2173
2174
9b9c5355 2175def error_to_compat_str(err):
fdae2358
S
2176 err_str = str(err)
2177 # On Python 2 an error byte string must be decoded with the proper
2178 # encoding rather than ascii
2179 if sys.version_info[0] < 3:
2180 err_str = err_str.decode(preferredencoding())
2181 return err_str
2182
2183
c460bdd5 2184def mimetype2ext(mt):
eb9ee194
S
2185 if mt is None:
2186 return None
2187
765ac263
JMF
2188 ext = {
2189 'audio/mp4': 'm4a',
6c33d24b
YCH
2190 # Per RFC 3003, audio/mpeg can be .mp1, .mp2 or .mp3. Here use .mp3 as
2191 # it's the most popular one
2192 'audio/mpeg': 'mp3',
765ac263
JMF
2193 }.get(mt)
2194 if ext is not None:
2195 return ext
2196
c460bdd5 2197 _, _, res = mt.rpartition('/')
6562d34a 2198 res = res.split(';')[0].strip().lower()
c460bdd5
PH
2199
2200 return {
f6861ec9 2201 '3gpp': '3gp',
cafcf657 2202 'smptett+xml': 'tt',
2203 'srt': 'srt',
2204 'ttaf+xml': 'dfxp',
a0d8d704 2205 'ttml+xml': 'ttml',
cafcf657 2206 'vtt': 'vtt',
f6861ec9 2207 'x-flv': 'flv',
a0d8d704
YCH
2208 'x-mp4-fragmented': 'mp4',
2209 'x-ms-wmv': 'wmv',
b4173f15
RA
2210 'mpegurl': 'm3u8',
2211 'x-mpegurl': 'm3u8',
2212 'vnd.apple.mpegurl': 'm3u8',
2213 'dash+xml': 'mpd',
2214 'f4m': 'f4m',
2215 'f4m+xml': 'f4m',
f164b971 2216 'hds+xml': 'f4m',
e910fe2f 2217 'vnd.ms-sstr+xml': 'ism',
c2b2c7e1 2218 'quicktime': 'mov',
c460bdd5
PH
2219 }.get(res, res)
2220
2221
4f3c5e06 2222def parse_codecs(codecs_str):
2223 # http://tools.ietf.org/html/rfc6381
2224 if not codecs_str:
2225 return {}
2226 split_codecs = list(filter(None, map(
2227 lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
2228 vcodec, acodec = None, None
2229 for full_codec in split_codecs:
2230 codec = full_codec.split('.')[0]
2231 if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
2232 if not vcodec:
2233 vcodec = full_codec
073ac122 2234 elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
4f3c5e06 2235 if not acodec:
2236 acodec = full_codec
2237 else:
2238 write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
2239 if not vcodec and not acodec:
2240 if len(split_codecs) == 2:
2241 return {
2242 'vcodec': vcodec,
2243 'acodec': acodec,
2244 }
2245 elif len(split_codecs) == 1:
2246 return {
2247 'vcodec': 'none',
2248 'acodec': vcodec,
2249 }
2250 else:
2251 return {
2252 'vcodec': vcodec or 'none',
2253 'acodec': acodec or 'none',
2254 }
2255 return {}
2256
2257
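# Illustrative usage sketch (not part of the upstream file): RFC 6381 codec
# strings are split into a video and an audio component when recognizable.
def _parse_codecs_example():  # hypothetical helper, never called
    assert parse_codecs('avc1.64001f, mp4a.40.2') == {
        'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2'}
    assert parse_codecs('mp4a.40.2') == {'vcodec': 'none', 'acodec': 'mp4a.40.2'}
    assert parse_codecs('') == {}
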
2ccd1b10 2258def urlhandle_detect_ext(url_handle):
79298173 2259 getheader = url_handle.headers.get
2ccd1b10 2260
b55ee18f
PH
2261 cd = getheader('Content-Disposition')
2262 if cd:
2263 m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
2264 if m:
2265 e = determine_ext(m.group('filename'), default_ext=None)
2266 if e:
2267 return e
2268
c460bdd5 2269 return mimetype2ext(getheader('Content-Type'))
05900629
PH
2270
2271
1e399778
YCH
2272def encode_data_uri(data, mime_type):
2273 return 'data:%s;base64,%s' % (mime_type, base64.b64encode(data).decode('ascii'))
2274
2275
05900629 2276def age_restricted(content_limit, age_limit):
6ec6cb4e 2277 """ Returns True iff the content should be blocked """
05900629
PH
2278
2279 if age_limit is None: # No limit set
2280 return False
2281 if content_limit is None:
2282 return False # Content available for everyone
2283 return age_limit < content_limit
61ca9a80
PH
2284
2285
2286def is_html(first_bytes):
2287 """ Detect whether a file contains HTML by examining its first bytes. """
2288
2289 BOMS = [
2290 (b'\xef\xbb\xbf', 'utf-8'),
2291 (b'\x00\x00\xfe\xff', 'utf-32-be'),
2292 (b'\xff\xfe\x00\x00', 'utf-32-le'),
2293 (b'\xff\xfe', 'utf-16-le'),
2294 (b'\xfe\xff', 'utf-16-be'),
2295 ]
2296 for bom, enc in BOMS:
2297 if first_bytes.startswith(bom):
2298 s = first_bytes[len(bom):].decode(enc, 'replace')
2299 break
2300 else:
2301 s = first_bytes.decode('utf-8', 'replace')
2302
2303 return re.match(r'^\s*<', s)
a055469f
PH
2304
2305
2306def determine_protocol(info_dict):
2307 protocol = info_dict.get('protocol')
2308 if protocol is not None:
2309 return protocol
2310
2311 url = info_dict['url']
2312 if url.startswith('rtmp'):
2313 return 'rtmp'
2314 elif url.startswith('mms'):
2315 return 'mms'
2316 elif url.startswith('rtsp'):
2317 return 'rtsp'
2318
2319 ext = determine_ext(url)
2320 if ext == 'm3u8':
2321 return 'm3u8'
2322 elif ext == 'f4m':
2323 return 'f4m'
2324
2325 return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
2326
2327
2328def render_table(header_row, data):
2329 """ Render a list of rows, each as a list of values """
2330 table = [header_row] + data
2331 max_lens = [max(len(compat_str(v)) for v in col) for col in zip(*table)]
2332 format_str = ' '.join('%-' + compat_str(ml + 1) + 's' for ml in max_lens[:-1]) + '%s'
2333 return '\n'.join(format_str % tuple(row) for row in table)
347de493
PH
2334
2335
2336def _match_one(filter_part, dct):
2337 COMPARISON_OPERATORS = {
2338 '<': operator.lt,
2339 '<=': operator.le,
2340 '>': operator.gt,
2341 '>=': operator.ge,
2342 '=': operator.eq,
2343 '!=': operator.ne,
2344 }
2345 operator_rex = re.compile(r'''(?x)\s*
2346 (?P<key>[a-z_]+)
2347 \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
2348 (?:
2349 (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
2350 (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
2351 )
2352 \s*$
2353 ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
2354 m = operator_rex.search(filter_part)
2355 if m:
2356 op = COMPARISON_OPERATORS[m.group('op')]
e5a088dc
S
2357 actual_value = dct.get(m.group('key'))
2358 if (m.group('strval') is not None or
2359 # If the original field is a string and the matching comparison value is
2360 # a number, we should respect the origin of the original field
2361 # and process comparison value as a string (see
2362 # https://github.com/rg3/youtube-dl/issues/11082).
2363 actual_value is not None and m.group('intval') is not None and
2364 isinstance(actual_value, compat_str)):
347de493
PH
2365 if m.group('op') not in ('=', '!='):
2366 raise ValueError(
2367 'Operator %s does not support string values!' % m.group('op'))
e5a088dc 2368 comparison_value = m.group('strval') or m.group('intval')
347de493
PH
2369 else:
2370 try:
2371 comparison_value = int(m.group('intval'))
2372 except ValueError:
2373 comparison_value = parse_filesize(m.group('intval'))
2374 if comparison_value is None:
2375 comparison_value = parse_filesize(m.group('intval') + 'B')
2376 if comparison_value is None:
2377 raise ValueError(
2378 'Invalid integer value %r in filter part %r' % (
2379 m.group('intval'), filter_part))
347de493
PH
2380 if actual_value is None:
2381 return m.group('none_inclusive')
2382 return op(actual_value, comparison_value)
2383
2384 UNARY_OPERATORS = {
2385 '': lambda v: v is not None,
2386 '!': lambda v: v is None,
2387 }
2388 operator_rex = re.compile(r'''(?x)\s*
2389 (?P<op>%s)\s*(?P<key>[a-z_]+)
2390 \s*$
2391 ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
2392 m = operator_rex.search(filter_part)
2393 if m:
2394 op = UNARY_OPERATORS[m.group('op')]
2395 actual_value = dct.get(m.group('key'))
2396 return op(actual_value)
2397
2398 raise ValueError('Invalid filter part %r' % filter_part)
2399
2400
2401def match_str(filter_str, dct):
2402 """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """
2403
2404 return all(
2405 _match_one(filter_part, dct) for filter_part in filter_str.split('&'))
2406
2407
2408def match_filter_func(filter_str):
2409 def _match_func(info_dict):
2410 if match_str(filter_str, info_dict):
2411 return None
2412 else:
2413 video_title = info_dict.get('title', info_dict.get('id', 'video'))
2414 return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
2415 return _match_func
91410c9b
PH
2416
2417
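# Illustrative usage sketch (not part of the upstream file): the filter syntax
# combines comparison and presence checks with '&', as used for --match-filter
# style expressions.
def _match_str_example():  # hypothetical helper, never called
    assert match_str('duration > 600 & like_count', {'duration': 700, 'like_count': 5})
    assert not match_str('duration > 600', {'duration': 300})
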
bf6427d2
YCH
2418def parse_dfxp_time_expr(time_expr):
2419 if not time_expr:
d631d5f9 2420 return
bf6427d2
YCH
2421
2422 mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
2423 if mobj:
2424 return float(mobj.group('time_offset'))
2425
db2fe38b 2426 mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:(?:\.|:)\d+)?)$', time_expr)
bf6427d2 2427 if mobj:
db2fe38b 2428 return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3).replace(':', '.'))
bf6427d2
YCH
2429
2430
c1c924ab
YCH
2431def srt_subtitles_timecode(seconds):
2432 return '%02d:%02d:%02d,%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000)
bf6427d2
YCH
2433
2434
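# Illustrative usage sketch (not part of the upstream file): TTML time
# expressions are parsed to seconds and re-rendered as SRT timecodes.
def _subtitle_timecode_example():  # hypothetical helper, never called
    assert parse_dfxp_time_expr('00:01:02.5') == 62.5
    assert parse_dfxp_time_expr('7.5s') == 7.5
    assert srt_subtitles_timecode(62.5) == '00:01:02,500'
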
2435def dfxp2srt(dfxp_data):
4e335771
YCH
2436 _x = functools.partial(xpath_with_ns, ns_map={
2437 'ttml': 'http://www.w3.org/ns/ttml',
2438 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
5bf28d78 2439 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
4e335771 2440 })
bf6427d2 2441
87de7069 2442 class TTMLPElementParser(object):
2b14cb56 2443 out = ''
bf6427d2 2444
2b14cb56 2445 def start(self, tag, attrib):
2446 if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
2447 self.out += '\n'
bf6427d2 2448
2b14cb56 2449 def end(self, tag):
2450 pass
bf6427d2 2451
2b14cb56 2452 def data(self, data):
2453 self.out += data
2454
2455 def close(self):
2456 return self.out.strip()
2457
2458 def parse_node(node):
2459 target = TTMLPElementParser()
2460 parser = xml.etree.ElementTree.XMLParser(target=target)
2461 parser.feed(xml.etree.ElementTree.tostring(node))
2462 return parser.close()
bf6427d2 2463
36e6f62c 2464 dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
bf6427d2 2465 out = []
5bf28d78 2466 paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
1b0427e6
YCH
2467
2468 if not paras:
2469 raise ValueError('Invalid dfxp/TTML subtitle')
bf6427d2
YCH
2470
2471 for para, index in zip(paras, itertools.count(1)):
d631d5f9 2472 begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
7dff0363 2473 end_time = parse_dfxp_time_expr(para.attrib.get('end'))
d631d5f9
YCH
2474 dur = parse_dfxp_time_expr(para.attrib.get('dur'))
2475 if begin_time is None:
2476 continue
7dff0363 2477 if not end_time:
d631d5f9
YCH
2478 if not dur:
2479 continue
2480 end_time = begin_time + dur
bf6427d2
YCH
2481 out.append('%d\n%s --> %s\n%s\n\n' % (
2482 index,
c1c924ab
YCH
2483 srt_subtitles_timecode(begin_time),
2484 srt_subtitles_timecode(end_time),
bf6427d2
YCH
2485 parse_node(para)))
2486
2487 return ''.join(out)
2488
2489
66e289ba
S
2490def cli_option(params, command_option, param):
2491 param = params.get(param)
98e698f1
RA
2492 if param:
2493 param = compat_str(param)
66e289ba
S
2494 return [command_option, param] if param is not None else []
2495
2496
2497def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
2498 param = params.get(param)
2499 assert isinstance(param, bool)
2500 if separator:
2501 return [command_option + separator + (true_value if param else false_value)]
2502 return [command_option, true_value if param else false_value]
2503
2504
2505def cli_valueless_option(params, command_option, param, expected_value=True):
2506 param = params.get(param)
2507 return [command_option] if param == expected_value else []
2508
2509
2510def cli_configuration_args(params, param, default=[]):
2511 ex_args = params.get(param)
2512 if ex_args is None:
2513 return default
2514 assert isinstance(ex_args, list)
2515 return ex_args
2516
2517
39672624
YCH
2518class ISO639Utils(object):
2519 # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt
2520 _lang_map = {
2521 'aa': 'aar',
2522 'ab': 'abk',
2523 'ae': 'ave',
2524 'af': 'afr',
2525 'ak': 'aka',
2526 'am': 'amh',
2527 'an': 'arg',
2528 'ar': 'ara',
2529 'as': 'asm',
2530 'av': 'ava',
2531 'ay': 'aym',
2532 'az': 'aze',
2533 'ba': 'bak',
2534 'be': 'bel',
2535 'bg': 'bul',
2536 'bh': 'bih',
2537 'bi': 'bis',
2538 'bm': 'bam',
2539 'bn': 'ben',
2540 'bo': 'bod',
2541 'br': 'bre',
2542 'bs': 'bos',
2543 'ca': 'cat',
2544 'ce': 'che',
2545 'ch': 'cha',
2546 'co': 'cos',
2547 'cr': 'cre',
2548 'cs': 'ces',
2549 'cu': 'chu',
2550 'cv': 'chv',
2551 'cy': 'cym',
2552 'da': 'dan',
2553 'de': 'deu',
2554 'dv': 'div',
2555 'dz': 'dzo',
2556 'ee': 'ewe',
2557 'el': 'ell',
2558 'en': 'eng',
2559 'eo': 'epo',
2560 'es': 'spa',
2561 'et': 'est',
2562 'eu': 'eus',
2563 'fa': 'fas',
2564 'ff': 'ful',
2565 'fi': 'fin',
2566 'fj': 'fij',
2567 'fo': 'fao',
2568 'fr': 'fra',
2569 'fy': 'fry',
2570 'ga': 'gle',
2571 'gd': 'gla',
2572 'gl': 'glg',
2573 'gn': 'grn',
2574 'gu': 'guj',
2575 'gv': 'glv',
2576 'ha': 'hau',
2577 'he': 'heb',
2578 'hi': 'hin',
2579 'ho': 'hmo',
2580 'hr': 'hrv',
2581 'ht': 'hat',
2582 'hu': 'hun',
2583 'hy': 'hye',
2584 'hz': 'her',
2585 'ia': 'ina',
2586 'id': 'ind',
2587 'ie': 'ile',
2588 'ig': 'ibo',
2589 'ii': 'iii',
2590 'ik': 'ipk',
2591 'io': 'ido',
2592 'is': 'isl',
2593 'it': 'ita',
2594 'iu': 'iku',
2595 'ja': 'jpn',
2596 'jv': 'jav',
2597 'ka': 'kat',
2598 'kg': 'kon',
2599 'ki': 'kik',
2600 'kj': 'kua',
2601 'kk': 'kaz',
2602 'kl': 'kal',
2603 'km': 'khm',
2604 'kn': 'kan',
2605 'ko': 'kor',
2606 'kr': 'kau',
2607 'ks': 'kas',
2608 'ku': 'kur',
2609 'kv': 'kom',
2610 'kw': 'cor',
2611 'ky': 'kir',
2612 'la': 'lat',
2613 'lb': 'ltz',
2614 'lg': 'lug',
2615 'li': 'lim',
2616 'ln': 'lin',
2617 'lo': 'lao',
2618 'lt': 'lit',
2619 'lu': 'lub',
2620 'lv': 'lav',
2621 'mg': 'mlg',
2622 'mh': 'mah',
2623 'mi': 'mri',
2624 'mk': 'mkd',
2625 'ml': 'mal',
2626 'mn': 'mon',
2627 'mr': 'mar',
2628 'ms': 'msa',
2629 'mt': 'mlt',
2630 'my': 'mya',
2631 'na': 'nau',
2632 'nb': 'nob',
2633 'nd': 'nde',
2634 'ne': 'nep',
2635 'ng': 'ndo',
2636 'nl': 'nld',
2637 'nn': 'nno',
2638 'no': 'nor',
2639 'nr': 'nbl',
2640 'nv': 'nav',
2641 'ny': 'nya',
2642 'oc': 'oci',
2643 'oj': 'oji',
2644 'om': 'orm',
2645 'or': 'ori',
2646 'os': 'oss',
2647 'pa': 'pan',
2648 'pi': 'pli',
2649 'pl': 'pol',
2650 'ps': 'pus',
2651 'pt': 'por',
2652 'qu': 'que',
2653 'rm': 'roh',
2654 'rn': 'run',
2655 'ro': 'ron',
2656 'ru': 'rus',
2657 'rw': 'kin',
2658 'sa': 'san',
2659 'sc': 'srd',
2660 'sd': 'snd',
2661 'se': 'sme',
2662 'sg': 'sag',
2663 'si': 'sin',
2664 'sk': 'slk',
2665 'sl': 'slv',
2666 'sm': 'smo',
2667 'sn': 'sna',
2668 'so': 'som',
2669 'sq': 'sqi',
2670 'sr': 'srp',
2671 'ss': 'ssw',
2672 'st': 'sot',
2673 'su': 'sun',
2674 'sv': 'swe',
2675 'sw': 'swa',
2676 'ta': 'tam',
2677 'te': 'tel',
2678 'tg': 'tgk',
2679 'th': 'tha',
2680 'ti': 'tir',
2681 'tk': 'tuk',
2682 'tl': 'tgl',
2683 'tn': 'tsn',
2684 'to': 'ton',
2685 'tr': 'tur',
2686 'ts': 'tso',
2687 'tt': 'tat',
2688 'tw': 'twi',
2689 'ty': 'tah',
2690 'ug': 'uig',
2691 'uk': 'ukr',
2692 'ur': 'urd',
2693 'uz': 'uzb',
2694 've': 'ven',
2695 'vi': 'vie',
2696 'vo': 'vol',
2697 'wa': 'wln',
2698 'wo': 'wol',
2699 'xh': 'xho',
2700 'yi': 'yid',
2701 'yo': 'yor',
2702 'za': 'zha',
2703 'zh': 'zho',
2704 'zu': 'zul',
2705 }
2706
2707 @classmethod
2708 def short2long(cls, code):
2709 """Convert language code from ISO 639-1 to ISO 639-2/T"""
2710 return cls._lang_map.get(code[:2])
2711
2712 @classmethod
2713 def long2short(cls, code):
2714 """Convert language code from ISO 639-2/T to ISO 639-1"""
2715 for short_name, long_name in cls._lang_map.items():
2716 if long_name == code:
2717 return short_name
2718
2719
4eb10f66
YCH
2720class ISO3166Utils(object):
2721 # From http://data.okfn.org/data/core/country-list
2722 _country_map = {
2723 'AF': 'Afghanistan',
2724 'AX': 'Åland Islands',
2725 'AL': 'Albania',
2726 'DZ': 'Algeria',
2727 'AS': 'American Samoa',
2728 'AD': 'Andorra',
2729 'AO': 'Angola',
2730 'AI': 'Anguilla',
2731 'AQ': 'Antarctica',
2732 'AG': 'Antigua and Barbuda',
2733 'AR': 'Argentina',
2734 'AM': 'Armenia',
2735 'AW': 'Aruba',
2736 'AU': 'Australia',
2737 'AT': 'Austria',
2738 'AZ': 'Azerbaijan',
2739 'BS': 'Bahamas',
2740 'BH': 'Bahrain',
2741 'BD': 'Bangladesh',
2742 'BB': 'Barbados',
2743 'BY': 'Belarus',
2744 'BE': 'Belgium',
2745 'BZ': 'Belize',
2746 'BJ': 'Benin',
2747 'BM': 'Bermuda',
2748 'BT': 'Bhutan',
2749 'BO': 'Bolivia, Plurinational State of',
2750 'BQ': 'Bonaire, Sint Eustatius and Saba',
2751 'BA': 'Bosnia and Herzegovina',
2752 'BW': 'Botswana',
2753 'BV': 'Bouvet Island',
2754 'BR': 'Brazil',
2755 'IO': 'British Indian Ocean Territory',
2756 'BN': 'Brunei Darussalam',
2757 'BG': 'Bulgaria',
2758 'BF': 'Burkina Faso',
2759 'BI': 'Burundi',
2760 'KH': 'Cambodia',
2761 'CM': 'Cameroon',
2762 'CA': 'Canada',
2763 'CV': 'Cape Verde',
2764 'KY': 'Cayman Islands',
2765 'CF': 'Central African Republic',
2766 'TD': 'Chad',
2767 'CL': 'Chile',
2768 'CN': 'China',
2769 'CX': 'Christmas Island',
2770 'CC': 'Cocos (Keeling) Islands',
2771 'CO': 'Colombia',
2772 'KM': 'Comoros',
2773 'CG': 'Congo',
2774 'CD': 'Congo, the Democratic Republic of the',
2775 'CK': 'Cook Islands',
2776 'CR': 'Costa Rica',
2777 'CI': 'Côte d\'Ivoire',
2778 'HR': 'Croatia',
2779 'CU': 'Cuba',
2780 'CW': 'Curaçao',
2781 'CY': 'Cyprus',
2782 'CZ': 'Czech Republic',
2783 'DK': 'Denmark',
2784 'DJ': 'Djibouti',
2785 'DM': 'Dominica',
2786 'DO': 'Dominican Republic',
2787 'EC': 'Ecuador',
2788 'EG': 'Egypt',
2789 'SV': 'El Salvador',
2790 'GQ': 'Equatorial Guinea',
2791 'ER': 'Eritrea',
2792 'EE': 'Estonia',
2793 'ET': 'Ethiopia',
2794 'FK': 'Falkland Islands (Malvinas)',
2795 'FO': 'Faroe Islands',
2796 'FJ': 'Fiji',
2797 'FI': 'Finland',
2798 'FR': 'France',
2799 'GF': 'French Guiana',
2800 'PF': 'French Polynesia',
2801 'TF': 'French Southern Territories',
2802 'GA': 'Gabon',
2803 'GM': 'Gambia',
2804 'GE': 'Georgia',
2805 'DE': 'Germany',
2806 'GH': 'Ghana',
2807 'GI': 'Gibraltar',
2808 'GR': 'Greece',
2809 'GL': 'Greenland',
2810 'GD': 'Grenada',
2811 'GP': 'Guadeloupe',
2812 'GU': 'Guam',
2813 'GT': 'Guatemala',
2814 'GG': 'Guernsey',
2815 'GN': 'Guinea',
2816 'GW': 'Guinea-Bissau',
2817 'GY': 'Guyana',
2818 'HT': 'Haiti',
2819 'HM': 'Heard Island and McDonald Islands',
2820 'VA': 'Holy See (Vatican City State)',
2821 'HN': 'Honduras',
2822 'HK': 'Hong Kong',
2823 'HU': 'Hungary',
2824 'IS': 'Iceland',
2825 'IN': 'India',
2826 'ID': 'Indonesia',
2827 'IR': 'Iran, Islamic Republic of',
2828 'IQ': 'Iraq',
2829 'IE': 'Ireland',
2830 'IM': 'Isle of Man',
2831 'IL': 'Israel',
2832 'IT': 'Italy',
2833 'JM': 'Jamaica',
2834 'JP': 'Japan',
2835 'JE': 'Jersey',
2836 'JO': 'Jordan',
2837 'KZ': 'Kazakhstan',
2838 'KE': 'Kenya',
2839 'KI': 'Kiribati',
2840 'KP': 'Korea, Democratic People\'s Republic of',
2841 'KR': 'Korea, Republic of',
2842 'KW': 'Kuwait',
2843 'KG': 'Kyrgyzstan',
2844 'LA': 'Lao People\'s Democratic Republic',
2845 'LV': 'Latvia',
2846 'LB': 'Lebanon',
2847 'LS': 'Lesotho',
2848 'LR': 'Liberia',
2849 'LY': 'Libya',
2850 'LI': 'Liechtenstein',
2851 'LT': 'Lithuania',
2852 'LU': 'Luxembourg',
2853 'MO': 'Macao',
2854 'MK': 'Macedonia, the Former Yugoslav Republic of',
2855 'MG': 'Madagascar',
2856 'MW': 'Malawi',
2857 'MY': 'Malaysia',
2858 'MV': 'Maldives',
2859 'ML': 'Mali',
2860 'MT': 'Malta',
2861 'MH': 'Marshall Islands',
2862 'MQ': 'Martinique',
2863 'MR': 'Mauritania',
2864 'MU': 'Mauritius',
2865 'YT': 'Mayotte',
2866 'MX': 'Mexico',
2867 'FM': 'Micronesia, Federated States of',
2868 'MD': 'Moldova, Republic of',
2869 'MC': 'Monaco',
2870 'MN': 'Mongolia',
2871 'ME': 'Montenegro',
2872 'MS': 'Montserrat',
2873 'MA': 'Morocco',
2874 'MZ': 'Mozambique',
2875 'MM': 'Myanmar',
2876 'NA': 'Namibia',
2877 'NR': 'Nauru',
2878 'NP': 'Nepal',
2879 'NL': 'Netherlands',
2880 'NC': 'New Caledonia',
2881 'NZ': 'New Zealand',
2882 'NI': 'Nicaragua',
2883 'NE': 'Niger',
2884 'NG': 'Nigeria',
2885 'NU': 'Niue',
2886 'NF': 'Norfolk Island',
2887 'MP': 'Northern Mariana Islands',
2888 'NO': 'Norway',
2889 'OM': 'Oman',
2890 'PK': 'Pakistan',
2891 'PW': 'Palau',
2892 'PS': 'Palestine, State of',
2893 'PA': 'Panama',
2894 'PG': 'Papua New Guinea',
2895 'PY': 'Paraguay',
2896 'PE': 'Peru',
2897 'PH': 'Philippines',
2898 'PN': 'Pitcairn',
2899 'PL': 'Poland',
2900 'PT': 'Portugal',
2901 'PR': 'Puerto Rico',
2902 'QA': 'Qatar',
2903 'RE': 'Réunion',
2904 'RO': 'Romania',
2905 'RU': 'Russian Federation',
2906 'RW': 'Rwanda',
2907 'BL': 'Saint Barthélemy',
2908 'SH': 'Saint Helena, Ascension and Tristan da Cunha',
2909 'KN': 'Saint Kitts and Nevis',
2910 'LC': 'Saint Lucia',
2911 'MF': 'Saint Martin (French part)',
2912 'PM': 'Saint Pierre and Miquelon',
2913 'VC': 'Saint Vincent and the Grenadines',
2914 'WS': 'Samoa',
2915 'SM': 'San Marino',
2916 'ST': 'Sao Tome and Principe',
2917 'SA': 'Saudi Arabia',
2918 'SN': 'Senegal',
2919 'RS': 'Serbia',
2920 'SC': 'Seychelles',
2921 'SL': 'Sierra Leone',
2922 'SG': 'Singapore',
2923 'SX': 'Sint Maarten (Dutch part)',
2924 'SK': 'Slovakia',
2925 'SI': 'Slovenia',
2926 'SB': 'Solomon Islands',
2927 'SO': 'Somalia',
2928 'ZA': 'South Africa',
2929 'GS': 'South Georgia and the South Sandwich Islands',
2930 'SS': 'South Sudan',
2931 'ES': 'Spain',
2932 'LK': 'Sri Lanka',
2933 'SD': 'Sudan',
2934 'SR': 'Suriname',
2935 'SJ': 'Svalbard and Jan Mayen',
2936 'SZ': 'Swaziland',
2937 'SE': 'Sweden',
2938 'CH': 'Switzerland',
2939 'SY': 'Syrian Arab Republic',
2940 'TW': 'Taiwan, Province of China',
2941 'TJ': 'Tajikistan',
2942 'TZ': 'Tanzania, United Republic of',
2943 'TH': 'Thailand',
2944 'TL': 'Timor-Leste',
2945 'TG': 'Togo',
2946 'TK': 'Tokelau',
2947 'TO': 'Tonga',
2948 'TT': 'Trinidad and Tobago',
2949 'TN': 'Tunisia',
2950 'TR': 'Turkey',
2951 'TM': 'Turkmenistan',
2952 'TC': 'Turks and Caicos Islands',
2953 'TV': 'Tuvalu',
2954 'UG': 'Uganda',
2955 'UA': 'Ukraine',
2956 'AE': 'United Arab Emirates',
2957 'GB': 'United Kingdom',
2958 'US': 'United States',
2959 'UM': 'United States Minor Outlying Islands',
2960 'UY': 'Uruguay',
2961 'UZ': 'Uzbekistan',
2962 'VU': 'Vanuatu',
2963 'VE': 'Venezuela, Bolivarian Republic of',
2964 'VN': 'Viet Nam',
2965 'VG': 'Virgin Islands, British',
2966 'VI': 'Virgin Islands, U.S.',
2967 'WF': 'Wallis and Futuna',
2968 'EH': 'Western Sahara',
2969 'YE': 'Yemen',
2970 'ZM': 'Zambia',
2971 'ZW': 'Zimbabwe',
2972 }
2973
2974 @classmethod
2975 def short2full(cls, code):
2976 """Convert an ISO 3166-2 country code to the corresponding full name"""
2977 return cls._country_map.get(code.upper())
2978
2979
91410c9b 2980class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2461f79d
PH
2981 def __init__(self, proxies=None):
2982 # Set default handlers
2983 for type in ('http', 'https'):
2984 setattr(self, '%s_open' % type,
2985 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
2986 meth(r, proxy, type))
2987 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
2988
91410c9b 2989 def proxy_open(self, req, proxy, type):
2461f79d 2990 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
2991 if req_proxy is not None:
2992 proxy = req_proxy
2461f79d
PH
2993 del req.headers['Ytdl-request-proxy']
2994
2995 if proxy == '__noproxy__':
2996 return None # No Proxy
51fb4995 2997 if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'):
71aff188
YCH
2998 req.add_header('Ytdl-socks-proxy', proxy)
2999 # youtube-dl's http/https handlers wrap the socket with SOCKS themselves
3000 return None
91410c9b
PH
3001 return compat_urllib_request.ProxyHandler.proxy_open(
3002 self, req, proxy, type)
5bc880b9
YCH
3003
3004
3005def ohdave_rsa_encrypt(data, exponent, modulus):
3006 '''
3007 Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/
3008
3009 Input:
3010 data: data to encrypt, bytes-like object
3011 exponent, modulus: parameter e and N of RSA algorithm, both integer
3012 Output: hex string of encrypted data
3013
3014 Limitation: supports one block encryption only
3015 '''
3016
3017 payload = int(binascii.hexlify(data[::-1]), 16)
3018 encrypted = pow(payload, exponent, modulus)
3019 return '%x' % encrypted
81bdc8fd
YCH
3020
3021
5eb6bdce 3022def encode_base_n(num, n, table=None):
59f898b7 3023 FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
59f898b7
YCH
3024 if not table:
3025 table = FULL_TABLE[:n]
3026
5eb6bdce
YCH
3027 if n > len(table):
3028 raise ValueError('base %d exceeds table length %d' % (n, len(table)))
3029
3030 if num == 0:
3031 return table[0]
3032
81bdc8fd
YCH
3033 ret = ''
3034 while num:
3035 ret = table[num % n] + ret
3036 num = num // n
3037 return ret
f52354a8
YCH
3038
3039
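# Illustrative usage sketch (not part of the upstream file): encode_base_n()
# is the positional-notation encoder used by decode_packed_codes() below.
def _encode_base_n_example():  # hypothetical helper, never called
    assert encode_base_n(255, 16) == 'ff'
    assert encode_base_n(5, 2) == '101'
    assert encode_base_n(0, 36) == '0'
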
3040def decode_packed_codes(code):
06b3fe29 3041 mobj = re.search(PACKED_CODES_RE, code)
f52354a8
YCH
3042 obfuscated_code, base, count, symbols = mobj.groups()
3043 base = int(base)
3044 count = int(count)
3045 symbols = symbols.split('|')
3046 symbol_table = {}
3047
3048 while count:
3049 count -= 1
5eb6bdce 3050 base_n_count = encode_base_n(count, base)
f52354a8
YCH
3051 symbol_table[base_n_count] = symbols[count] or base_n_count
3052
3053 return re.sub(
3054 r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
3055 obfuscated_code)
e154c651 3056
3057
3058def parse_m3u8_attributes(attrib):
3059 info = {}
3060 for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
3061 if val.startswith('"'):
3062 val = val[1:-1]
3063 info[key] = val
3064 return info
1143535d
YCH
3065
3066
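# Illustrative usage sketch (not part of the upstream file): attribute lists
# from EXT-X-STREAM-INF style M3U8 tags become a plain dict, quotes stripped.
def _parse_m3u8_attributes_example():  # hypothetical helper, never called
    attrs = parse_m3u8_attributes('BANDWIDTH=800000,RESOLUTION=640x360,CODECS="avc1.4d401e,mp4a.40.2"')
    assert attrs == {
        'BANDWIDTH': '800000',
        'RESOLUTION': '640x360',
        'CODECS': 'avc1.4d401e,mp4a.40.2',
    }
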
3067def urshift(val, n):
3068 return val >> n if val >= 0 else (val + 0x100000000) >> n
d3f8e038
YCH
3069
3070
3071# Based on png2str() written by @gdkchan and improved by @yokrysty
3072# Originally posted at https://github.com/rg3/youtube-dl/issues/9706
3073def decode_png(png_data):
3074 # Reference: https://www.w3.org/TR/PNG/
3075 header = png_data[8:]
3076
3077 if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR':
3078 raise IOError('Not a valid PNG file.')
3079
3080 int_map = {1: '>B', 2: '>H', 4: '>I'}
3081 unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0]
3082
3083 chunks = []
3084
3085 while header:
3086 length = unpack_integer(header[:4])
3087 header = header[4:]
3088
3089 chunk_type = header[:4]
3090 header = header[4:]
3091
3092 chunk_data = header[:length]
3093 header = header[length:]
3094
3095 header = header[4:] # Skip CRC
3096
3097 chunks.append({
3098 'type': chunk_type,
3099 'length': length,
3100 'data': chunk_data
3101 })
3102
3103 ihdr = chunks[0]['data']
3104
3105 width = unpack_integer(ihdr[:4])
3106 height = unpack_integer(ihdr[4:8])
3107
3108 idat = b''
3109
3110 for chunk in chunks:
3111 if chunk['type'] == b'IDAT':
3112 idat += chunk['data']
3113
3114 if not idat:
3115 raise IOError('Unable to read PNG data.')
3116
3117 decompressed_data = bytearray(zlib.decompress(idat))
3118
3119 stride = width * 3
3120 pixels = []
3121
3122 def _get_pixel(idx):
3123 x = idx % stride
3124 y = idx // stride
3125 return pixels[y][x]
3126
3127 for y in range(height):
3128 basePos = y * (1 + stride)
3129 filter_type = decompressed_data[basePos]
3130
3131 current_row = []
3132
3133 pixels.append(current_row)
3134
3135 for x in range(stride):
3136 color = decompressed_data[1 + basePos + x]
3137 basex = y * stride + x
3138 left = 0
3139 up = 0
3140
3141 if x > 2:
3142 left = _get_pixel(basex - 3)
3143 if y > 0:
3144 up = _get_pixel(basex - stride)
3145
3146 if filter_type == 1: # Sub
3147 color = (color + left) & 0xff
3148 elif filter_type == 2: # Up
3149 color = (color + up) & 0xff
3150 elif filter_type == 3: # Average
3151 color = (color + ((left + up) >> 1)) & 0xff
3152 elif filter_type == 4: # Paeth
3153 a = left
3154 b = up
3155 c = 0
3156
3157 if x > 2 and y > 0:
3158 c = _get_pixel(basex - stride - 3)
3159
3160 p = a + b - c
3161
3162 pa = abs(p - a)
3163 pb = abs(p - b)
3164 pc = abs(p - c)
3165
3166 if pa <= pb and pa <= pc:
3167 color = (color + a) & 0xff
3168 elif pb <= pc:
3169 color = (color + b) & 0xff
3170 else:
3171 color = (color + c) & 0xff
3172
3173 current_row.append(color)
3174
3175 return width, height, pixels
efa97bdc
YCH
3176
3177
3178def write_xattr(path, key, value):
3179 # This mess below finds the best xattr tool for the job
3180 try:
3181 # try the pyxattr module...
3182 import xattr
3183
53a7e3d2
YCH
3184 if hasattr(xattr, 'set'): # pyxattr
3185 # Unicode arguments are not supported in python-pyxattr until
3186 # version 0.5.0
3187 # See https://github.com/rg3/youtube-dl/issues/5498
3188 pyxattr_required_version = '0.5.0'
3189 if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version):
3190 # TODO: fallback to CLI tools
3191 raise XAttrUnavailableError(
3192 'python-pyxattr is detected but is too old. '
3193 'youtube-dl requires %s or above while your version is %s. '
3194 'Falling back to other xattr implementations' % (
3195 pyxattr_required_version, xattr.__version__))
3196
3197 setxattr = xattr.set
3198 else: # xattr
3199 setxattr = xattr.setxattr
efa97bdc
YCH
3200
3201 try:
53a7e3d2 3202 setxattr(path, key, value)
efa97bdc
YCH
3203 except EnvironmentError as e:
3204 raise XAttrMetadataError(e.errno, e.strerror)
3205
3206 except ImportError:
3207 if compat_os_name == 'nt':
3208 # Write xattrs to NTFS Alternate Data Streams:
3209 # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29
3210 assert ':' not in key
3211 assert os.path.exists(path)
3212
3213 ads_fn = path + ':' + key
3214 try:
3215 with open(ads_fn, 'wb') as f:
3216 f.write(value)
3217 except EnvironmentError as e:
3218 raise XAttrMetadataError(e.errno, e.strerror)
3219 else:
3220 user_has_setfattr = check_executable('setfattr', ['--version'])
3221 user_has_xattr = check_executable('xattr', ['-h'])
3222
3223 if user_has_setfattr or user_has_xattr:
3224
3225 value = value.decode('utf-8')
3226 if user_has_setfattr:
3227 executable = 'setfattr'
3228 opts = ['-n', key, '-v', value]
3229 elif user_has_xattr:
3230 executable = 'xattr'
3231 opts = ['-w', key, value]
3232
3233 cmd = ([encodeFilename(executable, True)] +
3234 [encodeArgument(o) for o in opts] +
3235 [encodeFilename(path, True)])
3236
3237 try:
3238 p = subprocess.Popen(
3239 cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
3240 except EnvironmentError as e:
3241 raise XAttrMetadataError(e.errno, e.strerror)
3242 stdout, stderr = p.communicate()
3243 stderr = stderr.decode('utf-8', 'replace')
3244 if p.returncode != 0:
3245 raise XAttrMetadataError(p.returncode, stderr)
3246
3247 else:
3248 # On Unix, none of pyxattr, setfattr or xattr could be found.
3249 if sys.platform.startswith('linux'):
3250 raise XAttrUnavailableError(
3251 "Couldn't find a tool to set the xattrs. "
3252 "Install either the python 'pyxattr' or 'xattr' "
3253 "modules, or the GNU 'attr' package "
3254 "(which contains the 'setfattr' tool).")
3255 else:
3256 raise XAttrUnavailableError(
3257 "Couldn't find a tool to set the xattrs. "
3258 "Install either the python 'xattr' module, "
3259 "or the 'xattr' binary.")