]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[NBC] Enhance embedURL extraction (closes #2549)
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
912b38b4 6import calendar
676eb3f2 7import codecs
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96
PH
10import datetime
11import email.utils
f45c185f 12import errno
be4a824d 13import functools
d77c3dfd 14import gzip
b7ab0590 15import itertools
03f9daab 16import io
f4bfd65f 17import json
d77c3dfd 18import locale
02dbf93f 19import math
347de493 20import operator
d77c3dfd 21import os
4eb7f1d1 22import pipes
c496ca96 23import platform
d77c3dfd 24import re
13ebea79 25import ssl
c496ca96 26import socket
b53466e1 27import struct
1c088fa8 28import subprocess
d77c3dfd 29import sys
181c8655 30import tempfile
01951dda 31import traceback
bcf89ce6 32import xml.etree.ElementTree
d77c3dfd 33import zlib
d77c3dfd 34
8c25f81b 35from .compat import (
8f9312c3 36 compat_basestring,
8c25f81b 37 compat_chr,
8c25f81b 38 compat_html_entities,
be4a824d 39 compat_http_client,
c86b6142 40 compat_kwargs,
8c25f81b 41 compat_parse_qs,
be4a824d 42 compat_socket_create_connection,
8c25f81b
PH
43 compat_str,
44 compat_urllib_error,
45 compat_urllib_parse,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
48 compat_urlparse,
7d4111ed 49 shlex_quote,
8c25f81b 50)
4644ac55
S
51
52
468e2e92
FV
# The type of a compiled regular expression is not clearly defined otherwise,
# so obtain it by compiling an (empty) pattern and taking the result's type.
compiled_regex_type = type(re.compile(''))
55
# Default HTTP request headers, keyed by header name.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 63
5f6a1245 64
7105440c
YCH
# Full English month names in calendar order (index 0 is 'January').
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December']
68
69
def preferredencoding():
    """Return the text encoding to use on this system.

    The value reported by locale.getpreferredencoding() is used when it names
    a codec that can actually encode text; in every other case 'UTF-8' is
    returned instead.
    """
    fallback = 'UTF-8'
    try:
        candidate = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable name raises here.
        'TEST'.encode(candidate)
    except Exception:
        return fallback
    return candidate
d77c3dfd 83
f4bfd65f 84
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is first written to a temporary file created next to fn and
    then renamed over fn.  If anything fails, the temporary file is removed
    (best effort) and the original exception is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        # NOTE(review): get_filesystem_encoding is not defined in the part of
        # this file under review -- confirm it exists elsewhere.
        encoding = get_filesystem_encoding()

        # os.path.basename/dirname return bytes objects here, but
        # NamedTemporaryFile will fail if the filename contains non ascii
        # characters unless we use unicode objects
        def path_basename(f):
            return os.path.basename(f).decode(encoding)

        def path_dirname(f):
            return os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        # The file must survive close() so that it can be renamed afterwards
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Do not leave the temporary file behind; cleanup is best effort
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
137
138
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath[@key=val], or None """
        # Only plain attribute names/values may be spliced into the expression
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val, or None """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        candidates = (
            element for element in node.findall(xpath)
            if element.attrib.get(key) == val)
        return next(candidates, None)
157
d7e66d39
JMF
158# On python2.6 the xml.etree.ElementTree.Element methods don't support
159# the namespace parameter
5f6a1245
JW
160
161
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path into '{uri}tag' steps.

    Each '/'-separated step of the form 'prefix:tag' is rewritten using the
    URI stored under 'prefix' in ns_map; steps without a colon are kept
    unchanged.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join([expand(step) for step in path.split('/')])
172
d77c3dfd 173
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element matching xpath below node.

    When no element matches, or the match has no text, None is returned
    unless fatal is set, in which case ExtractorError is raised.  name, if
    given, replaces the xpath in that error message.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    element = node.find(xpath)
    if element is not None and element.text is not None:
        return element.text
    if not fatal:
        return None
    label = name if name is not None else xpath
    raise ExtractorError('Could not find XML element %s' % label)
186
187
def get_element_by_id(id, html):
    """Return the content of the tag whose id attribute equals id in the HTML document html"""
    return get_element_by_attribute(attribute='id', value=id, html=html)
191
12ea2f30 192
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the content of the first tag in html carrying attribute=value.

    The content is passed through unescapeHTML before being returned.
    None is returned when no such tag is found.
    """
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
        \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    found = re.search(pattern, html)
    if found is None:
        return None

    content = found.group('content')
    # Content starting with a quote character loses its first and last char
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 214
9e6dd238
FV
215
def clean_html(html):
    """Turn an HTML snippet into readable plain text (None stays None)"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newlines in the markup become spaces; <br> tags and paragraph
    # boundaries become newlines
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Strip html tags
    text = re.sub('<.*?>', '', text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
231
232
def sanitize_open(filename, open_mode):
    """Open filename, retrying once with a sanitized name if that fails.

    The name '-' selects standard output (its binary buffer when there is
    one).  When opening fails for a reason other than EACCES, the name is
    run through sanitize_path and, if that changed it, opening is retried
    with the new name; otherwise the original error propagates, like with
    the standard open() function.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename != '-':
            return (open(encodeFilename(filename), open_mode), filename)

        if sys.platform == 'win32':
            import msvcrt
            msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
        if hasattr(sys.stdout, 'buffer'):
            out = sys.stdout.buffer
        else:
            out = sys.stdout
        return (out, filename)
    except (IOError, OSError) as err:
        if err.errno == errno.EACCES:
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise

        # An exception here should be caught in the caller
        return (open(encodeFilename(alt_filename), open_mode), alt_filename)
d77c3dfd
FV
263
264
def timeconvert(timestr):
    """Convert an RFC 2822 time string into a system timestamp (None if unparsable)"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 272
5f6a1245 273
def sanitize_filename(s, restricted=False, is_id=False):
    """Make a string usable as part of a filename.

    If restricted is set, a stricter subset of characters is allowed.
    Set is_id if s is not an arbitrary string but an ID that should be kept
    if possible; the underscore/dash/dot clean-up is then skipped.
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    cleaned = ''.join([replace_insane(char) for char in s])
    if is_id:
        return cleaned

    # Collapse every run of underscores into a single one
    cleaned = re.sub(r'_{2,}', '_', cleaned).strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and cleaned.startswith('-_'):
        cleaned = cleaned[2:]
    if cleaned.startswith('-'):
        cleaned = '_' + cleaned[1:]
    cleaned = cleaned.lstrip('.')
    return cleaned or '_'
d77c3dfd 310
5f6a1245 311
a2aaf4db
S
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On any other platform s is returned unchanged.  On Windows the path is
    normalized and, in every component other than '.' and '..', each
    character that is forbidden in Windows file names (and a trailing dot)
    is replaced with '#'.  A drive or UNC prefix is preserved.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    norm_path = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        # Drop the empty component left in front of the first separator
        norm_path.pop(0)
    # Raw string: same pattern as before, without the invalid '\.' escape
    forbidden = r'(?:[/<>:"\|\\?\*]|\.$)'
    sanitized_path = [
        path_part if path_part in ['.', '..'] else re.sub(forbidden, '#', path_part)
        for path_part in norm_path]
    if drive_or_unc:
        sanitized_path.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized_path)
328
329
92a4793b
S
def sanitize_url_path_consecutive_slashes(url):
    """Collapse runs of slashes in the path component of url into one slash"""
    path_index = 2  # position of the path in the urlparse() result
    components = list(compat_urlparse.urlparse(url))
    components[path_index] = re.sub(r'/{2,}', '/', components[path_index])
    return compat_urlparse.urlunparse(components)
335
336
def orderedSet(iterable):
    """ Return a list of the items of iterable without duplicates, in first-seen order """
    unique = []
    for item in iterable:
        # Membership is tested on the list, so items need not be hashable
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 344
912b38b4 345
4e408e47
PH
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'.  Named entities and numeric
    (decimal '#NNN' or hexadecimal '#xHHH') entities are converted; anything
    else, including numeric entities outside the valid code point range, is
    returned in its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            # int() needs the '0x' form for base 16 input with a prefix
            numstr = '0%s' % numstr
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Not a valid code point; keep the entity as written
            pass

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
4e408e47
PH
364
365
def unescapeHTML(s):
    """Replace the HTML entities in s with their characters (None stays None)"""
    if s is None:
        return None
    assert type(s) == compat_str

    def replace_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', replace_entity, s)
d77c3dfd 373
8bf48f23 374
aa49acd1
S
def get_subprocess_encoding():
    """Return the encoding name used for subprocess arguments and filenames"""
    on_modern_windows = sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5
    if on_modern_windows:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
385
386
def encodeFilename(s, for_subprocess=False):
    """
    Return s in the form expected by the file APIs of the running Python.

    @param s The name of the file
    @param for_subprocess Whether the name is going to be passed to a subprocess
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    uses_unicode_api = sys.version_info >= (3, 0) or (
        # Pass '' directly to use Unicode APIs on Windows 2000 and up
        # (Detecting Windows NT 4 is tricky because 'major >= 4' would
        # match Windows 9x series as well. Besides, NT 4 is obsolete.)
        not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5)
    if uses_unicode_api:
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
405
406
def decodeFilename(b, for_subprocess=False):
    """Decode the byte string b with the subprocess encoding on Python 2.

    On Python 3, and for anything that is not a bytes object, b is returned
    unchanged.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 416
f07b74fc
PH
417
def encodeArgument(s):
    """Encode the command line argument s for use in a subprocess call"""
    if isinstance(s, compat_str):
        text = s
    else:
        # Legacy code that uses byte strings
        # Uncomment the following line after fixing all post processors
        # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
        text = s.decode('ascii')
    return encodeFilename(text, True)
425
426
aa49acd1
S
def decodeArgument(b):
    """Decode the subprocess argument b; counterpart of encodeArgument"""
    return decodeFilename(b, for_subprocess=True)
429
430
8271226a
PH
def decodeOption(optval):
    """Return the option value optval as text, decoding byte strings (None stays None)"""
    if optval is None:
        return optval

    text = optval
    if isinstance(text, bytes):
        text = text.decode(preferredencoding())

    assert isinstance(text, compat_str)
    return text
1c256f70 439
5f6a1245 440
4539dd30
PH
def formatSeconds(secs):
    """Format a duration in seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that fits is used: hours appear from 3600 seconds on
    and minutes from 60 seconds on, so exactly one hour is '1:00:00' and
    exactly one minute is '1:00'.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
448
a0ddb8a2 449
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Create a YoutubeDLHTTPSHandler suited to the running Python version.

    params is the options dictionary; its 'nocheckcertificate' entry turns
    certificate verification off.  Additional keyword arguments are passed
    on to YoutubeDLHTTPSHandler.
    """
    opts_no_check_certificate = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if opts_no_check_certificate:
            # check_hostname is switched off before verify_mode is relaxed
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            # Fall through to the version-based handling below.
            pass

    if sys.version_info < (3, 2):
        # No SSL context is passed here
        return YoutubeDLHTTPSHandler(params, **kwargs)
    else:  # Python < 3.4
        context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
        context.verify_mode = (ssl.CERT_NONE
                               if opts_no_check_certificate
                               else ssl.CERT_REQUIRED)
        context.set_default_verify_paths()
        return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 473
732ea2f0 474
08f2a92c
JMF
def bug_reports_message():
    """Return the text asking the user to report a bug, with update instructions"""
    update_cmd = (
        'type youtube-dl -U to update'
        if ytdl_is_updateable()
        else 'see https://yt-dl.org/update on how to update')
    return ''.join([
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ])
484
485
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ Build the error message and remember the error context.

        tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause, if given, is appended to the message; video_id, if given, is prefixed to it.
        """
        # Network problems and unavailable videos are never treated as bugs
        expected_types = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_types:
            expected = True

        message = msg
        if video_id is not None:
            message = video_id + ': ' + message
        if cause:
            message += ' (caused by %r)' % cause
        if not expected:
            message += bug_reports_message()
        super(ExtractorError, self).__init__(message)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback formatted as a string, or None without one"""
        if self.traceback is None:
            return None
        return ''.join(traceback.format_tb(self.traceback))
01951dda 513
1c256f70 514
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected error raised for a URL that is not supported; keeps it in .url"""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
520
521
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Extraction error raised when a regular expression did not match"""
525
526
class DownloadError(Exception):
    """Error raised when a download fails.

    FileDownloader objects may throw this exception if they are not
    configured to continue on errors.  It carries the appropriate error
    message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
539
540
class SameFileError(Exception):
    """Error raised when several downloads target one file.

    FileDownloader objects throw this exception if they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
548
549
class PostProcessingError(Exception):
    """Error raised by a failing postprocessing task.

    A PostProcessor's .run() method may raise this exception to indicate
    an error; the description is kept in the msg attribute.
    """

    def __init__(self, msg):
        self.msg = msg
d77c3dfd 559
5f6a1245 560
class MaxDownloadsReached(Exception):
    """ Raised once the --max-downloads limit has been reached. """
d77c3dfd
FV
564
565
class UnavailableVideoError(Exception):
    """Error raised for an unavailable format.

    This exception is thrown when a video is requested in a format that
    is not available for that video.
    """
d77c3dfd
FV
573
574
class ContentTooShortError(Exception):
    """Error raised when a download is shorter than announced.

    FileDownloader objects may raise this exception when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Sizes, both in bytes; overwritten per instance by __init__
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 589
5f6a1245 590
c5a59d93 591def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
be4a824d
PH
592 hc = http_class(*args, **kwargs)
593 source_address = ydl_handler._params.get('source_address')
594 if source_address is not None:
595 sa = (source_address, 0)
596 if hasattr(hc, 'source_address'): # Python 2.7+
597 hc.source_address = sa
598 else: # Python 2.6
599 def _hc_connect(self, *args, **kwargs):
600 sock = compat_socket_create_connection(
601 (self.host, self.port), self.timeout, sa)
602 if is_https:
d7932313
PH
603 self.sock = ssl.wrap_socket(
604 sock, self.key_file, self.cert_file,
605 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
606 else:
607 self.sock = sock
608 hc.connect = functools.partial(_hc_connect, hc)
609
610 return hc
611
612
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        """Keep params (read later for 'source_address'); other arguments go to HTTPHandler."""
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        """Open req using connections created by _create_http_connection."""
        return self.do_open(functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False),
            req)

    @staticmethod
    def deflate(data):
        """Decompress deflate data, trying a raw stream first and a zlib-wrapped one second."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying code, whether or not the class accepts it."""
        if hasattr(compat_urllib_request.addinfourl, 'getcode'):
            return compat_urllib_request.addinfourl(stream, headers, url, code)
        # Older addinfourl without getcode: set the status code afterwards
        ret = compat_urllib_request.addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        """Add the missing standard headers to req and honour Youtubedl-No-Compression."""
        for h, v in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if h.capitalize() not in req.headers:
                req.add_header(h, v)
        if 'Youtubedl-no-compression' in req.headers:
            # The marker header itself is never sent to the server
            if 'Accept-encoding' in req.headers:
                del req.headers['Accept-encoding']
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        """Return resp with a gzip or deflate encoded body transparently decompressed."""
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1 to 1023 trailing bytes cut off; the first
                # attempt that decompresses wins.
                for i in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    # No truncation helped: report the original failure
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            gz = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(gz, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets the same request/response processing
    https_request = http_request
    https_response = http_response
bf50b038 704
5de90176 705
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler whose connections are created by _create_http_connection"""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        """Keep params and the connection class; other arguments go to HTTPSHandler."""
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        """Open req, passing on the SSL settings the base handler stored, if any."""
        open_kwargs = {}
        forwarded = (
            ('_context', 'context'),  # python > 2.6
            ('_check_hostname', 'check_hostname'),  # python 3.x
        )
        for attribute, keyword in forwarded:
            if hasattr(self, attribute):
                open_kwargs[keyword] = getattr(self, attribute)
        connection_factory = functools.partial(
            _create_http_connection, self, self._https_conn_class, True)
        return self.do_open(connection_factory, req, **open_kwargs)
be4a824d
PH
721
722
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    date_str has the form 'YYYY-MM-DD<delimiter>HH:MM:SS'.  When timezone is
    None, a trailing fraction of a second and a trailing 'Z' or '+HH:MM' /
    '-HHMM' style UTC offset are recognized and removed, and the offset is
    applied; a date without offset is taken as UTC.  When timezone (a
    datetime.timedelta) is given, date_str is parsed as is and that offset
    is applied.

    None is returned for a date_str of None; a date that does not match the
    format raises ValueError.
    """

    if date_str is None:
        return None

    if timezone is None:
        # Every part is optional, so this always matches (possibly with an
        # empty match at the very end of the string).
        m = re.search(
            r'(?:\.[0-9]+)?(?:Z| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2}))?$',
            date_str)
        # Cut off fraction and zone suffix, whichever of them is present
        date_str = date_str[:m.start()]
        if not m.group('sign'):
            timezone = datetime.timedelta()
        else:
            sign = 1 if m.group('sign') == '+' else -1
            timezone = datetime.timedelta(
                hours=sign * int(m.group('hours')),
                minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    # Local time minus its UTC offset gives UTC
    dt = datetime.datetime.strptime(date_str, date_format) - timezone
    return calendar.timegm(dt.timetuple())
747
748
def unified_strdate(date_str, day_first=True):
    """Return the date in date_str as a string in the format YYYYMMDD.

    Many textual and numeric layouts are tried; day_first decides whether
    ambiguous numeric dates are read as day/month or month/day.  None is
    returned for a date_str of None or when nothing could be parsed.
    """

    if date_str is None:
        return None

    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    common_formats = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        numeric_formats = [
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ]
    else:
        numeric_formats = [
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ]

    upload_date = None
    # Every format is tried; when several match, the last one wins
    for date_format in common_formats + numeric_formats:
        try:
            upload_date = datetime.datetime.strptime(date_str, date_format).strftime('%Y%m%d')
        except ValueError:
            continue

    if upload_date is not None:
        return upload_date

    # Last resort: RFC 2822 style dates
    timetuple = email.utils.parsedate_tz(date_str)
    if timetuple:
        upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
    return upload_date
812
5f6a1245 813
def determine_ext(url, default_ext='unknown_video'):
    """Guess the file extension from url.

    The guess is whatever follows the last '.' before the query string; it
    is returned only if it consists of ASCII letters and digits.  In all
    other cases, and for a url of None, default_ext is returned.
    """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    return default_ext
73e79f2a 822
5f6a1245 823
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its extension replaced by '<sub_lang>.<sub_format>'"""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
d4051a8e 826
5f6a1245 827
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    The words 'now', 'today' and 'yesterday' on their own are accepted as
    well.  A string in none of these forms raises ValueError."""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        # timedelta knows neither months nor years, so both are converted
        # to days.  A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta keyword arguments are plural ('days', 'weeks')
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
5f6a1245
JW
855
856
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format;
    anything else is returned unchanged"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        return date_str
    return '-'.join(parts.groups())
865
5f6a1245 866
bd558525
JMF
class DateRange(object):
    """A closed interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str.

        A missing start or end leaves the range open on that side (earliest
        or latest representable date).  ValueError is raised if start lies
        after end.
        """
        earliest = datetime.datetime.min.date()
        latest = datetime.datetime.max.date()
        self.start = earliest if start is None else date_from_str(start)
        self.end = latest if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(start=day, end=day)

    def __contains__(self, date):
        """Check if the date (a date object or a date_from_str string) is in the range"""
        when = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= when <= self.end

    def __str__(self):
        bounds = (self.start.isoformat(), self.end.isoformat())
        return '%s - %s' % bounds
c496ca96
PH
896
897
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    # Byte strings are decoded with the preferred encoding
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
906
907
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    wintypes = ctypes.wintypes

    # Maps the file descriptors 1 and 2 to the value passed to GetStdHandle
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    kernel32 = ctypes.windll.kernel32

    GetStdHandle = ctypes.WINFUNCTYPE(
        wintypes.HANDLE, wintypes.DWORD)(
        (b"GetStdHandle", kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        wintypes.BOOL, wintypes.HANDLE, wintypes.LPWSTR,
        wintypes.DWORD, ctypes.POINTER(wintypes.DWORD),
        wintypes.LPVOID)((b"WriteConsoleW", kernel32))
    written = wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(
        wintypes.DWORD, wintypes.DWORD)((b"GetFileType", kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        wintypes.BOOL, wintypes.HANDLE,
        ctypes.POINTER(wintypes.DWORD))(
        (b"GetConsoleMode", kernel32))
    INVALID_HANDLE_VALUE = wintypes.DWORD(-1).value

    def not_a_console(handle):
        # An invalid or missing handle can never be a console
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        file_type = GetFileType(handle) & ~FILE_TYPE_REMOTE
        if file_type != FILE_TYPE_CHAR:
            return True
        return GetConsoleMode(handle, ctypes.byref(wintypes.DWORD())) == 0

    if not_a_console(h):
        return False

    def next_nonbmp_pos(text):
        # Index of the first character above U+FFFF, or len(text) if none
        for pos, char in enumerate(text):
            if ord(char) > 0xffff:
                return pos
        return len(text)

    remaining = s
    while remaining:
        count = min(next_nonbmp_pos(remaining), 1024)

        # A count of 0 means the string starts with a non-BMP character,
        # which is written on its own with a length of 2
        ret = WriteConsoleW(
            h, remaining, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            remaining = remaining[1:]
        else:
            assert written.value > 0
            remaining = remaining[written.value:]
    return True
981
982
def write_string(s, out=None, encoding=None):
    """Write the text s to out (sys.stderr when out is None) and flush it.

    On win32, when no encoding is given and out has a fileno, the console
    writer _windows_write_string is tried first. Otherwise s is written as
    encoded bytes (to out or out.buffer) or as text, depending on what the
    stream offers; unencodable characters are ignored.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    use_console_api = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if use_console_api and _windows_write_string(s, out):
        return

    binary_stream = (
        'b' in getattr(out, 'mode', '') or
        sys.version_info[0] < 3)  # Python 2 lies about mode of sys.stderr
    if binary_stream:
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        enc = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(enc, 'ignore'))
    else:
        out.write(s)
    out.flush()
1003
1004
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the items of the byte string bs as a list of integers ([] if bs is empty)."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Indexing gave a one-character string: convert each item with ord()
        return [ord(c) for c in bs]
    return list(bs)  # Python 3
1012
c257baff 1013
def intlist_to_bytes(xs):
    """Pack the integers in xs into a byte string, one 'B' (unsigned byte) item each."""
    if xs:
        return struct_pack('%dB' % len(xs), *xs)
    return b''
c38b1e77
PH
1018
1019
c1c9a79c
PH
1020# Cross-platform file locking
if sys.platform != 'win32':
    import fcntl

    def _lock_file(f, exclusive):
        # flock() the file exclusively or shared, depending on `exclusive`
        lock_mode = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH
        fcntl.flock(f, lock_mode)

    def _unlock_file(f):
        fcntl.flock(f, fcntl.LOCK_UN)

else:
    import ctypes.wintypes
    import msvcrt

    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32

    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL

    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL

    # Low and high halves of the byte count handed to (Un)LockFileEx
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object so that _unlock_file can pass the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags is 0x2 for an exclusive lock and 0x0 otherwise
        flags = 0x2 if exclusive else 0x0
        locked = LockFileEx(
            handle, flags, 0, whole_low, whole_high, f._lock_file_overlapped_p)
        if not locked:
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        unlocked = UnlockFileEx(
            handle, 0, whole_low, whole_high, f._lock_file_overlapped_p)
        if not unlocked:
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())
c1c9a79c
PH
1083
1084
class locked_file(object):
    """File opened with io.open that is locked while used as a context manager.

    Entering takes the lock through _lock_file (shared for mode 'r',
    exclusive for 'a' and 'w'); leaving releases it through _unlock_file
    and closes the file.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.mode = mode
        self.f = io.open(filename, mode, encoding=encoding)

    def __enter__(self):
        try:
            # Only plain readers share the lock
            _lock_file(self.f, self.mode != 'r')
        except IOError:
            # Do not leave the file open when locking failed
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1114
1115
4644ac55
S
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' when that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1119
1120
def shell_quote(args):
    """Return the arguments in args as one shell-quoted, space-separated string.

    Byte-string arguments are decoded with the filesystem encoding before
    being quoted. Quoting goes through shlex_quote (already imported by
    this module and used by args_to_str) instead of the undocumented
    pipes.quote alias.
    """
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            # We may get a filename encoded with 'encodeFilename'
            a = a.decode(get_filesystem_encoding())
        quoted_args.append(shlex_quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1130
1131
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """

    # The JSON-encoded data travels url-encoded in the fragment,
    # under the '__youtubedl_smuggle' key
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return url + '#' + sdata
9d4660ca
PH
1138
1139
def unsmuggle_url(smug_url, default=None):
    """Split smug_url into (url, data).

    When the URL carries no '#__youtubedl_smuggle' part, it is returned
    unchanged together with default. Otherwise everything after the last
    '#' is parsed as a query string and its JSON payload is decoded.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
        return url, json.loads(jsond)
    return smug_url, default
02dbf93f
PH
1147
1148
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count with a 1024-based suffix, e.g. '1.50KiB'.

    bytes may be a number, a numeric string or None. None yields 'N/A'.
    The suffix is clamped to the 'B'..'YiB' range: values below 1
    (including zero, tiny fractions and negatives) are shown in 'B', and
    values beyond the largest suffix are shown in 'YiB' instead of
    raising or picking a wrong suffix.
    """
    if bytes is None:
        return 'N/A'
    if not isinstance(bytes, (int, float)):
        # Numeric strings (of any string type) and other number-likes
        bytes = float(bytes)
    SUFFIXES = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    if bytes < 1.0:
        # math.log is undefined for <= 0 and negative for fractions
        exponent = 0
    else:
        exponent = min(int(math.log(bytes, 1024.0)), len(SUFFIXES) - 1)
    converted = float(bytes) / float(1024 ** exponent)
    return '%.2f%s' % (converted, SUFFIXES[exponent])
f53c966a 1161
1c088fa8 1162
be64b5b0
PH
def parse_filesize(s):
    """Parse a size such as '5 KiB' or '1,5MB' into a number of bytes.

    Returns None when s is None or does not start with a number followed
    by a known unit. A comma in the number is read as the decimal point.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too
    _UNIT_TABLE = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary, decimal = 1024 ** power, 1000 ** power
        _UNIT_TABLE[prefix + 'iB'] = binary
        _UNIT_TABLE[prefix + 'B'] = decimal
        _UNIT_TABLE[prefix.lower() + 'B'] = binary
        _UNIT_TABLE[prefix + 'b'] = decimal

    units_re = '|'.join(re.escape(u) for u in _UNIT_TABLE)
    m = re.match(
        r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % units_re, s)
    if not m:
        return None

    number = float(m.group('num').replace(',', '.'))
    return int(number * _UNIT_TABLE[m.group('unit')])
be64b5b0
PH
1215
1216
caefb1de
PH
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1224
1225
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
        abbreviations """

    # The abbreviation is the first three letters of the full month name
    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
18258362
JMF
1234
1235
def fix_xml_ampersands(xml_str):
    """Replace every bare '&' in XML by '&amp;'.

    An ampersand is left alone when it starts one of the predefined
    entities (amp, lt, gt, apos, quot) or a numeric character reference.
    Numeric references accept 1-6 hex digits or 1-7 decimal digits so
    that characters outside the BMP (e.g. '&#x1F600;') are not escaped
    a second time.
    """
    return re.sub(
        r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{1,6};|#[0-9]{1,7};)',
        '&amp;',
        xml_str)
e3946f98
PH
1242
1243
def setproctitle(title):
    """Best-effort attempt to set the process name through libc's prctl.

    Does nothing when libc.so.6 cannot be loaded or offers no prctl.
    title must be a compat_str; it is passed on UTF-8 encoded.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes allocates one extra byte for
    # the NUL terminator; a buffer of exactly len(title_bytes) would hand
    # prctl an unterminated string.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1257
1258
def remove_start(s, start):
    """Return s without its leading `start`; s is returned as is if it does not begin with it."""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1263
1264
2b9faf55
PH
def remove_end(s, end):
    """Return s without its trailing `end`; s is returned as is if it does not end with it.

    An empty `end` leaves s untouched (s[:-0] would otherwise wipe the
    whole string).
    """
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1269
1270
def url_basename(url):
    """Return the last segment of the URL's path, ignoring leading and trailing slashes."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    # Everything after the last '/', or the whole path if there is none
    return trimmed_path.rpartition('/')[2]
aa94a6d3
PH
1274
1275
class HEADRequest(compat_urllib_request.Request):
    """Request whose get_method() always answers 'HEAD'."""

    def get_method(self):
        return 'HEAD'
7217e148
PH
1279
1280
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, or return default when there is no value.

    With get_attr set, the value is first read from that attribute of v
    (a missing attribute counts as no value). None and '' both count as
    no value. The result is int(v) * invscale // scale.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v is None or v == '':
        return default
    return int(v) * invscale // scale
1288
9572013d 1289
40a90862
JMF
def str_or_none(v, default=None):
    """Return v converted with compat_str, or default when v is None."""
    if v is None:
        return default
    return compat_str(v)
1292
9732d77e
PH
1293
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop commas, dots and plus signs before converting
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1300
1301
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default when v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1304
1305
def parse_duration(s):
    """Parse a duration string such as '1:23:45', '3 min' or 'PT1H2S' into seconds.

    Returns None when s is not a string or is not in a recognized format.
    The result is a float when the input is given in minutes or hours
    only or carries fractional seconds, and an int otherwise.
    """
    if not isinstance(s, compat_basestring):
        return None

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if not m:
        return None

    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    # Seconds contributed by one unit of each integer group
    SECONDS_PER_UNIT = (
        ('secs', 1),
        ('mins_reversed', 60),
        ('mins', 60),
        ('hours', 60 * 60),
        ('hours_reversed', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    res = 0
    for group_name, factor in SECONDS_PER_UNIT:
        value = m.group(group_name)
        if value:
            res += int(value) * factor
    if m.group('ms'):
        # The group includes the leading dot, e.g. '.5'
        res += float(m.group('ms'))
    return res
91d7d0b3
JMF
1350
1351
def prepend_extension(filename, ext, expected_real_ext=None):
    """Insert ext in front of the filename's real extension.

    When expected_real_ext is given and does not match the real
    extension, ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        return '{0}.{1}'.format(filename, ext)
    return '{0}.{1}{2}'.format(name, ext, real_ext)
d70ad093
PH
1358
1359
b3ed15b7
S
def replace_extension(filename, ext, expected_real_ext=None):
    """Replace the filename's real extension by ext.

    When expected_real_ext is given and does not match the real
    extension, ext is appended to the whole filename instead.
    """
    name, real_ext = os.path.splitext(filename)
    if expected_real_ext and real_ext[1:] != expected_real_ext:
        stem = filename
    else:
        stem = name
    return '{0}.{1}'.format(stem, ext)
1365
1366
d70ad093
PH
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    command = [exe] + args  # builds a new list; the default is never mutated
    try:
        process = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        process.communicate()
    except OSError:
        return False
    return exe
b7ab0590
PH
1375
1376
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        # stderr is folded into stdout so either stream can carry the version
        process = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out, _ = process.communicate()
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1390
1391
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Return the first group of version_re found in output, else unrecognized.

    Without a version_re, the text following 'version' is looked for.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    m = re.search(pattern, output)
    return m.group(1) if m else unrecognized
1401
1402
class PagedList(object):
    """Base for paged lists; its length is the length of self.getslice()."""

    def __len__(self):
        # This is only useful for tests
        everything = self.getslice()
        return len(everything)
1407
9c44d242
PH
1408
class OnDemandPagedList(PagedList):
    """Paged list that calls pagefunc(pagenum) page by page until a slice is filled."""

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the items with indices in [start, end) as a list (to the last page if end is None)."""
        pagesize = self._pagesize
        collected = []
        for pagenum in itertools.count(start // pagesize):
            first_id = pagenum * pagesize
            next_first_id = first_id + pagesize
            if start >= next_first_id:
                continue

            page = list(self._pagefunc(pagenum))

            # Offset into this page, non-zero only on the page holding `start`
            if first_id <= start < next_first_id:
                offset = start % pagesize
            else:
                offset = 0

            # Cut-off within this page, set only on the page holding `end`
            if end is not None and first_id <= end <= next_first_id:
                limit = ((end - 1) % pagesize) + 1
            else:
                limit = None

            if offset != 0 or limit is not None:
                page = page[offset:limit]
            collected.extend(page)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page) + offset < pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == next_first_id:
                break
        return collected
81c2f20b
PH
1450
1451
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list whose number of pages (pagecount) is known up front."""

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the items with indices in [start, end) as a list (through the last page if end is None)."""
        first_page = start // self._pagesize
        if end is None:
            stop_page = self._pagecount
            remaining = None
        else:
            stop_page = end // self._pagesize + 1
            remaining = end - start
        # Items to drop from the front of the first page
        to_skip = start - first_page * self._pagesize

        items = []
        for pagenum in range(first_page, stop_page):
            page = list(self._pagefunc(pagenum))
            if to_skip:
                page = page[to_skip:]
                to_skip = None
            if remaining is not None:
                if len(page) >= remaining:
                    # This page completes the requested slice
                    items.extend(page[:remaining])
                    break
                remaining -= len(page)
            items.extend(page)
        return items
1479
1480
def uppercase_escape(s):
    """Replace every '\\UXXXXXXXX' escape sequence (8 hex digits) in s by the character it denotes."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def _decode(match):
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _decode, s)
0fe2ff78
YCH
1487
1488
def lowercase_escape(s):
    """Replace every '\\uXXXX' escape sequence (4 hex digits) in s by the character it denotes."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def _decode(match):
        return decode_escape(match.group(0))[0]

    return re.sub(r'\\u[0-9a-fA-F]{4}', _decode, s)
b53466e1 1495
d05cfe06
S
1496
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    on_python2 = sys.version_info < (3, 0)
    if on_python2 and isinstance(s, compat_str):
        s = s.encode('utf-8')
    # Characters listed here are left unescaped by quote()
    safe_chars = b"%/;:@&=+$,!~*'()?#[]"
    return compat_urllib_parse.quote(s, safe_chars)
d05cfe06
S
1502
1503
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # Only these four components are escaped; the rest is kept as parsed
    path = escape_rfc3986(parts.path)
    params = escape_rfc3986(parts.params)
    query = escape_rfc3986(parts.query)
    fragment = escape_rfc3986(parts.fragment)
    escaped = parts._replace(
        path=path, params=params, query=query, fragment=fragment)
    return escaped.geturl()
1513
try:
    # Probe whether struct accepts a text format string
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def struct_pack(spec, *args):
        byte_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.pack(byte_spec, *args)

    def struct_unpack(spec, *args):
        byte_spec = spec.encode('ascii') if isinstance(spec, compat_str) else spec
        return struct.unpack(byte_spec, *args)
else:
    struct_pack = struct.pack
    struct_unpack = struct.unpack
62e609ab
PH
1530
1531
def read_batch_urls(batch_fd):
    """Read the URLs listed in batch_fd, one per line, and close it.

    Byte lines are decoded as UTF-8. A leading byte order mark and
    surrounding whitespace are stripped from each line. Empty lines and
    lines starting with '#', ';' or ']' are skipped. Returns the list of
    remaining URLs.
    """
    # The BOM shows up as '\ufeff' once decoded as UTF-8, or as the three
    # characters '\xef\xbb\xbf' when its bytes were decoded one by one
    BOMS = ('\xef\xbb\xbf', '\ufeff')

    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        for bom in BOMS:
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1546
1547
def urlencode_postdata(*args, **kargs):
    """Return compat_urllib_parse.urlencode(...) of the arguments, encoded to ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1550
1551
0990305d
PH
# Prefer Element.iter; fall back to a findall-based stand-in when it is absent
if hasattr(xml.etree.ElementTree.Element, 'iter'):
    etree_iter = xml.etree.ElementTree.Element.iter
else:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1556
1557
bcf89ce6
PH
def parse_xml(s):
    """Parse the XML text s into an element tree, ignoring doctype declarations.

    s is encoded as UTF-8 before parsing. On Python 2, byte-string text
    nodes are decoded back to text afterwards.
    """
    class _DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=_DoctypeIgnoringBuilder())
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)

    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for node in etree_iter(tree):
            if node.text is not None and not isinstance(node.text, compat_str):
                node.text = node.text.decode('utf-8')
    return tree
e68301af
PH
1573
1574
a1a530b0
PH
# Age limit associated with each US rating label
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}


def parse_age_limit(s):
    """Turn an age limit string into an int.

    Accepts one or two digits with an optional trailing '+' (e.g. '18+')
    or a key of US_RATINGS. Returns None for None and for anything else.
    """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
146c80e2
S
1589
1590
def strip_jsonp(code):
    """Return what is between the parentheses of a JSONP call such as 'cb({...});'.

    Text that does not look like such a call is returned unchanged.
    """
    jsonp_call = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_call, r'\1', code)
478c2c61
PH
1594
1595
e05f6939
PH
def js_to_json(code):
    """Rewrite a JavaScript object literal so that it reads as JSON.

    Single-quoted strings and bare identifiers become double-quoted
    strings (true, false and null are kept), and a comma directly before
    a closing ']' or '}' is dropped.
    """
    # How the sequences found inside a single-quoted string are rewritten
    REQUOTE = {
        '\\\\': '\\\\',
        "\\'": "'",
        '"': '\\"',
    }

    def fix_kv(m):
        v = m.group(0)
        if v in ('true', 'false', 'null') or v.startswith('"'):
            return v
        if v.startswith("'"):
            v = re.sub(
                r"\\\\|\\'|\"", lambda esc: REQUOTE[esc.group(0)], v[1:-1])
        return '"%s"' % v

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Remove trailing commas
    return re.sub(r',(\s*[\]}])', lambda m: m.group(1), res)
1619
1620
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # Position in quality_ids, or -1 for an unknown id
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1629
acd69589
PH
1630
# Default output template, filled from the 'title', 'id' and 'ext' fields
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1632
a020a0dc
PH
1633
def limit_length(s, length):
    """ Add ellipses to overly long strings

    Returns None for None, s itself if it is at most `length` characters
    long, and otherwise a string of exactly `length` characters ending in
    '...'. When `length` leaves no room for any text, only (part of) the
    ellipses is returned so that the result never exceeds `length`.
    """
    if s is None:
        return None
    ELLIPSES = '...'
    if len(s) <= length:
        return s
    if length <= len(ELLIPSES):
        # A negative slice bound would otherwise keep nearly all of s
        return ELLIPSES[:max(length, 0)]
    return s[:length - len(ELLIPSES)] + ELLIPSES
48844745
PH
1642
1643
def version_tuple(v):
    """Split the version string v at '-' and '.' and return its parts as a tuple of ints."""
    parts = re.split(r'[-.]', v)
    return tuple(map(int, parts))
48844745
PH
1646
1647
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    When version is empty or either version cannot be parsed, the answer
    is `not assume_new`.
    """
    fallback = not assume_new
    if not version:
        return fallback
    try:
        outdated = version_tuple(version) < version_tuple(limit)
    except ValueError:
        return fallback
    return outdated
732ea2f0
PH
1655
1656
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when sys has a 'frozen' attribute or this module came from a zip
    if hasattr(sys, 'frozen'):
        return True
    return isinstance(globals().get('__loader__'), zipimporter)
7d4111ed
PH
1662
1663
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
2ccd1b10
PH
1667
1668
c460bdd5
PH
def mimetype2ext(mt):
    """Map a MIME type to a file extension.

    The part after the last '/' is used as the extension, except for the
    few subtypes listed below that have an extension of their own.
    """
    SUBTYPE_TO_EXT = {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
    }
    subtype = mt.rpartition('/')[2]
    return SUBTYPE_TO_EXT.get(subtype, subtype)
1676
1677
2ccd1b10
PH
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the response headers of url_handle.

    The filename of an 'attachment' Content-Disposition header wins when
    it yields an extension; otherwise the Content-Type header is mapped
    with mimetype2ext.
    """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(h):
            return url_handle.headers[h]

    cd = getheader('Content-Disposition')
    if cd:
        m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd)
        if m:
            ext = determine_ext(m.group('filename'), default_ext=None)
            if ext:
                return ext

    return mimetype2ext(getheader('Content-Type'))
05900629
PH
1694
1695
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
1704
1705
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # Longer marks come before the shorter ones they start with
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    # Without a byte order mark the data is read as UTF-8 from the start
    encoding, payload = 'utf-8', first_bytes
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            encoding, payload = enc, first_bytes[len(bom):]
            break
    s = payload.decode(encoding, 'replace')

    # HTML is assumed when the first non-blank character is '<'
    return re.match(r'^\s*<', s)
a055469f
PH
1724
1725
def determine_protocol(info_dict):
    """Return the protocol for info_dict.

    An explicit 'protocol' entry wins. Otherwise it is derived from the
    'url' entry: first by its rtmp/mms/rtsp prefix, then by an m3u8 or
    f4m extension, and finally by the URL scheme.
    """
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(prefix):
            return prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
1746
1747
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    # Widest cell of each column, measured on the text form of the values
    widths = [
        max(len(compat_str(cell)) for cell in column)
        for column in zip(*rows)]
    # Every column but the last is left-aligned and padded to width + 1
    padded_columns = ['%%-%ds' % (width + 1) for width in widths[:-1]]
    format_str = ' '.join(padded_columns) + '%s'
    return '\n'.join(format_str % tuple(row) for row in rows)
347de493
PH
1754
1755
def _match_one(filter_part, dct):
    """Evaluate one filter expression against dct.

    Supports 'key OP value' comparisons (with an optional '?' after the
    operator that decides the outcome for missing values) and the unary
    forms 'key' and '!key'. Raises ValueError for anything else.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op_name = m.group('op')
        op = COMPARISON_OPERATORS[op_name]
        strval = m.group('strval')
        if strval is not None:
            if op_name not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % op_name)
            comparison_value = strval
        else:
            intval = m.group('intval')
            try:
                comparison_value = int(intval)
            except ValueError:
                # Not a plain integer: try it as a file size, with and
                # without an appended 'B'
                comparison_value = parse_filesize(intval)
                if comparison_value is None:
                    comparison_value = parse_filesize(intval + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            intval, filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # Missing values pass only if the '?' marker was given
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        unary_op = UNARY_OPERATORS[m.group('op')]
        return unary_op(dct.get(m.group('key')))

    raise ValueError('Invalid filter part %r' % filter_part)


def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # The '&'-separated parts must all match
    parts = filter_str.split('&')
    return all(_match_one(part, dct) for part in parts)
1820
1821
def match_filter_func(filter_str):
    """Build a function that checks an info dict against filter_str.

    The returned function gives None when the dict passes match_str, and
    otherwise a message saying that the video is skipped.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        # Name the video by its title, else its id, else just 'video'
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
91410c9b
PH
1830
1831
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP time expression into seconds as a float.

    Understands an offset such as '12.5' or '12.5s' and a clock time such
    as '01:02:03.5'. An empty expression gives 0.0; anything else that is
    not recognized gives None.
    """
    if not time_expr:
        return 0.0

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds)

    return None
1843
1844
def format_srt_time(seconds):
    """Format a number of seconds as an SRT timestamp 'HH:MM:SS,mmm'.

    The value is rounded to whole milliseconds first, so that float
    representation error cannot drop a millisecond (1.001 gives ',001'
    rather than ',000').
    """
    total_millisecs = int(round(seconds * 1000))
    (secs, millisecs) = divmod(total_millisecs, 1000)
    (mins, secs) = divmod(secs, 60)
    (hours, mins) = divmod(mins, 60)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millisecs)
1851
1852
def dfxp2srt(dfxp_data):
    """Convert DFXP (TTML) subtitle text into SRT text.

    Every 'ttml:p' element becomes one numbered SRT entry, timed by its
    'begin' and 'end' attributes.
    """
    _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})

    def parse_node(node):
        str_or_empty = functools.partial(str_or_none, default='')

        pieces = [str_or_empty(node.text)]
        for child in node:
            if child.tag == _x('ttml:br'):
                # A line break, followed by the text after the <br/>
                pieces.append('\n' + str_or_empty(child.tail))
            elif child.tag == _x('ttml:span'):
                pieces.append(str_or_empty(parse_node(child)))
            else:
                # Any other element is kept in its serialized form
                pieces.append(str_or_empty(xml.etree.ElementTree.tostring(child)))
        return ''.join(pieces)

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
    paras = dfxp.findall(_x('.//ttml:p'))

    entries = []
    for index, para in enumerate(paras, 1):
        begin = format_srt_time(parse_dfxp_time_expr(para.attrib.get('begin')))
        end = format_srt_time(parse_dfxp_time_expr(para.attrib.get('end')))
        entries.append('%d\n%s --> %s\n%s\n\n' % (
            index, begin, end, parse_node(para)))

    return ''.join(entries)
1883
1884
91410c9b 1885class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
2461f79d
PH
1886 def __init__(self, proxies=None):
1887 # Set default handlers
1888 for type in ('http', 'https'):
1889 setattr(self, '%s_open' % type,
1890 lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open:
1891 meth(r, proxy, type))
1892 return compat_urllib_request.ProxyHandler.__init__(self, proxies)
1893
91410c9b 1894 def proxy_open(self, req, proxy, type):
2461f79d 1895 req_proxy = req.headers.get('Ytdl-request-proxy')
91410c9b
PH
1896 if req_proxy is not None:
1897 proxy = req_proxy
2461f79d
PH
1898 del req.headers['Ytdl-request-proxy']
1899
1900 if proxy == '__noproxy__':
1901 return None # No Proxy
91410c9b
PH
1902 return compat_urllib_request.ProxyHandler.proxy_open(
1903 self, req, proxy, type)