]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
[baiduvideo] Add new extractor (closes #4563)
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
ecc0c5ee
PH
4from __future__ import unicode_literals
5
912b38b4 6import calendar
676eb3f2 7import codecs
62e609ab 8import contextlib
e3946f98 9import ctypes
c496ca96
PH
10import datetime
11import email.utils
f45c185f 12import errno
be4a824d 13import functools
d77c3dfd 14import gzip
b7ab0590 15import itertools
03f9daab 16import io
f4bfd65f 17import json
d77c3dfd 18import locale
02dbf93f 19import math
347de493 20import operator
d77c3dfd 21import os
4eb7f1d1 22import pipes
c496ca96 23import platform
d77c3dfd 24import re
13ebea79 25import ssl
c496ca96 26import socket
b53466e1 27import struct
1c088fa8 28import subprocess
d77c3dfd 29import sys
181c8655 30import tempfile
01951dda 31import traceback
bcf89ce6 32import xml.etree.ElementTree
d77c3dfd 33import zlib
d77c3dfd 34
8c25f81b 35from .compat import (
8f9312c3 36 compat_basestring,
8c25f81b 37 compat_chr,
8c25f81b 38 compat_html_entities,
be4a824d 39 compat_http_client,
c86b6142 40 compat_kwargs,
8c25f81b 41 compat_parse_qs,
be4a824d 42 compat_socket_create_connection,
8c25f81b
PH
43 compat_str,
44 compat_urllib_error,
45 compat_urllib_parse,
46 compat_urllib_parse_urlparse,
47 compat_urllib_request,
48 compat_urlparse,
7d4111ed 49 shlex_quote,
8c25f81b 50)
4644ac55
S
51
52
468e2e92
FV
# The re module does not expose the class of compiled patterns under a
# public name, so take it from an actual compiled pattern.
compiled_regex_type = type(re.compile(''))

# Headers that YoutubeDLHandler.http_request adds to a request when the
# request does not already carry a header of the same name.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/20.0 (Chrome)',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}


# English month names in calendar order (index 0 is January).
ENGLISH_MONTH_NAMES = [
    'January', 'February', 'March', 'April',
    'May', 'June', 'July', 'August',
    'September', 'October', 'November', 'December',
]
68
69
def preferredencoding():
    """Return the name of an encoding that is usable on this system.

    The locale's preferred encoding is returned when it can actually encode
    text; if looking it up or using it fails for any reason, 'UTF-8' is
    returned instead.
    """
    try:
        encoding_name = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable encoding raises here.
        'TEST'.encode(encoding_name)
        return encoding_name
    except Exception:
        return 'UTF-8'
d77c3dfd 83
f4bfd65f 84
def write_json_file(obj, fn):
    """ Encode obj as JSON and write it to fn, atomically if possible

    The JSON is first written to a temporary file created next to fn and
    then renamed over fn. If anything fails, the temporary file is removed
    (best effort) and the exception is re-raised.
    """

    fn = encodeFilename(fn)
    if sys.version_info < (3, 0) and sys.platform != 'win32':
        # get_subprocess_encoding() is the helper this module defines for
        # the filesystem encoding; on this (non-Windows) branch it returns
        # sys.getfilesystemencoding(), falling back to 'utf-8'.
        encoding = get_subprocess_encoding()

        # os.path.basename returns a bytes object, but NamedTemporaryFile
        # will fail if the filename contains non ascii characters unless we
        # use a unicode object
        def path_basename(f):
            return os.path.basename(f).decode(encoding)

        # the same for os.path.dirname
        def path_dirname(f):
            return os.path.dirname(f).decode(encoding)
    else:
        path_basename = os.path.basename
        path_dirname = os.path.dirname

    args = {
        'suffix': '.tmp',
        'prefix': path_basename(fn) + '.',
        'dir': path_dirname(fn),
        'delete': False,
    }

    # In Python 2.x, json.dump expects a bytestream.
    # In Python 3.x, it writes to a character stream
    if sys.version_info < (3, 0):
        args['mode'] = 'wb'
    else:
        args.update({
            'mode': 'w',
            'encoding': 'utf-8',
        })

    tf = tempfile.NamedTemporaryFile(**compat_kwargs(args))

    try:
        with tf:
            json.dump(obj, tf)
        if sys.platform == 'win32':
            # Need to remove existing file on Windows, else os.rename raises
            # WindowsError or FileExistsError.
            try:
                os.unlink(fn)
            except OSError:
                pass
        os.rename(tf.name, fn)
    except Exception:
        # Do not leave the temporary file behind; the original error is
        # what the caller needs to see, so cleanup failures are ignored.
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
137
138
if sys.version_info >= (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression unescaped, so
        # only a restricted character set is accepted for each.
        assert re.match(r'^[a-zA-Z-]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s:._-]*$', val)
        predicate = "[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the first element matching xpath whose attribute key equals val """
        # Here comes the crazy part: In 2.6, if the xpath is a unicode,
        # .//node does not match if a node is a direct child of . !
        if isinstance(xpath, compat_str):
            xpath = xpath.encode('ascii')

        candidates = (
            el for el in node.findall(xpath) if el.attrib.get(key) == val)
        return next(candidates, None)
157
d7e66d39
JMF
158# On python2.6 the xml.etree.ElementTree.Element methods don't support
159# the namespace parameter
5f6a1245
JW
160
161
d7e66d39
JMF
def xpath_with_ns(path, ns_map):
    """Expand 'prefix:tag' steps of path into '{uri}tag' form.

    ns_map maps each prefix used in path to the string substituted for it.
    Steps without a colon are left unchanged.
    """
    def expand(step):
        pieces = step.split(':')
        if len(pieces) == 1:
            return pieces[0]
        prefix, tag = pieces
        return '{%s}%s' % (ns_map[prefix], tag)

    return '/'.join(expand(step) for step in path.split('/'))
172
d77c3dfd 173
def xpath_text(node, xpath, name=None, fatal=False):
    """Return the text of the first element under node matching xpath.

    When no element matches, or the match has no text, None is returned
    unless fatal is set, in which case ExtractorError is raised. name is
    used in the error message instead of xpath when given.
    """
    if sys.version_info < (2, 7):  # Crazy 2.6
        xpath = xpath.encode('ascii')

    found = node.find(xpath)
    if found is not None and found.text is not None:
        return found.text
    if not fatal:
        return None
    label = xpath if name is None else name
    raise ExtractorError('Could not find XML element %s' % label)
186
187
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin convenience wrapper: an ID lookup is an attribute lookup on "id".
    return get_element_by_attribute(attribute='id', value=id, html=html)
191
12ea2f30 192
43e8fafd
ND
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""

    # Verbose pattern: an opening tag that has attribute=value (optionally
    # quoted) among its attributes, the lazily matched content, and the
    # closing tag of the same name.
    pattern = r'''(?xs)
        <([a-zA-Z0-9:._-]+)
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s+%s=['"]?%s['"]?
         (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]+|="[^"]+"|='[^']+'))*?
         \s*>
        (?P<content>.*?)
        </\1>
    ''' % (re.escape(attribute), re.escape(value))

    match = re.search(pattern, html)
    if match is None:
        return None

    content = match.group('content')
    # Content that starts with a quote character loses its first and last
    # character.
    if content.startswith(('"', "'")):
        content = content[1:-1]

    return unescapeHTML(content)
a921f407 214
9e6dd238
FV
215
def clean_html(html):
    """Clean an HTML snippet into a readable string"""

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newlines in the markup become spaces; line breaks in the result come
    # only from <br> tags and from a </p> directly followed by a <p>.
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Strip html tags
    text = re.sub('<.*?>', '', text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
231
232
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it retries once with
    the name returned by sanitize_path(); if that name is identical, or the
    failure was a permission error (EACCES), the original exception is
    re-raised, like the standard open() function.

    The filename '-' selects standard output (its binary buffer when it has
    one).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == '-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            out = sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout
            return (out, filename)
        return (open(encodeFilename(filename), open_mode), filename)
    except (IOError, OSError) as err:
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars
        alt_filename = sanitize_path(filename)
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller
        return (open(encodeFilename(alt_filename), open_mode), alt_filename)
d77c3dfd
FV
263
264
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp

    Returns None when the string cannot be parsed.
    """
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 272
5f6a1245 273
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted and (
                char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127):
            return '_'
        return char

    # Handle timestamps
    s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s)
    result = ''.join(replace_insane(char) for char in s)
    if is_id:
        # IDs only get the per-character replacement; no further tidying.
        return result

    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    if result.startswith('-'):
        result = '_' + result[len('-'):]
    result = result.lstrip('.')
    return result or '_'
d77c3dfd 310
5f6a1245 311
a2aaf4db
S
def sanitize_path(s):
    """Sanitizes and normalizes path on Windows

    On every other platform the path is returned unchanged.
    """
    if sys.platform != 'win32':
        return s
    drive_or_unc, _ = os.path.splitdrive(s)
    if sys.version_info < (2, 7) and not drive_or_unc:
        drive_or_unc, _ = os.path.splitunc(s)
    parts = os.path.normpath(remove_start(s, drive_or_unc)).split(os.path.sep)
    if drive_or_unc:
        parts.pop(0)
    sanitized = []
    for part in parts:
        # '.' and '..' are kept as they are; in any other component the
        # listed characters and a trailing dot are replaced with '#'.
        if part not in ['.', '..']:
            part = re.sub('(?:[/<>:"\\|\\\\?\\*]|\.$)', '#', part)
        sanitized.append(part)
    if drive_or_unc:
        sanitized.insert(0, drive_or_unc + os.path.sep)
    return os.path.join(*sanitized)
328
329
92a4793b
S
def sanitize_url_path_consecutive_slashes(url):
    """Collapses consecutive slashes in URLs' path"""
    components = list(compat_urlparse.urlparse(url))
    # Index 2 of the parse result is the path; the other components are
    # passed through untouched.
    components[2] = re.sub(r'/{2,}', '/', components[2])
    return compat_urlparse.urlunparse(components)
335
336
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable

    Returns a list holding the first occurrence of each element, in the
    order encountered. Elements are compared by equality.
    """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd 344
912b38b4 345
4e408e47
PH
def _htmlentity_transform(entity):
    """Transforms an HTML entity to a character.

    entity is the text between '&' and ';'. Named entities are looked up
    in compat_html_entities.name2codepoint; numeric entities ('#123',
    '#x7b') are converted with compat_chr. Anything that cannot be
    converted is returned in its literal '&...;' form.
    """
    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            base = 16
            numstr = '0%s' % numstr
        else:
            base = 10
        # A syntactically valid reference may still name a number that is
        # not a code point (e.g. '#2013266066'). Fall through to the
        # literal form instead of letting the error escape.
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            pass

    # Unknown entity in name, return its literal representation
    return ('&%s;' % entity)
4e408e47
PH
364
365
def unescapeHTML(s):
    """Replace every '&...;' sequence in s via _htmlentity_transform.

    None is passed through unchanged; any other value must be a compat_str.
    """
    if s is None:
        return None
    assert type(s) == compat_str

    def replace_entity(match):
        return _htmlentity_transform(match.group(1))

    return re.sub(r'&([^;]+);', replace_entity, s)
d77c3dfd 373
8bf48f23 374
aa49acd1
S
def get_subprocess_encoding():
    """Return the encoding name used for file names and subprocess arguments.

    Falls back to 'utf-8' when no encoding is reported.
    """
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        # For subprocess calls, encode with locale encoding
        # Refer to http://stackoverflow.com/a/9951851/35070
        encoding = preferredencoding()
    else:
        encoding = sys.getfilesystemencoding()
    return 'utf-8' if encoding is None else encoding
385
386
def encodeFilename(s, for_subprocess=False):
    """
    @param s The name of the file

    Returns s unchanged on Python 3 and, unless for_subprocess is set, on
    Windows 2000 and up; otherwise returns s encoded with
    get_subprocess_encoding(), dropping characters that cannot be encoded.
    """

    assert type(s) == compat_str

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass '' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    use_unicode_api = (
        not for_subprocess and
        sys.platform == 'win32' and
        sys.getwindowsversion()[0] >= 5)
    if use_unicode_api:
        return s

    return s.encode(get_subprocess_encoding(), 'ignore')
405
406
def decodeFilename(b, for_subprocess=False):
    """Decode a byte-string file name with get_subprocess_encoding().

    On Python 3, and for values that are not bytes, b is returned as is.
    for_subprocess is accepted but not consulted here.
    """
    if sys.version_info >= (3, 0) or not isinstance(b, bytes):
        return b

    return b.decode(get_subprocess_encoding(), 'ignore')
8bf48f23 416
f07b74fc
PH
417
def encodeArgument(s):
    """Encode a command-line argument via encodeFilename(s, True)."""
    if isinstance(s, compat_str):
        return encodeFilename(s, True)
    # Legacy code that uses byte strings
    # Uncomment the following line after fixing all post processors
    # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s))
    return encodeFilename(s.decode('ascii'), True)
425
426
aa49acd1
S
def decodeArgument(b):
    """Decode a command-line argument; counterpart of encodeArgument()."""
    return decodeFilename(b, for_subprocess=True)
429
430
8271226a
PH
def decodeOption(optval):
    """Return an option value as text.

    None is passed through; bytes are decoded with preferredencoding().
    The result must be a compat_str.
    """
    if optval is None:
        return optval
    decoded = optval.decode(preferredencoding()) if isinstance(optval, bytes) else optval

    assert isinstance(decoded, compat_str)
    return decoded
1c256f70 439
5f6a1245 440
4539dd30
PH
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or 'S'.

    The shortest form that can hold the value is used. The thresholds are
    inclusive, so exactly one hour is '1:00:00' (not '60:00') and exactly
    one minute is '1:00' (not '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
448
a0ddb8a2 449
be4a824d
PH
def make_HTTPS_handler(params, **kwargs):
    """Build a YoutubeDLHTTPSHandler suited to the running Python version.

    Certificate verification is switched off when params has a true
    'nocheckcertificate' entry. Extra keyword arguments are forwarded to
    the handler.
    """
    skip_verification = params.get('nocheckcertificate', False)
    if hasattr(ssl, 'create_default_context'):  # Python >= 3.4 or 2.7.9
        context = ssl.create_default_context(ssl.Purpose.SERVER_AUTH)
        if skip_verification:
            context.check_hostname = False
            context.verify_mode = ssl.CERT_NONE
        try:
            return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
        except TypeError:
            # Python 2.7.8
            # (create_default_context present but HTTPSHandler has no context=)
            pass

    if sys.version_info < (3, 2):
        return YoutubeDLHTTPSHandler(params, **kwargs)

    # Python < 3.4
    context = ssl.SSLContext(ssl.PROTOCOL_TLSv1)
    if skip_verification:
        context.verify_mode = ssl.CERT_NONE
    else:
        context.verify_mode = ssl.CERT_REQUIRED
    context.set_default_verify_paths()
    return YoutubeDLHTTPSHandler(params, context=context, **kwargs)
ea6d901e 473
732ea2f0 474
08f2a92c
JMF
def bug_reports_message():
    """Return the text appended to error messages asking for a bug report.

    The update hint depends on ytdl_is_updateable().
    """
    if ytdl_is_updateable():
        update_cmd = 'type youtube-dl -U to update'
    else:
        update_cmd = 'see https://yt-dl.org/update on how to update'
    return ''.join((
        '; please report this issue on https://yt-dl.org/bug .',
        ' Make sure you are using the latest version; %s.' % update_cmd,
        ' Be sure to call youtube-dl with the --verbose flag and include its complete output.',
    ))
484
485
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None, video_id=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """

        # An error created while a URL error, socket timeout or
        # UnavailableVideoError is being handled counts as expected.
        expected_types = (
            compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        if sys.exc_info()[0] in expected_types:
            expected = True

        # Assemble the final message: video id prefix, cause, bug-report hint.
        if video_id is not None:
            msg = video_id + ': ' + msg
        if cause:
            msg += ' (caused by %r)' % cause
        if not expected:
            msg += bug_reports_message()
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause
        self.video_id = video_id

    def format_traceback(self):
        """Return the stored traceback as one string, or None if there is none."""
        if self.traceback is None:
            return None
        frames = traceback.format_tb(self.traceback)
        return ''.join(frames)
01951dda 513
1c256f70 514
416c7fcb
PH
class UnsupportedError(ExtractorError):
    """Expected extraction error for a URL that is not supported."""

    def __init__(self, url):
        message = 'Unsupported URL: %s' % url
        super(UnsupportedError, self).__init__(message, expected=True)
        self.url = url
520
521
55b3e45b
JMF
class RegexNotFoundError(ExtractorError):
    """Error when a regex didn't match"""
525
526
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
539
540
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
548
549
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Initialise the base class so the message is carried in args and
        # shown by str(); NOTE(review): without this, Python 2 is believed
        # to leave args empty - confirm on a 2.x interpreter.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd 559
5f6a1245 560
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
564
565
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
d77c3dfd
FV
573
574
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Class-level defaults; both values are byte counts.
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 589
5f6a1245 590
c5a59d93 591def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs):
be4a824d
PH
592 hc = http_class(*args, **kwargs)
593 source_address = ydl_handler._params.get('source_address')
594 if source_address is not None:
595 sa = (source_address, 0)
596 if hasattr(hc, 'source_address'): # Python 2.7+
597 hc.source_address = sa
598 else: # Python 2.6
599 def _hc_connect(self, *args, **kwargs):
600 sock = compat_socket_create_connection(
601 (self.host, self.port), self.timeout, sa)
602 if is_https:
d7932313
PH
603 self.sock = ssl.wrap_socket(
604 sock, self.key_file, self.cert_file,
605 ssl_version=ssl.PROTOCOL_TLSv1)
be4a824d
PH
606 else:
607 self.sock = sock
608 hc.connect = functools.partial(_hc_connect, hc)
609
610 return hc
611
612
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    def __init__(self, params, *args, **kwargs):
        compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs)
        self._params = params

    def http_open(self, req):
        connection_factory = functools.partial(
            _create_http_connection, self, compat_http_client.HTTPConnection, False)
        return self.do_open(connection_factory, req)

    @staticmethod
    def deflate(data):
        # Try a raw deflate stream first, then a zlib-wrapped one.
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        addinfourl = compat_urllib_request.addinfourl
        if hasattr(addinfourl, 'getcode'):
            return addinfourl(stream, headers, url, code)
        # No getcode(): build without the status code and attach it.
        ret = addinfourl(stream, headers, url)
        ret.code = code
        return ret

    def http_request(self, req):
        for name, value in std_headers.items():
            # Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
            # The dict keys are capitalized because of this bug by urllib
            if name.capitalize() in req.headers:
                continue
            req.add_header(name, value)
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']

        if sys.version_info < (2, 7) and '#' in req.get_full_url():
            # Python 2.6 is brain-dead when it comes to fragments
            req._Request__original = req._Request__original.partition('#')[0]
            req._Request__r_type = req._Request__r_type.partition('#')[0]

        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            content = resp.read()
            try:
                uncompressed = io.BytesIO(
                    gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb').read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes removed; if nothing
                # decodes, report the error from the first attempt.
                for trim in range(1, 1024):
                    try:
                        trimmed = gzip.GzipFile(
                            fileobj=io.BytesIO(content[:-trim]), mode='rb')
                        uncompressed = io.BytesIO(trimmed.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic goes through the same request/response processing.
    https_request = http_request
    https_response = http_response
bf50b038 704
5de90176 705
be4a824d
PH
class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
    """HTTPS handler that opens connections through _create_http_connection."""

    def __init__(self, params, https_conn_class=None, *args, **kwargs):
        compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs)
        self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection
        self._params = params

    def https_open(self, req):
        # Forward whichever of these attributes this Python version's
        # handler has.
        extra = {}
        if hasattr(self, '_context'):  # python > 2.6
            extra['context'] = self._context
        if hasattr(self, '_check_hostname'):  # python 3.x
            extra['check_hostname'] = self._check_hostname
        connection_factory = functools.partial(
            _create_http_connection, self, self._https_conn_class, True)
        return self.do_open(connection_factory, req, **extra)
be4a824d
PH
721
722
def parse_iso8601(date_str, delimiter='T', timezone=None):
    """ Return a UNIX timestamp from the given date

    date_str must look like 'YYYY-MM-DD<delimiter>HH:MM:SS', optionally
    followed by fractional seconds and 'Z' or a '+HH:MM' / '-HHMM' offset.
    timezone, if given, is a datetime.timedelta offset from UTC and
    disables the suffix detection.

    Returns None when date_str is None or cannot be parsed.
    """

    if date_str is None:
        return None

    if timezone is None:
        m = re.search(
            r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
            date_str)
        if not m:
            timezone = datetime.timedelta()
        else:
            # Cut the matched suffix (fraction and zone designator) off.
            date_str = date_str[:-len(m.group(0))]
            if not m.group('sign'):
                timezone = datetime.timedelta()
            else:
                sign = 1 if m.group('sign') == '+' else -1
                timezone = datetime.timedelta(
                    hours=sign * int(m.group('hours')),
                    minutes=sign * int(m.group('minutes')))
    date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
    try:
        dt = datetime.datetime.strptime(date_str, date_format) - timezone
    except (ValueError, OverflowError):
        # Not a date in the expected layout (or out of datetime's range):
        # report "no timestamp" the same way as for a missing value.
        return None
    return calendar.timegm(dt.timetuple())
747
748
def unified_strdate(date_str, day_first=True):
    """Return a string with the date in the format YYYYMMDD

    A list of known layouts is tried first, then RFC 2822 parsing.
    day_first selects day/month/year over month/day/year for the purely
    numeric layouts. Returns None when date_str is None or no layout
    matches.
    """

    if date_str is None:
        return None
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str):
        date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str)
    # Remove AM/PM + timezone
    date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)

    format_expressions = [
        '%d %B %Y',
        '%d %b %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%b %dst %Y %I:%M%p',
        '%b %dnd %Y %I:%M%p',
        '%b %dth %Y %I:%M%p',
        '%Y %m %d',
        '%Y-%m-%d',
        '%Y/%m/%d',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S',
        '%Y-%m-%d %H:%M:%S.%f',
        '%d.%m.%Y %H:%M',
        '%d.%m.%Y %H.%M',
        '%Y-%m-%dT%H:%M:%SZ',
        '%Y-%m-%dT%H:%M:%S.%fZ',
        '%Y-%m-%dT%H:%M:%S.%f0Z',
        '%Y-%m-%dT%H:%M:%S',
        '%Y-%m-%dT%H:%M:%S.%f',
        '%Y-%m-%dT%H:%M',
    ]
    if day_first:
        format_expressions.extend([
            '%d-%m-%Y',
            '%d.%m.%Y',
            '%d/%m/%Y',
            '%d/%m/%y',
            '%d/%m/%Y %H:%M:%S',
        ])
    else:
        format_expressions.extend([
            '%m-%d-%Y',
            '%m.%d.%Y',
            '%m/%d/%Y',
            '%m/%d/%y',
            '%m/%d/%Y %H:%M:%S',
        ])
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            continue
        # First layout that parses wins; no need to try the rest.
        break
    if upload_date is None:
        timetuple = email.utils.parsedate_tz(date_str)
        if timetuple:
            # The parsed fields may not form a real date (e.g. 31 Feb);
            # treat that like any other unparseable input.
            try:
                upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d')
            except ValueError:
                pass
    return upload_date
812
5f6a1245 813
def determine_ext(url, default_ext='unknown_video'):
    """Guess a file extension from url.

    The guess is whatever follows the last '.' before the first '?'; it is
    returned only if it is purely alphanumeric, otherwise default_ext is
    returned (also when url is None).
    """
    if url is None:
        return default_ext
    without_query = url.partition('?')[0]
    candidate = without_query.rpartition('.')[2]
    return candidate if re.match(r'^[A-Za-z0-9]+$', candidate) else default_ext
73e79f2a 822
5f6a1245 823
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the last extension of filename with '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return '.'.join((stem, sub_lang, sub_format))
d4051a8e 826
5f6a1245 827
def date_from_str(date_str):
    """
    Return a datetime object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?"""
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    if date_str == 'yesterday':
        return today - datetime.timedelta(days=1)

    relative = re.match('(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if relative is None:
        return datetime.datetime.strptime(date_str, "%Y%m%d").date()

    # A bad aproximation? Months count as 30 days and years as 365 days.
    days_per_unit = {'day': 1, 'week': 7, 'month': 30, 'year': 365}
    amount = int(relative.group('time'))
    if relative.group('sign') == '-':
        amount = -amount
    offset = datetime.timedelta(days=amount * days_per_unit[relative.group('unit')])
    return today + offset
5f6a1245
JW
855
856
def hyphenate_date(date_str):
    """
    Convert a date in 'YYYYMMDD' format to 'YYYY-MM-DD' format"""
    parts = re.match(r'^(\d\d\d\d)(\d\d)(\d\d)$', date_str)
    if parts is None:
        # Not eight digits: hand the input back untouched.
        return date_str
    return '-'.join(parts.groups())
865
5f6a1245 866
bd558525
JMF
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A bound that is None is replaced by the smallest / largest date
        the datetime module can represent.
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(start=day, end=day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        candidate = date if isinstance(date, datetime.date) else date_from_str(date)
        return self.start <= candidate <= self.end

    def __str__(self):
        return ' - '.join((self.start.isoformat(), self.end.isoformat()))
c496ca96
PH
896
897
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Byte strings are decoded with the preferred encoding.
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
906
907
b58ddb32
PH
def _windows_write_string(s, out):
    """ Returns True if the string was written using special methods,
    False if it has yet to be written out.

    The special method is the Win32 WriteConsoleW call; it is only used when
    out is file descriptor 1 or 2 and the corresponding standard handle is a
    console. Raises OSError if WriteConsoleW reports a failure."""
    # Adapted from http://stackoverflow.com/a/3259271/35070

    import ctypes
    import ctypes.wintypes

    # File descriptor -> argument for GetStdHandle
    # (-11 is STD_OUTPUT_HANDLE, -12 is STD_ERROR_HANDLE)
    WIN_OUTPUT_IDS = {
        1: -11,
        2: -12,
    }

    try:
        fileno = out.fileno()
    except AttributeError:
        # If the output stream doesn't have a fileno, it's virtual
        return False
    except io.UnsupportedOperation:
        # Some strange Windows pseudo files?
        return False
    if fileno not in WIN_OUTPUT_IDS:
        return False

    GetStdHandle = ctypes.WINFUNCTYPE(
        ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)(
        (b"GetStdHandle", ctypes.windll.kernel32))
    h = GetStdHandle(WIN_OUTPUT_IDS[fileno])

    WriteConsoleW = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR,
        ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD),
        ctypes.wintypes.LPVOID)((b"WriteConsoleW", ctypes.windll.kernel32))
    # Receives the number of characters written by each WriteConsoleW call
    written = ctypes.wintypes.DWORD(0)

    GetFileType = ctypes.WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)((b"GetFileType", ctypes.windll.kernel32))
    FILE_TYPE_CHAR = 0x0002
    FILE_TYPE_REMOTE = 0x8000
    GetConsoleMode = ctypes.WINFUNCTYPE(
        ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE,
        ctypes.POINTER(ctypes.wintypes.DWORD))(
        (b"GetConsoleMode", ctypes.windll.kernel32))
    INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value

    def not_a_console(handle):
        # A console is a character device for which GetConsoleMode succeeds
        if handle == INVALID_HANDLE_VALUE or handle is None:
            return True
        return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR or
                GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0)

    if not_a_console(h):
        return False

    def next_nonbmp_pos(s):
        # Index of the first character above U+FFFF, or len(s) if there is none
        try:
            return next(i for i, c in enumerate(s) if ord(c) > 0xffff)
        except StopIteration:
            return len(s)

    while s:
        # Write at most 1024 characters at a time, stopping before any
        # non-BMP character
        count = min(next_nonbmp_pos(s), 1024)

        # count == 0 means s starts with a non-BMP character, which is
        # written on its own with a length of 2
        ret = WriteConsoleW(
            h, s, count if count else 2, ctypes.byref(written), None)
        if ret == 0:
            raise OSError('Failed to write string')
        if not count:  # We just wrote a non-BMP character
            assert written.value == 2
            s = s[1:]
        else:
            assert written.value > 0
            # Continue after whatever was actually written
            s = s[written.value:]
    return True
981
982
def write_string(s, out=None, encoding=None):
    """Write the text s to the stream out and flush it.

    out defaults to sys.stderr. On Windows, when no encoding is forced, the
    string is handed to _windows_write_string first. Otherwise it is encoded
    (ignoring unencodable characters) for binary streams and for streams with
    a buffer attribute, and written as-is to anything else.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == compat_str

    try_console_api = (
        sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'))
    if try_console_api and _windows_write_string(s, out):
        return

    binary_mode = 'b' in getattr(out, 'mode', '')
    if binary_mode or sys.version_info[0] < 3:  # Python 2 lies about mode of sys.stderr
        out.write(s.encode(encoding or preferredencoding(), 'ignore'))
    elif hasattr(out, 'buffer'):
        target_encoding = encoding or getattr(out, 'encoding', None) or preferredencoding()
        out.buffer.write(s.encode(target_encoding, 'ignore'))
    else:
        out.write(s)
    out.flush()
1003
1004
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a byte string into the list of its integer byte values."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Indexing yielded a 1-character string rather than an int
        return [ord(ch) for ch in bs]
    return list(bs)  # Python 3
1012
c257baff 1013
def intlist_to_bytes(xs):
    """Pack a sequence of integer byte values into a byte string."""
    if xs:
        # One unsigned char ('B') per list item
        return struct_pack('%dB' % len(xs), *xs)
    return b''
c38b1e77
PH
1018
1019
c1c9a79c
PH
# Cross-platform file locking
# Defines _lock_file(f, exclusive) and _unlock_file(f): via the Win32
# LockFileEx/UnlockFileEx calls on Windows, via fcntl.flock everywhere else.
if sys.platform == 'win32':
    import ctypes.wintypes
    import msvcrt

    # Structure passed (by pointer) as the last argument of
    # LockFileEx/UnlockFileEx
    class OVERLAPPED(ctypes.Structure):
        _fields_ = [
            ('Internal', ctypes.wintypes.LPVOID),
            ('InternalHigh', ctypes.wintypes.LPVOID),
            ('Offset', ctypes.wintypes.DWORD),
            ('OffsetHigh', ctypes.wintypes.DWORD),
            ('hEvent', ctypes.wintypes.HANDLE),
        ]

    kernel32 = ctypes.windll.kernel32
    LockFileEx = kernel32.LockFileEx
    LockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwFlags
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    LockFileEx.restype = ctypes.wintypes.BOOL
    UnlockFileEx = kernel32.UnlockFileEx
    UnlockFileEx.argtypes = [
        ctypes.wintypes.HANDLE,     # hFile
        ctypes.wintypes.DWORD,      # dwReserved
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockLow
        ctypes.wintypes.DWORD,      # nNumberOfBytesToLockHigh
        ctypes.POINTER(OVERLAPPED)  # Overlapped
    ]
    UnlockFileEx.restype = ctypes.wintypes.BOOL
    # Low and high DWORD of the number of bytes to lock; together with the
    # zero offset set below this is meant to cover the whole file
    whole_low = 0xffffffff
    whole_high = 0x7fffffff

    def _lock_file(f, exclusive):
        """Lock the open file f; raises OSError if LockFileEx fails."""
        overlapped = OVERLAPPED()
        overlapped.Offset = 0
        overlapped.OffsetHigh = 0
        overlapped.hEvent = 0
        # Kept on the file object because _unlock_file needs the same pointer
        f._lock_file_overlapped_p = ctypes.pointer(overlapped)
        handle = msvcrt.get_osfhandle(f.fileno())
        # dwFlags 0x2 is LOCKFILE_EXCLUSIVE_LOCK; 0x0 requests a shared lock
        if not LockFileEx(handle, 0x2 if exclusive else 0x0, 0,
                          whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Locking file failed: %r' % ctypes.FormatError())

    def _unlock_file(f):
        """Release the lock taken by _lock_file; raises OSError on failure."""
        assert f._lock_file_overlapped_p
        handle = msvcrt.get_osfhandle(f.fileno())
        if not UnlockFileEx(handle, 0,
                            whole_low, whole_high, f._lock_file_overlapped_p):
            raise OSError('Unlocking file failed: %r' % ctypes.FormatError())

else:
    import fcntl

    def _lock_file(f, exclusive):
        """Take an exclusive or shared flock on the open file f."""
        fcntl.flock(f, fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH)

    def _unlock_file(f):
        """Release the flock held on the open file f."""
        fcntl.flock(f, fcntl.LOCK_UN)
c1c9a79c
PH
1083
1084
class locked_file(object):
    """Context manager around a file that is locked while the block runs.

    The file is opened on construction with io.open. Entering the context
    locks it (shared for mode 'r', exclusive for 'a' and 'w'); leaving the
    context unlocks and closes it.
    """

    def __init__(self, filename, mode, encoding=None):
        assert mode in ['r', 'a', 'w']
        self.f = io.open(filename, mode, encoding=encoding)
        self.mode = mode

    def __enter__(self):
        wants_exclusive = self.mode != 'r'
        try:
            _lock_file(self.f, wants_exclusive)
        except IOError:
            # Do not leak the handle if the lock cannot be taken
            self.f.close()
            raise
        return self

    def __exit__(self, etype, value, traceback):
        try:
            _unlock_file(self.f)
        finally:
            # Close the file even if unlocking failed
            self.f.close()

    def __iter__(self):
        return iter(self.f)

    def write(self, *args):
        return self.f.write(*args)

    def read(self, *args):
        return self.f.read(*args)
4eb7f1d1
JMF
1114
1115
4644ac55
S
def get_filesystem_encoding():
    """Return sys.getfilesystemencoding(), or 'utf-8' if that is None."""
    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        return 'utf-8'
    return fs_encoding
1119
1120
def shell_quote(args):
    """Quote every argument for a POSIX shell and join them with spaces.

    args is an iterable of text or byte strings. Byte strings (we may get a
    filename encoded with 'encodeFilename') are decoded with the filesystem
    encoding before being quoted.
    """
    # pipes.quote is an undocumented alias of shlex.quote and the pipes
    # module is gone in Python 3.13, so prefer shlex where it exists
    try:
        from shlex import quote
    except ImportError:  # Python 2
        from pipes import quote

    encoding = None
    quoted_args = []
    for a in args:
        if isinstance(a, bytes):
            if encoding is None:
                # Only look the encoding up if there is something to decode
                encoding = get_filesystem_encoding()
            a = a.decode(encoding)
        quoted_args.append(quote(a))
    return ' '.join(quoted_args)
9d4660ca
PH
1130
1131
def smuggle_url(url, data):
    """ Pass additional data in a URL for internal use. """
    # The data travels JSON-encoded in the URL fragment
    payload = {'__youtubedl_smuggle': json.dumps(data)}
    sdata = compat_urllib_parse.urlencode(payload)
    return '#'.join((url, sdata))
9d4660ca
PH
1138
1139
def unsmuggle_url(smug_url, default=None):
    """Undo smuggle_url: return the tuple (url, data).

    If smug_url carries no smuggled data, (smug_url, default) is returned.
    """
    if '#__youtubedl_smuggle' in smug_url:
        url, _, sdata = smug_url.rpartition('#')
        smuggled_json = compat_parse_qs(sdata)['__youtubedl_smuggle'][0]
        return url, json.loads(smuggled_json)
    return smug_url, default
02dbf93f
PH
1147
1148
02dbf93f
PH
def format_bytes(bytes):
    """Format a byte count with two decimals and a binary (1024-based) suffix.

    bytes may be a number or a numeric string; None yields 'N/A'.
    Values too large for the biggest suffix are expressed in YiB, values
    below one byte in B, and negative values keep their sign.
    """
    if bytes is None:
        return 'N/A'
    suffixes = ['B', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB', 'YiB']
    value = float(bytes)
    magnitude = abs(value)
    if magnitude == 0.0:
        exponent = 0
    else:
        # Clamp to the available suffixes: the logarithm is negative for
        # fractions of a byte and exceeds the table from 1024 ** 9 onwards
        exponent = int(math.log(magnitude, 1024.0))
        exponent = max(0, min(exponent, len(suffixes) - 1))
    converted = value / float(1024 ** exponent)
    return '%.2f%s' % (converted, suffixes[exponent])
f53c966a 1161
1c088fa8 1162
be64b5b0
PH
def parse_filesize(s):
    """Parse a file size such as '1.5 MiB' into a number of bytes (an int).

    The string must start with a number (a comma is accepted as decimal
    separator) followed by a known unit. Returns None if s is None or does
    not have that form.
    """
    if s is None:
        return None

    # The lower-case forms are of course incorrect and inofficial,
    # but we support those too
    unit_table = {'B': 1, 'b': 1}
    for power, prefix in enumerate('KMGTPEZY', 1):
        binary, decimal = 1024 ** power, 1000 ** power
        unit_table[prefix + 'iB'] = binary
        unit_table[prefix + 'B'] = decimal
        unit_table[prefix.lower() + 'B'] = binary
        unit_table[prefix + 'b'] = decimal

    pattern = r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)' % '|'.join(
        re.escape(unit) for unit in unit_table)
    mobj = re.match(pattern, s)
    if mobj is None:
        return None

    number = float(mobj.group('num').replace(',', '.'))
    return int(number * unit_table[mobj.group('unit')])
be64b5b0
PH
1215
1216
caefb1de
PH
def month_by_name(name):
    """ Return the number of a month by (locale-independently) English name """

    if name in ENGLISH_MONTH_NAMES:
        return ENGLISH_MONTH_NAMES.index(name) + 1
    return None
1224
1225
def month_by_abbreviation(abbrev):
    """ Return the number of a month by (locale-independently) English
    abbreviations """

    # An abbreviation is the first three letters of the English month name
    for number, full_name in enumerate(ENGLISH_MONTH_NAMES, 1):
        if full_name[:3] == abbrev:
            return number
    return None
18258362
JMF
1234
1235
def fix_xml_ampersands(xml_str):
    """Replace by '&amp;' every '&' in XML that does not start one of the
    predefined entities or a numeric character reference"""
    bare_ampersand = r'&(?!amp;|lt;|gt;|apos;|quot;|#x[0-9a-fA-F]{,4};|#[0-9]{,4};)'
    return re.compile(bare_ampersand).sub('&amp;', xml_str)
e3946f98
PH
1242
1243
def setproctitle(title):
    """Set the name of the current process to title (a compat_str).

    This calls prctl from libc.so.6 and silently does nothing if that
    library cannot be loaded or has no prctl.
    """
    assert isinstance(title, compat_str)
    try:
        libc = ctypes.cdll.LoadLibrary("libc.so.6")
    except OSError:
        return
    title_bytes = title.encode('utf-8')
    # Initialising the buffer from the bytes makes it one byte longer than
    # the title, so prctl always gets a NUL-terminated string. A buffer of
    # exactly len(title_bytes) bytes has no room for the terminator.
    buf = ctypes.create_string_buffer(title_bytes)
    try:
        libc.prctl(15, buf, 0, 0, 0)  # 15 is PR_SET_NAME
    except AttributeError:
        return  # Strange libc, just skip this
d7dda168
PH
1257
1258
def remove_start(s, start):
    """Return s without its leading start; s unchanged if it does not begin with start."""
    return s[len(start):] if s.startswith(start) else s
29eb5174
PH
1263
1264
2b9faf55
PH
def remove_end(s, end):
    """Return s without its trailing end; s unchanged if it does not finish with end."""
    # An empty end must be skipped explicitly: every string ends with '',
    # and s[:-0] is the empty string rather than s
    if end and s.endswith(end):
        return s[:-len(end)]
    return s
1269
1270
def url_basename(url):
    """Return the last segment of the URL's path, ignoring slashes at either end of the path."""
    trimmed_path = compat_urlparse.urlparse(url).path.strip('/')
    return trimmed_path.rpartition('/')[2]
aa94a6d3
PH
1274
1275
class HEADRequest(compat_urllib_request.Request):
    """Request whose HTTP method is HEAD"""

    def get_method(self):
        return 'HEAD'
7217e148
PH
1279
1280
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """Convert v to an int, or return default if there is no value.

    With get_attr, the attribute of that name is read from v first (a
    missing attribute counts as no value). None and '' count as no value.
    The result is int(v) * invscale // scale.
    """
    if get_attr and v is not None:
        v = getattr(v, get_attr, None)
    if v == '':
        v = None
    if v is None:
        return default
    return int(v) * invscale // scale
1288
9572013d 1289
40a90862
JMF
def str_or_none(v, default=None):
    """Convert v to a compat_str, or return default if v is None."""
    if v is None:
        return default
    return compat_str(v)
1292
9732d77e
PH
1293
def str_to_int(int_str):
    """ A more relaxed version of int_or_none """
    if int_str is None:
        return None
    # Drop thousands separators and plus signs before converting
    digits = re.sub(r'[,\.\+]', '', int_str)
    return int(digits)
608d11f5
PH
1300
1301
9732d77e
PH
def float_or_none(v, scale=1, invscale=1, default=None):
    """Return float(v) * invscale / scale, or default if v is None."""
    if v is None:
        return default
    return float(v) * invscale / scale
43f775e4
PH
1304
1305
def parse_duration(s):
    """Parse a duration such as '1:02:03', '3 min' or 'PT1H2M3S' into seconds.

    Returns None if s is not a string or is not in a recognised format. The
    result is a float if it stems from a minutes-only or hours-only form or
    has a fractional part, otherwise an int.
    """
    if not isinstance(s, compat_basestring):
        return None

    m = re.match(
        r'''(?ix)(?:P?T)?
        (?:
            (?P<only_mins>[0-9.]+)\s*(?:mins?|minutes?)\s*|
            (?P<only_hours>[0-9.]+)\s*(?:hours?)|

            \s*(?P<hours_reversed>[0-9]+)\s*(?:[:h]|hours?)\s*(?P<mins_reversed>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*|
            (?:
                (?:
                    (?:(?P<days>[0-9]+)\s*(?:[:d]|days?)\s*)?
                    (?P<hours>[0-9]+)\s*(?:[:h]|hours?)\s*
                )?
                (?P<mins>[0-9]+)\s*(?:[:m]|mins?|minutes?)\s*
            )?
            (?P<secs>[0-9]+)(?P<ms>\.[0-9]+)?\s*(?:s|secs?|seconds?)?
        )$''', s.strip())
    if not m:
        return None
    if m.group('only_mins'):
        return float_or_none(m.group('only_mins'), invscale=60)
    if m.group('only_hours'):
        return float_or_none(m.group('only_hours'), invscale=60 * 60)

    # Number of seconds represented by one unit of each integer group
    seconds_per_unit = (
        ('secs', 1),
        ('mins_reversed', 60),
        ('mins', 60),
        ('hours', 60 * 60),
        ('hours_reversed', 60 * 60),
        ('days', 24 * 60 * 60),
    )
    res = 0
    for group_name, factor in seconds_per_unit:
        if m.group(group_name):
            res += int(m.group(group_name)) * factor
    if m.group('ms'):
        # The group includes the leading dot, i.e. it is a fraction of a second
        res += float(m.group('ms'))
    return res
91d7d0b3
JMF
1350
1351
def prepend_extension(filename, ext):
    """Insert ext in front of the filename's extension ('a.mp4', 'temp' -> 'a.temp.mp4')."""
    stem, original_ext = os.path.splitext(filename)
    return '{0}.{1}{2}'.format(stem, ext, original_ext)
d70ad093
PH
1355
1356
def check_executable(exe, args=[]):
    """ Checks if the given binary is installed somewhere in PATH, and returns its name.
    args can be a list of arguments for a short output (like -version) """
    try:
        proc = subprocess.Popen(
            [exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        proc.communicate()
    except OSError:
        # The binary could not be started
        return False
    return exe
b7ab0590
PH
1365
1366
def get_exe_version(exe, args=['--version'],
                    version_re=None, unrecognized='present'):
    """ Returns the version of the specified executable,
    or False if the executable is not present """
    try:
        proc = subprocess.Popen(
            [exe] + args,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        out = proc.communicate()[0]
    except OSError:
        return False
    if isinstance(out, bytes):  # Python 2.x
        out = out.decode('ascii', 'ignore')
    return detect_exe_version(out, version_re, unrecognized)
1380
1381
def detect_exe_version(output, version_re=None, unrecognized='present'):
    """Extract a version from the output of an executable.

    Returns the first group of the first match of version_re (by default
    whatever follows the word 'version') in output, or unrecognized if
    there is no match.
    """
    assert isinstance(output, compat_str)
    pattern = r'version\s+([-0-9._a-zA-Z]+)' if version_re is None else version_re
    m = re.search(pattern, output)
    return m.group(1) if m else unrecognized
1391
1392
class PagedList(object):
    """Base class of the paged lists; subclasses implement getslice()."""

    def __len__(self):
        # This is only useful for tests
        all_entries = self.getslice()
        return len(all_entries)
1397
9c44d242
PH
1398
class OnDemandPagedList(PagedList):
    """Paged list that fetches pages one after the other until a page that
    is not full (or the end of the requested slice) is reached.

    pagefunc(pagenum) must return an iterable with the entries of the page
    with the given 0-based number.
    """

    def __init__(self, pagefunc, pagesize):
        self._pagefunc = pagefunc
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries from index start up to (excluding) index end
        as a list; end=None means up to the last entry."""
        res = []
        for pagenum in itertools.count(start // self._pagesize):
            # Index of the first entry of this page and of the next page
            firstid = pagenum * self._pagesize
            nextfirstid = pagenum * self._pagesize + self._pagesize
            if start >= nextfirstid:
                continue

            page_results = list(self._pagefunc(pagenum))

            # Offset of start within this page (0 unless start is on it)
            startv = (
                start % self._pagesize
                if firstid <= start < nextfirstid
                else 0)

            # Offset of end within this page (None unless end is on it)
            endv = (
                ((end - 1) % self._pagesize) + 1
                if (end is not None and firstid <= end <= nextfirstid)
                else None)

            if startv != 0 or endv is not None:
                page_results = page_results[startv:endv]
            res.extend(page_results)

            # A little optimization - if current page is not "full", ie. does
            # not contain page_size videos then we can assume that this page
            # is the last one - there are no more ids on further pages -
            # i.e. no need to query again.
            if len(page_results) + startv < self._pagesize:
                break

            # If we got the whole page, but the next page is not interesting,
            # break out early as well
            if end == nextfirstid:
                break
        return res
81c2f20b
PH
1440
1441
9c44d242
PH
class InAdvancePagedList(PagedList):
    """Paged list whose number of pages is known before any page is fetched.

    pagefunc(pagenum) must return an iterable with the entries of the page
    with the given 0-based number; pagecount is the number of pages.
    """

    def __init__(self, pagefunc, pagecount, pagesize):
        self._pagefunc = pagefunc
        self._pagecount = pagecount
        self._pagesize = pagesize

    def getslice(self, start=0, end=None):
        """Return the entries from index start up to (excluding) index end
        as a list; end=None means up to the last page."""
        res = []
        start_page = start // self._pagesize
        # Never ask for a page beyond the known page count, even if end
        # points past the last entry
        end_page = (
            self._pagecount if end is None
            else min(self._pagecount, end // self._pagesize + 1))
        # Entries to drop at the beginning of the first page
        skip_elems = start - start_page * self._pagesize
        # Entries still wanted (None: no limit)
        only_more = None if end is None else end - start
        for pagenum in range(start_page, end_page):
            page = list(self._pagefunc(pagenum))
            if skip_elems:
                page = page[skip_elems:]
                skip_elems = None
            if only_more is not None:
                if len(page) < only_more:
                    only_more -= len(page)
                else:
                    # This page completes the slice
                    page = page[:only_more]
                    res.extend(page)
                    break
            res.extend(page)
        return res
1469
1470
def uppercase_escape(s):
    """Replace every backslash-U escape (8 hex digits) in s by the character it denotes."""
    decode_escape = codecs.getdecoder('unicode_escape')

    def _decode(mobj):
        # The decoder returns a (text, consumed length) tuple
        return decode_escape(mobj.group(0))[0]

    return re.sub(r'\\U[0-9a-fA-F]{8}', _decode, s)
b53466e1 1477
d05cfe06
S
1478
def escape_rfc3986(s):
    """Escape non-ASCII characters as suggested by RFC 3986"""
    # On Python 2 text strings are quoted as UTF-8 encoded byte strings
    must_encode = sys.version_info < (3, 0) and isinstance(s, compat_str)
    if must_encode:
        s = s.encode('utf-8')
    return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]")
d05cfe06
S
1484
1485
def escape_url(url):
    """Escape URL as suggested by RFC 3986"""
    parts = compat_urllib_parse_urlparse(url)
    # Scheme and network location are left untouched
    escaped_parts = parts._replace(
        path=escape_rfc3986(parts.path),
        params=escape_rfc3986(parts.params),
        query=escape_rfc3986(parts.query),
        fragment=escape_rfc3986(parts.fragment)
    )
    return escaped_parts.geturl()
1495
# struct_pack/struct_unpack behave like struct.pack/struct.unpack but also
# accept a text format string where struct itself insists on bytes
try:
    struct.pack('!I', 0)
except TypeError:
    # In Python 2.6 (and some 2.7 versions), struct requires a bytes argument
    def _bytes_spec(spec):
        return spec.encode('ascii') if isinstance(spec, compat_str) else spec

    def struct_pack(spec, *args):
        return struct.pack(_bytes_spec(spec), *args)

    def struct_unpack(spec, *args):
        return struct.unpack(_bytes_spec(spec), *args)
else:
    struct_pack, struct_unpack = struct.pack, struct.unpack
62e609ab
PH
1512
1513
def read_batch_urls(batch_fd):
    """Read the URLs of a batch file and close it.

    batch_fd is an iterable of lines (text, or bytes that are decoded as
    UTF-8) that is closed afterwards. A byte order mark and surrounding
    whitespace are stripped from each line; empty lines and lines starting
    with '#', ';' or ']' are skipped. Returns the list of remaining URLs.
    """
    def fixup(url):
        if isinstance(url, bytes):
            url = url.decode('utf-8', 'replace')
        # A UTF-8 BOM shows up as U+FEFF when decoded properly, and as the
        # three characters below when its bytes were decoded one by one
        for bom in ('\xef\xbb\xbf', '\ufeff'):
            if url.startswith(bom):
                url = url[len(bom):]
        url = url.strip()
        if url.startswith(('#', ';', ']')):
            return False
        return url

    with contextlib.closing(batch_fd) as fd:
        return [url for url in map(fixup, fd) if url]
b74fa8cd
JMF
1528
1529
def urlencode_postdata(*args, **kargs):
    """URL-encode the arguments like urlencode does and return the result as ASCII bytes."""
    encoded = compat_urllib_parse.urlencode(*args, **kargs)
    return encoded.encode('ascii')
bcf89ce6
PH
1532
1533
0990305d
PH
# etree_iter(element) iterates over the elements of a tree
if hasattr(xml.etree.ElementTree.Element, 'iter'):
    etree_iter = xml.etree.ElementTree.Element.iter
else:  # Python <=2.6
    def etree_iter(n):
        return n.findall('.//*')
1538
1539
bcf89ce6
PH
def parse_xml(s):
    """Parse the XML text s and return its root element, ignoring doctype declarations."""
    class _DoctypeIgnoringBuilder(xml.etree.ElementTree.TreeBuilder):
        def doctype(self, name, pubid, system):
            pass  # Ignore doctypes

    parser = xml.etree.ElementTree.XMLParser(target=_DoctypeIgnoringBuilder())
    # The custom parser is only passed on from Python 2.7
    kwargs = {}
    if sys.version_info >= (2, 7):
        kwargs['parser'] = parser
    tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
    # Fix up XML parser in Python 2.x
    if sys.version_info < (3, 0):
        for n in etree_iter(tree):
            if n.text is not None and not isinstance(n.text, compat_str):
                n.text = n.text.decode('utf-8')
    return tree
e68301af
PH
1555
1556
a1a530b0
PH
# Age limit (in years) for each US rating label; parse_age_limit falls back
# to this table for strings that are not a plain age like '18' or '18+'
US_RATINGS = {
    'G': 0,
    'PG': 10,
    'PG-13': 13,
    'R': 16,
    'NC': 18,
}
fac55558
PH
1564
1565
146c80e2
S
def parse_age_limit(s):
    """Parse an age limit given as an age ('18', '18+') or as a US rating label.

    Returns the age as an int, or None if s is None or not recognised.
    """
    if s is None:
        return None
    m = re.match(r'^(?P<age>\d{1,2})\+?$', s)
    if m:
        return int(m.group('age'))
    return US_RATINGS.get(s, None)
146c80e2
S
1571
1572
def strip_jsonp(code):
    """Strip the callback wrapper 'name(...);' (and trailing // comments)
    from a JSONP response; code that is not wrapped is returned unchanged."""
    jsonp_wrapper = r'(?s)^[a-zA-Z0-9_]+\s*\(\s*(.*)\);?\s*?(?://[^\n]*)*$'
    return re.sub(jsonp_wrapper, r'\1', code)
478c2c61
PH
1576
1577
e05f6939
PH
def js_to_json(code):
    """Convert JavaScript object literal syntax to JSON.

    Single-quoted strings and bare identifiers are turned into
    double-quoted strings and a comma directly before a closing bracket or
    brace is dropped.
    """
    def fix_kv(m):
        token = m.group(0)
        # JSON literals and double-quoted strings are already fine
        if token in ('true', 'false', 'null') or token.startswith('"'):
            return token
        if token.startswith("'"):
            replacements = {
                '\\\\': '\\\\',
                "\\'": "'",
                '"': '\\"',
            }
            token = re.sub(
                r"\\\\|\\'|\"",
                lambda esc: replacements[esc.group(0)],
                token[1:-1])
        return '"%s"' % token

    res = re.sub(r'''(?x)
        "(?:[^"\\]*(?:\\\\|\\['"nu]))*[^"\\]*"|
        '(?:[^'\\]*(?:\\\\|\\['"nu]))*[^'\\]*'|
        [a-zA-Z_][.a-zA-Z_0-9]*
        ''', fix_kv, code)
    # Remove trailing commas
    return re.sub(r',(\s*[\]}])', r'\1', res)
1601
1602
478c2c61
PH
def qualities(quality_ids):
    """ Get a numeric quality value out of a list of possible values """
    def q(qid):
        # The position in the list is the quality; unknown ids rank lowest
        if qid in quality_ids:
            return quality_ids.index(qid)
        return -1
    return q
1611
acd69589
PH
1612
# Default output template; uses the %-style named fields title, id and ext
DEFAULT_OUTTMPL = '%(title)s-%(id)s.%(ext)s'
0a871f68 1614
a020a0dc
PH
1615
def limit_length(s, length):
    """ Add ellipses to overly long strings """
    if s is None:
        return None
    if len(s) <= length:
        return s
    ellipses = '...'
    # The ellipses count towards the length of the result
    return s[:length - len(ellipses)] + ellipses
48844745
PH
1624
1625
def version_tuple(v):
    """Split the version string v at '.' and '-' into a tuple of ints."""
    return tuple(map(int, re.split(r'[-.]', v)))
48844745
PH
1628
1629
def is_outdated_version(version, limit, assume_new=True):
    """Tell whether version is older than limit.

    If version is empty or cannot be compared, the answer is
    'not assume_new'.
    """
    if version:
        try:
            return version_tuple(version) < version_tuple(limit)
        except ValueError:
            pass
    return not assume_new
732ea2f0
PH
1637
1638
def ytdl_is_updateable():
    """ Returns if youtube-dl can be updated with -U """
    from zipimport import zipimporter

    # True when this module was loaded from a zip file or sys has a
    # 'frozen' attribute
    loaded_from_zip = isinstance(globals().get('__loader__'), zipimporter)
    return loaded_from_zip or hasattr(sys, 'frozen')
7d4111ed
PH
1644
1645
def args_to_str(args):
    # Get a short string representation for a subprocess command
    quoted = [shlex_quote(a) for a in args]
    return ' '.join(quoted)
2ccd1b10
PH
1649
1650
c460bdd5
PH
def mimetype2ext(mt):
    """Return the file extension for a MIME type: its subtype, unless a special case applies."""
    subtype = mt.rpartition('/')[2]
    special_cases = {
        'x-ms-wmv': 'wmv',
        'x-mp4-fragmented': 'mp4',
    }
    return special_cases.get(subtype, subtype)
1658
1659
2ccd1b10
PH
def urlhandle_detect_ext(url_handle):
    """Guess a file extension from the response headers of url_handle.

    The filename of an attachment Content-Disposition header is preferred;
    otherwise the extension is derived from the Content-Type header.
    """
    try:
        url_handle.headers
    except AttributeError:  # Python < 3
        getheader = url_handle.info().getheader
    else:
        def getheader(h):
            return url_handle.headers[h]

    cd = getheader('Content-Disposition')
    m = re.match(r'attachment;\s*filename="(?P<filename>[^"]+)"', cd) if cd else None
    if m:
        e = determine_ext(m.group('filename'), default_ext=None)
        if e:
            return e

    return mimetype2ext(getheader('Content-Type'))
05900629
PH
1676
1677
def age_restricted(content_limit, age_limit):
    """ Returns True iff the content should be blocked """

    # No limit set, or content available for everyone
    if age_limit is None or content_limit is None:
        return False
    return age_limit < content_limit
61ca9a80
PH
1686
1687
def is_html(first_bytes):
    """ Detect whether a file contains HTML by examining its first bytes. """

    # Checked in order, so that the UTF-32 marks win over the UTF-16 marks
    # they start with
    BOMS = [
        (b'\xef\xbb\xbf', 'utf-8'),
        (b'\x00\x00\xfe\xff', 'utf-32-be'),
        (b'\xff\xfe\x00\x00', 'utf-32-le'),
        (b'\xff\xfe', 'utf-16-le'),
        (b'\xfe\xff', 'utf-16-be'),
    ]
    text = None
    for bom, enc in BOMS:
        if first_bytes.startswith(bom):
            text = first_bytes[len(bom):].decode(enc, 'replace')
            break
    if text is None:
        # No byte order mark
        text = first_bytes.decode('utf-8', 'replace')

    return re.match(r'^\s*<', text)
a055469f
PH
1706
1707
def determine_protocol(info_dict):
    """Return the protocol of a format: its 'protocol' entry if set,
    otherwise a value derived from its 'url'."""
    protocol = info_dict.get('protocol')
    if protocol is not None:
        return protocol

    url = info_dict['url']
    for url_prefix in ('rtmp', 'mms', 'rtsp'):
        if url.startswith(url_prefix):
            return url_prefix

    ext = determine_ext(url)
    if ext in ('m3u8', 'f4m'):
        return ext

    return compat_urllib_parse_urlparse(url).scheme
cfb56d1a
PH
1728
1729
def render_table(header_row, data):
    """ Render a list of rows, each as a list of values """
    rows = [header_row] + data
    widths = []
    for column in zip(*rows):
        widths.append(max(len(compat_str(cell)) for cell in column))
    # Every column but the last is left-aligned and padded to the width of
    # its longest value plus one
    column_specs = ['%-' + compat_str(width + 1) + 's' for width in widths[:-1]]
    row_format = ' '.join(column_specs) + '%s'
    return '\n'.join(row_format % tuple(row) for row in rows)
347de493
PH
1736
1737
def _match_one(filter_part, dct):
    """Evaluate a single filter expression against the dictionary dct.

    filter_part is either a comparison ('key OP value', where a '?' after
    the operator makes the part match if the key is missing) or a presence
    test ('key' / '!key'). Returns a truthy value if dct passes.
    Raises ValueError if filter_part cannot be parsed.
    """
    COMPARISON_OPERATORS = {
        '<': operator.lt,
        '<=': operator.le,
        '>': operator.gt,
        '>=': operator.ge,
        '=': operator.eq,
        '!=': operator.ne,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<key>[a-z_]+)
        \s*(?P<op>%s)(?P<none_inclusive>\s*\?)?\s*
        (?:
            (?P<intval>[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?)|
            (?P<strval>(?![0-9.])[a-z0-9A-Z]*)
        )
        \s*$
        ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = COMPARISON_OPERATORS[m.group('op')]
        if m.group('strval') is not None:
            if m.group('op') not in ('=', '!='):
                raise ValueError(
                    'Operator %s does not support string values!' % m.group('op'))
            comparison_value = m.group('strval')
        else:
            # Plain integer first, then a file size with unit suffix, then
            # a file size whose suffix lacks the final 'B'
            try:
                comparison_value = int(m.group('intval'))
            except ValueError:
                comparison_value = parse_filesize(m.group('intval'))
                if comparison_value is None:
                    comparison_value = parse_filesize(m.group('intval') + 'B')
                if comparison_value is None:
                    raise ValueError(
                        'Invalid integer value %r in filter part %r' % (
                            m.group('intval'), filter_part))
        actual_value = dct.get(m.group('key'))
        if actual_value is None:
            # The matched '?' text (truthy) if given, None otherwise
            return m.group('none_inclusive')
        return op(actual_value, comparison_value)

    UNARY_OPERATORS = {
        '': lambda v: v is not None,
        '!': lambda v: v is None,
    }
    operator_rex = re.compile(r'''(?x)\s*
        (?P<op>%s)\s*(?P<key>[a-z_]+)
        \s*$
        ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys())))
    m = operator_rex.search(filter_part)
    if m:
        op = UNARY_OPERATORS[m.group('op')]
        actual_value = dct.get(m.group('key'))
        return op(actual_value)

    raise ValueError('Invalid filter part %r' % filter_part)
1795
1796
def match_str(filter_str, dct):
    """ Filter a dictionary with a simple string syntax. Returns True (=passes filter) or false """

    # All of the parts separated by '&' have to match
    for filter_part in filter_str.split('&'):
        if not _match_one(filter_part, dct):
            return False
    return True
1802
1803
def match_filter_func(filter_str):
    """Build a function that checks an info dict against filter_str.

    The returned function yields None if the dict passes the filter and a
    message explaining that it is skipped otherwise.
    """
    def _match_func(info_dict):
        if match_str(filter_str, info_dict):
            return None
        video_title = info_dict.get('title', info_dict.get('id', 'video'))
        return '%s does not pass filter %s, skipping ..' % (video_title, filter_str)
    return _match_func
91410c9b
PH
1812
1813
bf6427d2
YCH
def parse_dfxp_time_expr(time_expr):
    """Parse a DFXP time expression into seconds (a float).

    Supported are offsets in seconds ('12.5', '12.5s') and clock times
    ('h:mm:ss' with optional fraction). An empty expression counts as 0.0;
    anything else yields None.
    """
    if not time_expr:
        return 0.0

    offset = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
    if offset:
        return float(offset.group('time_offset'))

    clock = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if clock:
        hours, minutes, seconds = clock.groups()
        return 3600 * int(hours) + 60 * int(minutes) + float(seconds)

    return None
1825
1826
def format_srt_time(seconds):
    """Format a number of seconds as an SRT timestamp 'HH:MM:SS,mmm'."""
    # Round to whole milliseconds once, before splitting: truncating the
    # float remainder instead turns e.g. 4.35 into ',349', and a rounded
    # remainder of 1000 ms has to carry over into the seconds
    total_millisecs = int(round(seconds * 1000))
    (secs, millisecs) = divmod(total_millisecs, 1000)
    (mins, secs) = divmod(secs, 60)
    (hours, mins) = divmod(mins, 60)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millisecs)
1833
1834
def dfxp2srt(dfxp_data):
    """Convert DFXP (TTML) subtitle text into SRT subtitle text.

    Every ttml:p element becomes one numbered SRT cue that is timed by its
    begin and end attributes.
    """
    _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})

    def parse_node(node):
        str_or_empty = functools.partial(str_or_none, default='')

        pieces = [str_or_empty(node.text)]
        for child in node:
            if child.tag == _x('ttml:br'):
                # Line break, followed by the text after the element
                pieces.append('\n' + str_or_empty(child.tail))
            elif child.tag == _x('ttml:span'):
                pieces.append(str_or_empty(parse_node(child)))
            else:
                # Any other element is included in serialized form
                pieces.append(str_or_empty(xml.etree.ElementTree.tostring(child)))
        return ''.join(pieces)

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
    paras = dfxp.findall(_x('.//ttml:p'))

    cues = []
    for index, para in enumerate(paras, 1):
        begin = format_srt_time(parse_dfxp_time_expr(para.attrib.get('begin')))
        end = format_srt_time(parse_dfxp_time_expr(para.attrib.get('end')))
        cues.append('%d\n%s --> %s\n%s\n\n' % (index, begin, end, parse_node(para)))

    return ''.join(cues)
1865
1866
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
    """ProxyHandler that lets every request choose its proxy through the
    'Ytdl-request-proxy' header; the value '__noproxy__' stands for a
    direct connection."""

    def __init__(self, proxies=None):
        # Set default handlers
        for scheme in ('http', 'https'):
            handler_name = '%s_open' % scheme
            setattr(self, handler_name,
                    lambda r, proxy='__noproxy__', type=scheme, meth=self.proxy_open:
                        meth(r, proxy, type))
        return compat_urllib_request.ProxyHandler.__init__(self, proxies)

    def proxy_open(self, req, proxy, type):
        req_proxy = req.headers.get('Ytdl-request-proxy')
        if req_proxy is not None:
            # The header overrides the configured proxy and is removed from
            # the request headers
            proxy = req_proxy
            del req.headers['Ytdl-request-proxy']

        if proxy == '__noproxy__':
            return None  # No Proxy
        parent_proxy_open = compat_urllib_request.ProxyHandler.proxy_open
        return parent_proxy_open(self, req, proxy, type)