]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
Now --all-sub is a modifier to --write-sub and --write-auto-sub (closes #1412)
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
c496ca96
PH
4import datetime
5import email.utils
f45c185f 6import errno
d77c3dfd 7import gzip
03f9daab 8import io
f4bfd65f 9import json
d77c3dfd
FV
10import locale
11import os
c496ca96 12import platform
d77c3dfd 13import re
c496ca96 14import socket
d77c3dfd 15import sys
01951dda 16import traceback
d77c3dfd 17import zlib
d77c3dfd 18
01ba00ca 19try:
59ae15a5 20 import urllib.request as compat_urllib_request
01ba00ca 21except ImportError: # Python 2
59ae15a5 22 import urllib2 as compat_urllib_request
01ba00ca
PH
23
24try:
59ae15a5 25 import urllib.error as compat_urllib_error
01ba00ca 26except ImportError: # Python 2
59ae15a5 27 import urllib2 as compat_urllib_error
01ba00ca
PH
28
29try:
59ae15a5 30 import urllib.parse as compat_urllib_parse
01ba00ca 31except ImportError: # Python 2
59ae15a5 32 import urllib as compat_urllib_parse
01ba00ca 33
799c0763
PH
34try:
35 from urllib.parse import urlparse as compat_urllib_parse_urlparse
36except ImportError: # Python 2
37 from urlparse import urlparse as compat_urllib_parse_urlparse
38
6543f0dc
JMF
39try:
40 import urllib.parse as compat_urlparse
41except ImportError: # Python 2
42 import urlparse as compat_urlparse
43
01ba00ca 44try:
59ae15a5 45 import http.cookiejar as compat_cookiejar
01ba00ca 46except ImportError: # Python 2
59ae15a5 47 import cookielib as compat_cookiejar
01ba00ca 48
3e669f36 49try:
59ae15a5 50 import html.entities as compat_html_entities
9f37a959 51except ImportError: # Python 2
59ae15a5 52 import htmlentitydefs as compat_html_entities
3e669f36 53
a8156c1d 54try:
59ae15a5 55 import html.parser as compat_html_parser
9f37a959 56except ImportError: # Python 2
59ae15a5 57 import HTMLParser as compat_html_parser
a8156c1d 58
348d0a7a 59try:
59ae15a5 60 import http.client as compat_http_client
9f37a959 61except ImportError: # Python 2
59ae15a5 62 import httplib as compat_http_client
348d0a7a 63
2eabb802 64try:
0e283428 65 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
66except ImportError: # Python 2
67 from urllib2 import HTTPError as compat_HTTPError
68
5910e210
PH
69try:
70 from subprocess import DEVNULL
71 compat_subprocess_get_DEVNULL = lambda: DEVNULL
72except ImportError:
73 compat_subprocess_get_DEVNULL = lambda: open(os.path.devnull, 'w')
74
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: Python 2's own parse_qs is considered broken by this file, so the
    # helpers below re-implement the Python 3 standard library version.

    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %XX escapes in string with the characters they encode."""
        if string == '':
            return string
        chunks = string.split('%')
        if len(chunks) == 1:
            # No percent sign at all, nothing to decode
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # Bytes of the current run of adjacent escapes. Decoding is delayed
        # until the run ends because one character may span several escapes.
        pending = b''
        string = chunks[0]
        for chunk in chunks[1:]:
            try:
                if not chunk:
                    raise ValueError
                pending += chunk[:2].decode('hex')
                tail = chunk[2:]
                if not tail:
                    # Only an escape in this chunk; keep collecting bytes
                    continue
            except ValueError:
                tail = '%' + chunk
            # Plain characters follow: flush the collected bytes first
            string += pending.decode(encoding, errors) + tail
            pending = b''
        if pending:
            # Flush the escapes that ended the string
            string += pending.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) pairs."""
        _coerce_result = unicode
        fields = [part for group in qs.split('&') for part in group.split(';')]
        parsed = []
        for field in fields:
            if not field and not strict_parsing:
                continue
            pieces = field.split('=', 1)
            if len(pieces) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # A control name without an equal sign
                if not keep_blank_values:
                    continue
                pieces.append('')
            if len(pieces[1]) or keep_blank_values:
                name = _unquote(
                    pieces[0].replace('+', ' '),
                    encoding=encoding, errors=errors)
                value = _unquote(
                    pieces[1].replace('+', ' '),
                    encoding=encoding, errors=errors)
                parsed.append((_coerce_result(name), _coerce_result(value)))
        return parsed

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to a list of values."""
        grouped = {}
        for name, value in _parse_qsl(qs, keep_blank_values, strict_parsing,
                                      encoding=encoding, errors=errors):
            grouped.setdefault(name, []).append(value)
        return grouped
348d0a7a 153
# Version-neutral names for the text type and the "code point to character"
# function: the unicode/unichr builtins on Python 2, str/chr on Python 3.
if sys.version_info < (3, 0):
    compat_str = unicode
    compat_chr = unichr
else:
    compat_str = str
    compat_chr = chr
3e669f36 163
b31756c1
FV
def compat_ord(c):
    """Return the integer value of c.

    c may already be an int (what indexing a bytes object yields on
    Python 3), in which case it is returned unchanged; anything else is
    passed to ord().
    """
    if isinstance(c, int):
        return c
    return ord(c)
167
468e2e92
FV
168# This is not clearly defined otherwise
169compiled_regex_type = type(re.compile(''))
170
3e669f36 171std_headers = {
59ae15a5
PH
172 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
173 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
174 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
175 'Accept-Encoding': 'gzip, deflate',
176 'Accept-Language': 'en-us,en;q=0.5',
3e669f36 177}
f427df17 178
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or the reported codec cannot encode a test string,
    'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Make sure the reported name is a codec Python can actually use
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad best effort, but unlike a bare except this lets
        # KeyboardInterrupt and SystemExit propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 192
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string s to standard output."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print s encoded with the preferred encoding.

        Characters the encoding cannot represent are written as XML
        character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
d77c3dfd 200
f4bfd65f
PH
def write_json_file(obj, fn):
    """Serialize obj as JSON into the file named fn.

    In Python 2.x, json.dump expects a bytestream, so the file is opened in
    binary mode; in Python 3.x it writes to a character stream, so the file
    is opened as UTF-8 text.
    """
    if sys.version_info < (3, 0):
        stream = open(fn, 'wb')
    else:
        stream = open(fn, 'w', encoding='utf-8')
    with stream as f:
        json.dump(obj, f)
211
59ae56fa
PH
def find_xpath_attr(node, xpath, key, val):
    """ Find the xpath xpath[@key=val]

    Returns the first element below node that matches xpath and whose
    attribute key equals val, or None when there is none.
    """
    if sys.version_info < (2, 7):
        # NOTE(review): presumably older ElementTree versions lack attribute
        # predicates, hence the manual scan -- confirm.
        for candidate in node.findall(xpath):
            if candidate.attrib.get(key) == val:
                return candidate
        return None

    # key and val are interpolated into the expression, so only accept
    # characters that cannot break out of it.
    assert re.match(r'^[a-zA-Z]+$', key)
    assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
    return node.find(xpath + u"[@%s='%s']" % (key, val))
225
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities and decimal (#233) or hexadecimal (#xE9) character
    references are decoded. Anything else, including numeric references
    that do not denote a valid code point, is returned unchanged in its
    literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hex references need the full 0-9a-f digit range; matching only \d
    # would decode e.g. '#x2F' as code point 2.
    mobj = re.match(r'#([xX][0-9a-fA-F]+|[0-9]+)$', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            codepoint = int(numstr[1:], 16)
        else:
            codepoint = int(numstr, 10)
        try:
            return compat_chr(codepoint)
        except (ValueError, OverflowError):
            # Not a representable code point; fall through to the literal
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 250
a8156c1d 251compat_html_parser.locatestarttagend = re.compile(r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""", re.VERBOSE) # backport bugfix
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that keeps the document it was asked to parse in self.html."""

    def __init__(self):
        # This used to be spelled "__init", which is never invoked as a
        # constructor, so self.html stayed undefined until loads() ran.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Store html on the instance, feed all of it to the parser and close it."""
        self.html = html
        self.feed(html)
        self.close()
261
class AttrParser(BaseHTMLParser):
    """HTMLParser variant that locates the tag carrying a given attribute value.

    After parsing, self.result is [tag, start_position, end_position] for
    the tag whose attribute equals the requested value, and get_result()
    cuts the text between those positions out of the document.
    """

    def __init__(self, attribute, value):
        self.attribute, self.value = attribute, value
        self.result = None
        self.started = False
        # Number of currently open tags, per tag name, since the match began
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Give up on a parse error, or skip one line and keep parsing.

        Raises once more than 10 errors were seen or when the wanted tag
        has already been entered.
        """
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        remaining_lines = self.html.split('\n')[self.getpos()[0]:]  # skip one line
        self.rawdata = '\n'.join(remaining_lines)
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attr_map = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attr_map and attr_map[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        if self.depth[self.result[0]] == 0:
            # The matched tag has been closed: record where it ends
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if not self.watch_startpos:
            return
        self.watch_startpos = False
        self.result.append(self.getpos())
    # Every other parser event may be the first thing after the opening tag
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded positions, or None."""
        if self.result is None or len(self.result) != 3:
            return None
        start, end = self.result[1], self.result[2]
        lines = self.html.split('\n')[start[0]-1:end[0]]
        lines[0] = lines[0][start[1]:]
        if len(lines) == 1:
            lines[-1] = lines[-1][:end[1]-start[1]]
        lines[-1] = lines[-1][:end[1]]
        return '\n'.join(lines).strip()
3b024e17
PH
# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    def _attrparser_parse_endtag(self, i):
        """Step over a literal "</scr'+'ipt>" instead of parsing it as an end tag."""
        marker = "</scr'+'ipt>"
        if self.rawdata[i:].startswith(marker):
            return i + len(marker)
        return compat_html_parser.HTMLParser.parse_endtag(self, i)

    AttrParser.parse_endtag = _attrparser_parse_endtag
9e6dd238
FV
327
def get_element_by_id(id, html):
    """Return the content of the tag whose "id" attribute equals id in the HTML document html"""
    return get_element_by_attribute(attribute="id", value=id, html=html)
331
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag whose given attribute equals value in the HTML document html"""
    finder = AttrParser(attribute, value)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Parse errors are ignored; whatever was found so far is used
        pass
    return finder.get_result()
9e6dd238 340
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    HTMLParser variant that picks out the content attribute of the meta tag
    whose name attribute equals the requested name.
    """
    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attr_map = dict(attrs)
            if attr_map.get('name') == self.name:
                # A later matching meta tag overwrites an earlier one
                self.result = attr_map.get('content')

    def get_result(self):
        """Return the content found so far (None if there was no match)."""
        return self.result
361
def get_meta_content(name, html):
    """
    Return the content attribute of the meta tag in html whose name
    attribute equals name.
    """
    finder = MetaParser(name)
    try:
        finder.loads(html)
    except compat_html_parser.HTMLParseError:
        # Parse errors are ignored; whatever was found so far is used
        pass
    return finder.get_result()
372
9e6dd238
FV
373
def clean_html(html):
    """Turn an HTML snippet into a readable plain-text string"""
    # Source newlines carry no meaning; only <br> and paragraph breaks do
    text = html.replace('\n', ' ')
    substitutions = (
        (r'\s*<\s*br\s*/?\s*>\s*', '\n'),          # <br> and <br />
        (r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n'),    # </p><p>
        ('<.*?>', ''),                             # every remaining tag
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
385
386
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it retries once
    with the characters that are forbidden in win32 file names replaced
    by '#'. If that fails too, the exception is raised, like the standard
    open() function would.

    The filename u'-' stands for standard output (its binary buffer where
    one exists).

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem is not fixed by renaming the file
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # os.path.join() takes the parts as separate arguments, so the
        # sanitized parts have to be unpacked.
        # NOTE(review): the pattern also replaces separators inside the
        # directory part returned by os.path.split() -- confirm intended.
        alt_filename = os.path.join(*[
            re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', path_part)
            for path_part in os.path.split(filename)
        ])
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller.
        # Open the sanitized name; retrying the original would fail again.
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
d77c3dfd
FV
420
421
def timeconvert(timestr):
    """Convert an RFC 2822 time string into a timestamp (None if it cannot be parsed)"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 429
def sanitize_filename(s, restricted=False, is_id=False):
    """Make the string s usable as part of a filename.

    With restricted set, a stricter subset of characters is allowed.
    Set is_id when s is an ID rather than an arbitrary string: it is then
    kept as close to the input as possible (no underscore clean-up).
    """
    def replace_insane(char):
        code = ord(char)
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace() or code > 127:
                return '_'
        return char

    result = u''.join(replace_insane(char) for char in s)
    if is_id:
        return result

    # Collapse runs of underscores and drop them from both ends
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result or '_'
d77c3dfd
FV
461
def orderedSet(iterable):
    """ Return a list of the elements of iterable, first occurrences only, in order """
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd
FV
469
def unescapeHTML(s):
    """
    Replace every &...; sequence in s using htmlentity_transform.

    @param s a string
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd
FV
478
def encodeFilename(s):
    """
    Return s in the form to hand to the file system functions.

    @param s The name of the file
    """
    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'
    # Characters the encoding cannot represent are dropped
    return s.encode(fs_encoding, 'ignore')
d77c3dfd 500
8271226a
PH
def decodeOption(optval):
    """Return optval as text: bytes are decoded with the preferred encoding, None is kept."""
    if optval is not None:
        if isinstance(optval, bytes):
            optval = optval.decode(preferredencoding())
        assert isinstance(optval, compat_str)
    return optval
1c256f70 509
4539dd30
PH
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The hour and minute fields appear as soon as they are non-zero, so
    exactly 3600 gives '1:00:00' and exactly 60 gives '1:00'.
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    if secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    return '%d' % secs
517
ea6d901e
PH
def make_HTTPS_handler(opts):
    """Build the HTTPS handler for urllib.

    From Python 3.2 on the handler gets an SSL context whose certificate
    verification is switched off when opts.no_check_certificate is set and
    required otherwise. On older versions a plain handler is returned.
    """
    if sys.version_info >= (3, 2):
        import ssl
        ctx = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        ctx.set_default_verify_paths()

        if opts.no_check_certificate:
            ctx.verify_mode = ssl.CERT_NONE
        else:
            ctx.verify_mode = ssl.CERT_REQUIRED
        return compat_urllib_request.HTTPSHandler(context=ctx)

    # Python's 2.x handler is very simplistic
    return compat_urllib_request.HTTPSHandler()
ea6d901e 531
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        cause is stored on the instance as given.
        """
        # Errors raised while a network or unavailable-video exception is
        # being handled count as expected as well.
        active_exc_type = sys.exc_info()[0]
        if active_exc_type in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError):
            expected = True
        if not expected:
            msg = msg + u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None without one."""
        if self.traceback is None:
            return None
        return u''.join(traceback.format_tb(self.traceback))
553
1c256f70 554
class DownloadError(Exception):
    """Download Error exception.

    FileDownloader objects may throw this exception when they are not
    configured to continue on errors. It carries the appropriate error
    message.
    """
    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        self.exc_info = exc_info
        super(DownloadError, self).__init__(msg)
d77c3dfd
FV
566
567
class SameFileError(Exception):
    """Same File exception.

    FileDownloader objects throw this exception when they detect that
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
575
576
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """
    def __init__(self, msg):
        # Initialise the base class too, so that str(exc) and exc.args carry
        # the message instead of being empty.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd
FV
585
class MaxDownloadsReached(Exception):
    """ The --max-downloads limit has been reached. """
d77c3dfd
FV
589
590
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    Thrown when a video is requested in a format that is not available
    for that video.
    """
d77c3dfd
FV
598
599
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Give the exception a message; without the base-class call
        # str(exc) would be empty.
        super(ContentTooShortError, self).__init__(
            u'Downloaded %s bytes, expected %s bytes' % (downloaded, expected))
        self.downloaded = downloaded
        self.expected = expected
d77c3dfd 614
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    When installed with an OpenerDirector, this class adds the standard
    headers to every HTTP request and transparently decodes gzipped and
    deflated responses. To avoid compression for a particular request, the
    calling code only has to include the HTTP header
    "Youtubedl-No-Compression", which is removed before the real request
    is made. A "Youtubedl-User-Agent" header likewise replaces the
    User-Agent of that request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress deflate data, first as a raw stream, then with zlib framing."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response, setting code by hand where the class lacks getcode."""
        factory = compat_urllib_request.addinfourl
        if hasattr(factory, 'getcode'):
            return factory(stream, headers, url, code)
        wrapped = factory(stream, headers, url)
        wrapped.code = code
        return wrapped

    def http_request(self, req):
        for name, value in std_headers.items():
            if name in req.headers:
                del req.headers[name]
            req.add_header(name, value)
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            req.headers.pop('User-agent', None)
            req.headers['User-agent'] = req.headers.pop('Youtubedl-user-agent')
        return req

    def http_response(self, req, resp):
        old_resp = resp
        content_encoding = resp.headers.get('Content-encoding', '')
        if content_encoding == 'gzip':
            content = resp.read()
            gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb')
            try:
                uncompressed = io.BytesIO(gz.read())
            except IOError as original_ioerror:
                # There may be junk at the end of the file
                # See http://stackoverflow.com/q/4928560/35070 for details
                # Retry with 1..1023 trailing bytes cut off.
                for trim in range(1, 1024):
                    try:
                        gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-trim]), mode='rb')
                        uncompressed = io.BytesIO(gz.read())
                    except IOError:
                        continue
                    break
                else:
                    raise original_ioerror
            resp = self.addinfourl_wrapper(uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        elif content_encoding == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    # HTTPS traffic gets exactly the same treatment
    https_request = http_request
    https_response = http_response
bf50b038
JMF
695
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    Commas and a trailing numeric UTC offset are removed from date_str,
    then the known date formats are tried in order. Returns None when
    none of them matches.
    """
    upload_date = None
    # Replace commas
    date_str = date_str.replace(',', ' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
    ]
    for expression in format_expressions:
        try:
            upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d')
        except ValueError:
            # Wrong format for this string, try the next one
            continue
        break
    return upload_date
710
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url, falling back to default_ext.

    The guess is the text after the last dot of the part before the first
    '?'; it is only accepted when made up of ASCII letters and digits.
    """
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    if re.match(r'^[A-Za-z0-9]+$', candidate):
        return candidate
    return default_ext
73e79f2a 717
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Return filename with its last extension replaced by '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return u'.'.join((stem, sub_lang, sub_format))
720
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9]+(day|week|month|year)(s)?

    A month counts as 30 days and a year as 365 days. Raises ValueError
    when date_str matches neither format.
    """
    today = datetime.date.today()
    if date_str in ('now', 'today'):
        return today
    # Raw string: '\d' is an invalid escape sequence in a normal literal
    match = re.match(r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?', date_str)
    if match is not None:
        amount = int(match.group('time'))
        if match.group('sign') == '-':
            amount = -amount
        unit = match.group('unit')
        # timedelta has no month or year unit, so approximate in days
        if unit == 'month':
            unit = 'day'
            amount *= 30
        elif unit == 'year':
            unit = 'day'
            amount *= 365
        delta = datetime.timedelta(**{unit + 's': amount})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
746
class DateRange(object):
    """A closed interval of dates, from start to end inclusive"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date_from_str.

        A missing start means the earliest representable date, a missing end
        the latest. Raises ValueError when start lies after end.
        """
        self.start = datetime.datetime.min.date() if start is None else date_from_str(start)
        self.end = datetime.datetime.max.date() if end is None else date_from_str(end)
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Return a range that contains only the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Tell whether date (a datetime.date or a date string) lies in the range"""
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
772
773
def platform_name():
    """ Return platform.platform() as a compat_str, decoding it first if it is bytes """
    name = platform.platform()
    if isinstance(name, bytes):
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
782
783
48ea9cea
PH
def bytes_to_intlist(bs):
    """Return the byte values of bs as a list of ints ([] for empty input)."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Elements are one-character strings (Python 2 str)
        return [ord(ch) for ch in bs]
    return list(bs)  # Python 3
791
c257baff 792
def intlist_to_bytes(xs):
    """Return the byte string made of the integer byte values in xs (b'' for empty input)."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() already yields single bytes
    return ''.join(map(chr, xs))