]> jfr.im git - yt-dlp.git/blame - youtube_dl/utils.py
Restore accidentally deleted commits
[yt-dlp.git] / youtube_dl / utils.py
CommitLineData
d77c3dfd
FV
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3
c496ca96
PH
4import datetime
5import email.utils
f45c185f 6import errno
d77c3dfd 7import gzip
03f9daab 8import io
f4bfd65f 9import json
d77c3dfd
FV
10import locale
11import os
c496ca96 12import platform
d77c3dfd 13import re
c496ca96 14import socket
d77c3dfd 15import sys
01951dda 16import traceback
d77c3dfd 17import zlib
d77c3dfd 18
01ba00ca 19try:
59ae15a5 20 import urllib.request as compat_urllib_request
01ba00ca 21except ImportError: # Python 2
59ae15a5 22 import urllib2 as compat_urllib_request
01ba00ca
PH
23
24try:
59ae15a5 25 import urllib.error as compat_urllib_error
01ba00ca 26except ImportError: # Python 2
59ae15a5 27 import urllib2 as compat_urllib_error
01ba00ca
PH
28
29try:
59ae15a5 30 import urllib.parse as compat_urllib_parse
01ba00ca 31except ImportError: # Python 2
59ae15a5 32 import urllib as compat_urllib_parse
01ba00ca 33
799c0763
PH
34try:
35 from urllib.parse import urlparse as compat_urllib_parse_urlparse
36except ImportError: # Python 2
37 from urlparse import urlparse as compat_urllib_parse_urlparse
38
6543f0dc
JMF
39try:
40 import urllib.parse as compat_urlparse
41except ImportError: # Python 2
42 import urlparse as compat_urlparse
43
01ba00ca 44try:
59ae15a5 45 import http.cookiejar as compat_cookiejar
01ba00ca 46except ImportError: # Python 2
59ae15a5 47 import cookielib as compat_cookiejar
01ba00ca 48
3e669f36 49try:
59ae15a5 50 import html.entities as compat_html_entities
9f37a959 51except ImportError: # Python 2
59ae15a5 52 import htmlentitydefs as compat_html_entities
3e669f36 53
a8156c1d 54try:
59ae15a5 55 import html.parser as compat_html_parser
9f37a959 56except ImportError: # Python 2
59ae15a5 57 import HTMLParser as compat_html_parser
a8156c1d 58
348d0a7a 59try:
59ae15a5 60 import http.client as compat_http_client
9f37a959 61except ImportError: # Python 2
59ae15a5 62 import httplib as compat_http_client
348d0a7a 63
2eabb802 64try:
0e283428 65 from urllib.error import HTTPError as compat_HTTPError
2eabb802
PH
66except ImportError: # Python 2
67 from urllib2 import HTTPError as compat_HTTPError
68
e0df6211
PH
69try:
70 from urllib.request import urlretrieve as compat_urlretrieve
71except ImportError: # Python 2
72 from urllib import urlretrieve as compat_urlretrieve
73
74
5910e210
PH
try:
    from subprocess import DEVNULL
except ImportError:
    # subprocess.DEVNULL is unavailable: hand out a freshly opened handle on
    # the platform's null device instead.
    def compat_subprocess_get_DEVNULL():
        """Return a writable file object opened on os.path.devnull."""
        return open(os.path.devnull, 'w')
else:
    def compat_subprocess_get_DEVNULL():
        """Return the subprocess.DEVNULL sentinel."""
        return DEVNULL
80
try:
    from urllib.parse import parse_qs as compat_parse_qs
except ImportError:  # Python 2
    # HACK: The following is the correct parse_qs implementation from cpython 3's stdlib.
    # Python 2's version is apparently totally broken
    def _unquote(string, encoding='utf-8', errors='replace'):
        """Replace %xx escapes in string and decode the resulting bytes.

        encoding/errors of None fall back to 'utf-8'/'replace'.
        """
        if string == '':
            return string
        pieces = string.split('%')
        if len(pieces) == 1:
            # No percent sign at all: nothing to decode.
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'replace'
        # pending: contiguous run of percent-encoded bytes, not yet decoded
        pending = b''
        string = pieces[0]
        for piece in pieces[1:]:
            try:
                if not piece:
                    raise ValueError
                pending += piece[:2].decode('hex')
                tail = piece[2:]
                if not tail:
                    # Only a single percent-encoded byte in this piece. It may
                    # belong to a multi-byte sequence, so keep accumulating.
                    continue
            except ValueError:
                tail = '%' + piece
            # Plain characters follow: flush the accumulated bytes first.
            string += pending.decode(encoding, errors) + tail
            pending = b''
        if pending:
            # Flush whatever is left at the end of the input.
            string += pending.decode(encoding, errors)
        return string

    def _parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
                   encoding='utf-8', errors='replace'):
        """Parse a query string into a list of (name, value) unicode pairs."""
        coerce = unicode
        fields = []
        for amp_part in qs.split('&'):
            fields.extend(amp_part.split(';'))
        result = []
        for field in fields:
            if not field and not strict_parsing:
                continue
            parts = field.split('=', 1)
            if len(parts) != 2:
                if strict_parsing:
                    raise ValueError("bad query field: %r" % (field,))
                # Handle case of a control-name with no equal sign
                if not keep_blank_values:
                    continue
                parts.append('')
            if not (len(parts[1]) or keep_blank_values):
                continue
            name = coerce(_unquote(
                parts[0].replace('+', ' '), encoding=encoding, errors=errors))
            value = coerce(_unquote(
                parts[1].replace('+', ' '), encoding=encoding, errors=errors))
            result.append((name, value))
        return result

    def compat_parse_qs(qs, keep_blank_values=False, strict_parsing=False,
                        encoding='utf-8', errors='replace'):
        """Parse a query string into a dict mapping each name to its list of values."""
        parsed = {}
        for name, value in _parse_qsl(qs, keep_blank_values, strict_parsing,
                                      encoding=encoding, errors=errors):
            parsed.setdefault(name, []).append(value)
        return parsed
348d0a7a 159
# Text type: the `unicode` builtin only exists on Python 2.
try:
    compat_str = unicode  # Python 2
except NameError:
    compat_str = str

# Code point -> one-character text string: `unichr` only exists on Python 2.
try:
    compat_chr = unichr  # Python 2
except NameError:
    compat_chr = chr
3e669f36 169
b31756c1
FV
def compat_ord(c):
    """Return c unchanged when it is exactly an int, otherwise ord(c)."""
    return c if type(c) is int else ord(c)
173
468e2e92
FV
# This is not clearly defined otherwise
compiled_regex_type = type(re.compile(''))

# Default header set; YoutubeDLHandler.http_request adds these to every request.
std_headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-us,en;q=0.5',
}
f427df17 184
def preferredencoding():
    """Get preferred encoding.

    Returns the best encoding scheme for the system, based on
    locale.getpreferredencoding() and some further tweaks: if the locale
    lookup fails, or the reported encoding cannot actually be used to
    encode text, 'UTF-8' is returned instead.
    """
    try:
        pref = locale.getpreferredencoding()
        # Probe the codec: an unknown or unusable name raises here.
        u'TEST'.encode(pref)
    except Exception:
        # Deliberately broad (the failure modes vary by platform), but no
        # longer a bare except, so KeyboardInterrupt/SystemExit propagate.
        pref = 'UTF-8'

    return pref
d77c3dfd 198
if sys.version_info >= (3, 0):
    def compat_print(s):
        """Print the text string s."""
        assert type(s) == type(u'')
        print(s)
else:
    def compat_print(s):
        """Print s encoded with the preferred encoding.

        Characters the encoding cannot represent are written as XML
        character references.
        """
        print(s.encode(preferredencoding(), 'xmlcharrefreplace'))
d77c3dfd 206
f4bfd65f
PH
# In Python 2.x, json.dump expects a bytestream.
# In Python 3.x, it writes to a character stream
if sys.version_info >= (3, 0):
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file fn (opened as UTF-8 text)."""
        with open(fn, 'w', encoding='utf-8') as json_file:
            json.dump(obj, json_file)
else:
    def write_json_file(obj, fn):
        """Serialize obj as JSON into the file fn (opened in binary mode)."""
        with open(fn, 'wb') as json_file:
            json.dump(obj, json_file)
217
59ae56fa
PH
if sys.version_info < (2, 7):
    def find_xpath_attr(node, xpath, key, val):
        """Return the first element matching xpath whose attribute key equals val, or None."""
        candidates = (el for el in node.findall(xpath)
                      if el.attrib.get(key) == val)
        return next(candidates, None)
else:
    def find_xpath_attr(node, xpath, key, val):
        """ Find the xpath xpath[@key=val] """
        # key and val are interpolated into the expression, so restrict them
        # to characters that cannot break out of the predicate.
        assert re.match(r'^[a-zA-Z]+$', key)
        assert re.match(r'^[a-zA-Z0-9@\s]*$', val)
        predicate = u"[@%s='%s']" % (key, val)
        return node.find(xpath + predicate)
231
def htmlentity_transform(matchobj):
    """Transforms an HTML entity to a character.

    This function receives a match object and is intended to be used with
    the re.sub() function. Group 1 of the match must hold the entity text
    without the surrounding '&' and ';'.

    Named entities, decimal references (#233) and hexadecimal references
    (#xe9) are converted. Anything else, including numeric references
    that are not valid code points, is returned in its literal '&...;' form.
    """
    entity = matchobj.group(1)

    # Known non-numeric HTML entity
    if entity in compat_html_entities.name2codepoint:
        return compat_chr(compat_html_entities.name2codepoint[entity])

    # Hex references need the full hex digit set; plain \d+ after the 'x'
    # would reject every reference containing a-f.
    mobj = re.match(u'(?u)#([xX][0-9a-fA-F]+|\\d+)', entity)
    if mobj is not None:
        numstr = mobj.group(1)
        if numstr[0] in u'xX':
            base = 16
            numstr = numstr[1:]
        else:
            base = 10
        try:
            return compat_chr(int(numstr, base))
        except (ValueError, OverflowError):
            # Not a representable code point: fall through to the literal.
            pass

    # Unknown entity in name, return its literal representation
    return (u'&%s;' % entity)
d77c3dfd 256
# backport bugfix: replace the parser module's start-tag-end pattern
compat_html_parser.locatestarttagend = re.compile(
    r"""<[a-zA-Z][-.a-zA-Z0-9:_]*(?:\s+(?:(?<=['"\s])[^\s/>][^\s/=>]*(?:\s*=+\s*(?:'[^']*'|"[^"]*"|(?!['"])[^>\s]*))?\s*)*)?\s*""",
    re.VERBOSE)
a921f407
JMF
class BaseHTMLParser(compat_html_parser.HTMLParser):
    """HTMLParser that keeps the text of the document it was asked to parse."""

    def __init__(self):
        # Was misspelled `__init`, which is name-mangled and never invoked,
        # so self.html did not exist until loads() had been called.
        compat_html_parser.HTMLParser.__init__(self)
        self.html = None

    def loads(self, html):
        """Store html on the instance, feed it to the parser and close it."""
        self.html = html
        self.feed(html)
        self.close()
267
class AttrParser(BaseHTMLParser):
    """Modified HTMLParser that isolates a tag with the specified attribute"""

    def __init__(self, attribute, value):
        self.attribute = attribute
        self.value = value
        # Becomes [tag, start position, end position] as parsing proceeds.
        self.result = None
        self.started = False
        # Open-tag counter per tag name, maintained while inside the match.
        self.depth = {}
        self.watch_startpos = False
        self.error_count = 0
        BaseHTMLParser.__init__(self)

    def error(self, message):
        """Skip past a parse error, unless inside the match or after too many errors."""
        if self.error_count > 10 or self.started:
            raise compat_html_parser.HTMLParseError(message, self.getpos())
        # skip one line
        current_line = self.getpos()[0]
        self.rawdata = '\n'.join(self.html.split('\n')[current_line:])
        self.error_count += 1
        self.goahead(1)

    def handle_starttag(self, tag, attrs):
        attr_map = dict(attrs)
        if self.started:
            self.find_startpos(None)
        if self.attribute in attr_map and attr_map[self.attribute] == self.value:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        if tag in self.depth:
            self.depth[tag] -= 1
        # The match ends once every tag of the matched kind has been closed.
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())

    # Every other parser event after the opening tag also marks the start.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped text between the recorded positions, or None."""
        if self.result is None or len(self.result) != 3:
            return None
        (start_line, start_col), (end_line, end_col) = self.result[1], self.result[2]
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            lines[-1] = lines[-1][:end_col - start_col]
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()


# Hack for https://github.com/rg3/youtube-dl/issues/662
if sys.version_info < (2, 7, 3):
    def _parse_endtag_compat(self, i):
        if self.rawdata[i:].startswith("</scr'+'ipt>"):
            return i + len("</scr'+'ipt>")
        return compat_html_parser.HTMLParser.parse_endtag(self, i)

    AttrParser.parse_endtag = _parse_endtag_compat
9e6dd238
FV
333
def get_element_by_id(id, html):
    """Return the content of the tag with the specified ID in the passed HTML document"""
    # Thin wrapper: an ID lookup is an attribute lookup on "id".
    return get_element_by_attribute("id", id, html)
337
def get_element_by_attribute(attribute, value, html):
    """Return the content of the tag with the specified attribute in the passed HTML document"""
    attr_parser = AttrParser(attribute, value)
    try:
        attr_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: report whatever was collected before the error.
        pass
    return attr_parser.get_result()
9e6dd238 346
a921f407
JMF
class MetaParser(BaseHTMLParser):
    """
    Modified HTMLParser that isolates a meta tag with the specified name
    attribute.
    """

    def __init__(self, name):
        BaseHTMLParser.__init__(self)
        self.name = name
        self.content = None
        self.result = None

    def handle_starttag(self, tag, attrs):
        if tag == 'meta':
            attr_map = dict(attrs)
            if attr_map.get('name') == self.name:
                # A later matching tag overwrites an earlier one.
                self.result = attr_map.get('content')

    def get_result(self):
        """Return the content attribute recorded for the matching meta tag, or None."""
        return self.result
367
def get_meta_content(name, html):
    """
    Return the content attribute from the meta tag with the given name attribute.
    """
    meta_parser = MetaParser(name)
    try:
        meta_parser.loads(html)
    except compat_html_parser.HTMLParseError:
        # Best effort: report whatever was found before the error.
        pass
    return meta_parser.get_result()
378
9e6dd238
FV
379
def clean_html(html):
    """Clean an HTML snippet into a readable string"""
    # Newline vs <br />: source newlines carry no meaning, <br> and
    # paragraph boundaries do.
    text = html.replace('\n', ' ')
    text = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', text)
    text = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', text)
    # Strip html tags
    text = re.sub('<.*?>', '', text)
    # Replace html entities
    return unescapeHTML(text).strip()
9e6dd238
FV
391
392
def sanitize_open(filename, open_mode):
    """Try to open the given filename, and slightly tweak it if this fails.

    Attempts to open the given filename. If this fails, it retries once
    with the win32-forbidden characters in the final path component
    replaced by '#'. If that fails too, the exception is raised, like
    the standard open() function.

    The special filename u'-' selects standard output.

    It returns the tuple (stream, definitive_file_name).
    """
    try:
        if filename == u'-':
            if sys.platform == 'win32':
                import msvcrt
                msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)
            return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename)
        stream = open(encodeFilename(filename), open_mode)
        return (stream, filename)
    except (IOError, OSError) as err:
        # A permission problem is not fixed by renaming the file.
        if err.errno in (errno.EACCES,):
            raise

        # In case of error, try to remove win32 forbidden chars.
        # Only the last component is rewritten: the character class contains
        # the path separators, so applying it to the directory part would
        # destroy the path. (os.path.join also needs separate arguments,
        # not a generator.)
        dirname, basename = os.path.split(filename)
        alt_filename = os.path.join(
            dirname, re.sub(u'[/<>:"\\|\\\\?\\*]', u'#', basename))
        if alt_filename == filename:
            raise
        # An exception here should be caught in the caller
        stream = open(encodeFilename(alt_filename), open_mode)
        return (stream, alt_filename)
d77c3dfd
FV
426
427
def timeconvert(timestr):
    """Convert RFC 2822 defined time string into system timestamp"""
    parsed = email.utils.parsedate_tz(timestr)
    if parsed is None:
        # The string could not be parsed as a date.
        return None
    return email.utils.mktime_tz(parsed)
1c469a94 435
def sanitize_filename(s, restricted=False, is_id=False):
    """Sanitizes a string so it could be used as part of a filename.
    If restricted is set, use a stricter subset of allowed characters.
    Set is_id if this is not an arbitrary string, but an ID that should be kept if possible
    """
    def replace_insane(char):
        code = ord(char)
        # Question marks and control characters are dropped outright.
        if char == '?' or code < 32 or code == 127:
            return ''
        if char == '"':
            return '' if restricted else '\''
        if char == ':':
            return '_-' if restricted else ' -'
        if char in '\\/|*<>':
            return '_'
        if restricted:
            if char in '!&\'()[]{}$;`^,#' or char.isspace():
                return '_'
            if code > 127:
                return '_'
        return char

    result = u''.join(replace_insane(char) for char in s)
    if is_id:
        # IDs are kept as close to the input as possible.
        return result

    # Collapse runs of underscores, then trim them from both ends.
    while '__' in result:
        result = result.replace('__', '_')
    result = result.strip('_')
    # Common case of "Foreign band name - English song title"
    if restricted and result.startswith('-_'):
        result = result[2:]
    return result if result else '_'
d77c3dfd
FV
467
def orderedSet(iterable):
    """ Remove all duplicates from the input iterable """
    # Compared by equality against a list, so unhashable elements work and
    # first-seen order is kept.
    unique = []
    for item in iterable:
        if item in unique:
            continue
        unique.append(item)
    return unique
d77c3dfd
FV
475
def unescapeHTML(s):
    """
    Replace every '&...;' sequence in s via htmlentity_transform.

    @param s a string
    """
    assert type(s) == type(u'')

    return re.sub(u'(?u)&(.+?);', htmlentity_transform, s)
d77c3dfd
FV
484
def encodeFilename(s):
    """
    Return s in the form to hand to the file system API.

    @param s The name of the file
    """

    assert type(s) == type(u'')

    # Python 3 has a Unicode API
    if sys.version_info >= (3, 0):
        return s

    # Pass u'' directly to use Unicode APIs on Windows 2000 and up
    # (Detecting Windows NT 4 is tricky because 'major >= 4' would
    # match Windows 9x series as well. Besides, NT 4 is obsolete.)
    if sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5:
        return s

    fs_encoding = sys.getfilesystemencoding()
    if fs_encoding is None:
        fs_encoding = 'utf-8'
    # Characters the encoding cannot represent are dropped.
    return s.encode(fs_encoding, 'ignore')
d77c3dfd 506
8271226a
PH
def decodeOption(optval):
    """Return optval as text; bytes are decoded with the preferred encoding.

    None is passed through unchanged.
    """
    if optval is None:
        return None
    text = optval
    if isinstance(text, bytes):
        text = text.decode(preferredencoding())

    assert isinstance(text, compat_str)
    return text
1c256f70 515
4539dd30
PH
def formatSeconds(secs):
    """Format a number of seconds as 'H:MM:SS', 'M:SS' or plain seconds.

    The widest unit needed is used. The boundaries are inclusive, so
    exactly one hour is '1:00:00' and exactly one minute is '1:00'
    (previously these came out as '60:00' and '60').
    """
    if secs >= 3600:
        return '%d:%02d:%02d' % (secs // 3600, (secs % 3600) // 60, secs % 60)
    elif secs >= 60:
        return '%d:%02d' % (secs // 60, secs % 60)
    else:
        return '%d' % secs
523
ea6d901e
PH
def make_HTTPS_handler(opts):
    """Build an HTTPSHandler, honouring opts.no_check_certificate on Python >= 3.2."""
    if sys.version_info >= (3, 2):
        import ssl
        context = ssl.SSLContext(ssl.PROTOCOL_SSLv23)
        context.set_default_verify_paths()

        if opts.no_check_certificate:
            context.verify_mode = ssl.CERT_NONE
        else:
            context.verify_mode = ssl.CERT_REQUIRED
        return compat_urllib_request.HTTPSHandler(context=context)

    # Python's 2.x handler is very simplistic
    return compat_urllib_request.HTTPSHandler()
ea6d901e 537
1c256f70
PH
class ExtractorError(Exception):
    """Error during info extraction."""

    def __init__(self, msg, tb=None, expected=False, cause=None):
        """ tb, if given, is the original traceback (so that it can be printed out).
        If expected is set, this is a normal error message and most likely not a bug in youtube-dl.
        """
        # An error raised while one of these is being handled is treated as
        # expected, whatever the caller passed.
        expected_causes = (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError)
        expected = expected or sys.exc_info()[0] in expected_causes
        if not expected:
            msg += u'; please report this issue on https://yt-dl.org/bug . Be sure to call youtube-dl with the --verbose flag and include its complete output. Make sure you are using the latest version; type youtube-dl -U to update.'
        super(ExtractorError, self).__init__(msg)

        self.traceback = tb
        self.exc_info = sys.exc_info()  # preserve original exception
        self.cause = cause

    def format_traceback(self):
        """Return the stored traceback formatted as one string, or None if there is none."""
        if self.traceback is not None:
            return u''.join(traceback.format_tb(self.traceback))
        return None
559
1c256f70 560
class DownloadError(Exception):
    """Download Error exception.

    This exception may be thrown by FileDownloader objects if they are not
    configured to continue on errors. They will contain the appropriate
    error message.
    """

    def __init__(self, msg, exc_info=None):
        """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """
        Exception.__init__(self, msg)
        self.exc_info = exc_info
d77c3dfd
FV
572
573
class SameFileError(Exception):
    """Same File exception.

    This exception will be thrown by FileDownloader objects if they detect
    multiple files would have to be downloaded to the same file on disk.
    """
d77c3dfd
FV
581
582
class PostProcessingError(Exception):
    """Post Processing exception.

    This exception may be raised by PostProcessor's .run() method to
    indicate an error in the postprocessing task.
    """

    def __init__(self, msg):
        # Initialise the base class as well: without it str(e) and e.args
        # are empty and the message is lost in tracebacks and logs.
        super(PostProcessingError, self).__init__(msg)
        self.msg = msg
d77c3dfd
FV
591
class MaxDownloadsReached(Exception):
    """ --max-downloads limit has been reached. """
d77c3dfd
FV
595
596
class UnavailableVideoError(Exception):
    """Unavailable Format exception.

    This exception will be thrown when a video is requested
    in a format that is not available for that video.
    """
d77c3dfd
FV
604
605
class ContentTooShortError(Exception):
    """Content Too Short exception.

    This exception may be raised by FileDownloader objects when a file they
    download is too small for what the server announced first, indicating
    the connection was probably interrupted.
    """
    # Both in bytes
    downloaded = None
    expected = None

    def __init__(self, downloaded, expected):
        # Note: the base Exception is intentionally left uninitialised,
        # exactly as before; only the two counters are recorded.
        self.downloaded, self.expected = downloaded, expected
d77c3dfd 620
class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
    """Handler for HTTP requests and responses.

    This class, when installed with an OpenerDirector, automatically adds
    the standard headers to every HTTP request and handles gzipped and
    deflated responses from web servers. If compression is to be avoided in
    a particular request, the original request in the program code only has
    to include the HTTP header "Youtubedl-No-Compression", which will be
    removed before making the real request.

    Part of this code was copied from:

    http://techknack.net/python-urllib2-handlers/

    Andrew Rowls, the author of that code, agreed to release it to the
    public domain.
    """

    @staticmethod
    def deflate(data):
        """Decompress data as a raw deflate stream, falling back to zlib framing."""
        try:
            return zlib.decompress(data, -zlib.MAX_WBITS)
        except zlib.error:
            return zlib.decompress(data)

    @staticmethod
    def addinfourl_wrapper(stream, headers, url, code):
        """Build an addinfourl response carrying the given status code."""
        addinfourl = compat_urllib_request.addinfourl
        if not hasattr(addinfourl, 'getcode'):
            # This addinfourl takes no code argument: set it afterwards.
            response = addinfourl(stream, headers, url)
            response.code = code
            return response
        return addinfourl(stream, headers, url, code)

    @staticmethod
    def _gunzip_lenient(content):
        """Gunzip content into a BytesIO, tolerating up to 1023 trailing junk bytes."""
        try:
            return io.BytesIO(
                gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb').read())
        except IOError as original_ioerror:
            # There may be junk at the end of the file
            # See http://stackoverflow.com/q/4928560/35070 for details
            for trim in range(1, 1024):
                try:
                    trimmed = gzip.GzipFile(
                        fileobj=io.BytesIO(content[:-trim]), mode='rb')
                    return io.BytesIO(trimmed.read())
                except IOError:
                    continue
            raise original_ioerror

    def http_request(self, req):
        # The standard headers always override what the request carries.
        for name, value in std_headers.items():
            if name in req.headers:
                del req.headers[name]
            req.add_header(name, value)

        # Internal pseudo-headers: act on them and strip them before sending.
        if 'Youtubedl-no-compression' in req.headers:
            req.headers.pop('Accept-encoding', None)
            del req.headers['Youtubedl-no-compression']
        if 'Youtubedl-user-agent' in req.headers:
            req.headers.pop('User-agent', None)
            req.headers['User-agent'] = req.headers.pop('Youtubedl-user-agent')
        return req

    def http_response(self, req, resp):
        old_resp = resp
        # gzip
        if resp.headers.get('Content-encoding', '') == 'gzip':
            uncompressed = self._gunzip_lenient(resp.read())
            resp = self.addinfourl_wrapper(
                uncompressed, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        # deflate
        if resp.headers.get('Content-encoding', '') == 'deflate':
            inflated = io.BytesIO(self.deflate(resp.read()))
            resp = self.addinfourl_wrapper(
                inflated, old_resp.headers, old_resp.url, old_resp.code)
            resp.msg = old_resp.msg
        return resp

    https_request = http_request
    https_response = http_response
bf50b038
JMF
701
def unified_strdate(date_str):
    """Return a string with the date in the format YYYYMMDD

    Commas are treated as spaces and a trailing ' +NNNN' / ' -NNNN' UTC
    offset is ignored. Returns None when date_str matches none of the
    known formats.
    """
    #Replace commas
    date_str = date_str.replace(',',' ')
    # %z (UTC offset) is only supported in python>=3.2
    date_str = re.sub(r' (\+|-)[\d]*$', '', date_str)
    format_expressions = [
        '%d %B %Y',
        '%B %d %Y',
        '%b %d %Y',
        '%Y-%m-%d',
        '%d/%m/%Y',
        '%Y/%m/%d %H:%M:%S',
        '%d.%m.%Y %H:%M',
        '%Y-%m-%dT%H:%M:%SZ',
    ]
    for expression in format_expressions:
        try:
            parsed = datetime.datetime.strptime(date_str, expression)
        except ValueError:
            # Not this format; try the next one.
            continue
        # Stop at the first format that parses.
        return parsed.strftime('%Y%m%d')
    return None
725
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url, or return default_ext.

    The guess is whatever follows the last dot before any '?'; it is only
    accepted when it consists solely of ASCII letters and digits.
    """
    without_query = url.partition(u'?')[0]
    candidate = without_query.rpartition(u'.')[2]
    if not re.match(r'^[A-Za-z0-9]+$', candidate):
        return default_ext
    return candidate
73e79f2a 732
d4051a8e
JMF
def subtitles_filename(filename, sub_lang, sub_format):
    """Replace the last extension of filename with '<sub_lang>.<sub_format>'."""
    stem = filename.rsplit('.', 1)[0]
    return u'.'.join((stem, sub_lang, sub_format))
735
def date_from_str(date_str):
    """
    Return a datetime.date object from a string in the format YYYYMMDD or
    (now|today)[+-][0-9](day|week|month|year)(s)?

    Months and years are approximated as 30 and 365 days respectively.
    Raises ValueError if the string matches neither format.
    """
    today = datetime.date.today()
    if date_str == 'now' or date_str == 'today':
        return today
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.match(
        r'(now|today)(?P<sign>[+-])(?P<time>\d+)(?P<unit>day|week|month|year)(s)?',
        date_str)
    if match is not None:
        sign = match.group('sign')
        time = int(match.group('time'))
        if sign == '-':
            time = -time
        unit = match.group('unit')
        #A bad approximation?
        if unit == 'month':
            unit = 'day'
            time *= 30
        elif unit == 'year':
            unit = 'day'
            time *= 365
        # timedelta takes plural keyword names (days=, weeks=).
        unit += 's'
        delta = datetime.timedelta(**{unit: time})
        return today + delta
    return datetime.datetime.strptime(date_str, "%Y%m%d").date()
761
class DateRange(object):
    """Represents a time interval between two dates"""

    def __init__(self, start=None, end=None):
        """start and end must be strings in the format accepted by date

        A bound left as None is open: the earliest / latest representable
        date is used instead.
        """
        self.start = (datetime.datetime.min.date() if start is None
                      else date_from_str(start))
        self.end = (datetime.datetime.max.date() if end is None
                    else date_from_str(end))
        if self.start > self.end:
            raise ValueError('Date range: "%s" , the start date must be before the end date' % self)

    @classmethod
    def day(cls, day):
        """Returns a range that only contains the given day"""
        return cls(day, day)

    def __contains__(self, date):
        """Check if the date is in the range"""
        # Anything that is not already a date is parsed as a date string.
        if not isinstance(date, datetime.date):
            date = date_from_str(date)
        return self.start <= date <= self.end

    def __str__(self):
        return '%s - %s' % (self.start.isoformat(), self.end.isoformat())
c496ca96
PH
787
788
def platform_name():
    """ Returns the platform name as a compat_str """
    name = platform.platform()
    if isinstance(name, bytes):
        # Decode a byte string with the preferred encoding.
        name = name.decode(preferredencoding())

    assert isinstance(name, compat_str)
    return name
c257baff
PH
797
798
7459e3a2
PH
def write_string(s, out=None):
    """Write the text string s to out (sys.stderr by default) and flush it.

    The text is encoded with the preferred encoding first when out is a
    binary-mode stream or when running on Python 2.
    """
    if out is None:
        out = sys.stderr
    assert type(s) == type(u'')

    # Python 2 lies about mode of sys.stderr
    needs_bytes = 'b' in getattr(out, 'mode', '') or sys.version_info[0] < 3
    if needs_bytes:
        s = s.encode(preferredencoding(), 'ignore')
    out.write(s)
    out.flush()
809
810
48ea9cea
PH
def bytes_to_intlist(bs):
    """Convert a byte string into a list of integer byte values."""
    if not bs:
        return []
    if not isinstance(bs[0], int):
        # Indexing yields one-character strings: convert each with ord().
        return [ord(c) for c in bs]
    return list(bs)  # Python 3
818
c257baff 819
def intlist_to_bytes(xs):
    """Convert a list of integer byte values into a byte string."""
    if not xs:
        return b''
    if not isinstance(chr(0), bytes):
        return bytes(xs)
    # Python 2: chr() already yields one-byte strings.
    return ''.join([chr(x) for x in xs])